summary | refs | log | tree | commit | diff | stats
path: root/src/spdk/intel-ipsec-mb/avx
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit: 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree: 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /src/spdk/intel-ipsec-mb/avx
parent: Initial commit. (diff)
download: ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
Adding upstream version 16.2.11+ds. [upstream/16.2.11+ds] [upstream]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm306
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm606
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm32
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm328
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm504
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm344
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm516
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm494
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm501
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm536
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm165
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm654
-rw-r--r--src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm418
-rw-r--r--src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm30
-rw-r--r--src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm2515
-rw-r--r--src/spdk/intel-ipsec-mb/avx/kasumi_avx.c386
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm30
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm30
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm30
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm30
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm537
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm518
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm239
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm194
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm264
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm272
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c733
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm298
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm321
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm355
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm356
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm428
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm339
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm416
-rw-r--r--src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm358
-rw-r--r--src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm716
-rw-r--r--src/spdk/intel-ipsec-mb/avx/pon_avx.asm1170
-rw-r--r--src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm434
-rw-r--r--src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm501
-rw-r--r--src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm33
-rw-r--r--src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm553
-rw-r--r--src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm33
-rw-r--r--src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm473
-rw-r--r--src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm381
-rw-r--r--src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm391
-rw-r--r--src/spdk/intel-ipsec-mb/avx/snow3g_avx.c42
-rwxr-xr-xsrc/spdk/intel-ipsec-mb/avx/zuc_avx.asm1146
-rwxr-xr-xsrc/spdk/intel-ipsec-mb/avx/zuc_avx_top.c548
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/gcm128_avx_gen4.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/gcm192_avx_gen4.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/gcm256_avx_gen4.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/gcm_avx_gen4.asm3641
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_avx2.c676
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_flush_avx2.asm315
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_md5_flush_avx2.asm362
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_md5_submit_avx2.asm373
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_224_flush_avx2.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_224_submit_avx2.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_256_flush_avx2.asm379
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_256_submit_avx2.asm426
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_384_flush_avx2.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_384_submit_avx2.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_512_flush_avx2.asm353
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_512_submit_avx2.asm416
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_submit_avx2.asm369
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/md5_x8x2_avx2.asm820
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm466
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/sha256_oct_avx2.asm587
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/sha512_x4_avx2.asm452
-rw-r--r--src/spdk/intel-ipsec-mb/avx2/snow3g_avx2.c49
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/aes_cbc_dec_vaes_avx512.asm477
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm727
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm1524
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/des_x16_avx512.asm2382
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/gcm128_avx512.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/gcm128_vaes_avx512.asm32
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/gcm192_avx512.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/gcm192_vaes_avx512.asm32
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/gcm256_avx512.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/gcm256_vaes_avx512.asm32
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/gcm_avx512.asm3536
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm4272
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_flush_avx512.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_submit_avx512.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_flush_avx512.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_submit_avx512.asm31
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_flush_avx512.asm320
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_submit_avx512.asm280
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_avx512.c1066
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm524
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_flush_avx512.asm367
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_flush_avx512.asm28
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_submit_avx512.asm28
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_flush_avx512.asm433
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_submit_avx512.asm445
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_flush_avx512.asm29
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_submit_avx512.asm29
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_flush_avx512.asm384
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_submit_avx512.asm413
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_submit_avx512.asm402
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/sha1_x16_avx512.asm439
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/sha256_x16_avx512.asm758
-rw-r--r--src/spdk/intel-ipsec-mb/avx512/sha512_x8_avx512.asm595
110 files changed, 50393 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm
new file mode 100644
index 000000000..a4de936ff
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm
@@ -0,0 +1,306 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES128 CBC decrypt "by8"
+
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+
+%define CONCAT(a,b) a %+ b ; token-paste helper: CONCAT(xdata,i) expands to xdata<i>
+%define VMOVDQ vmovdqu ; unaligned move used for every input load and output store
+;; XMM register map: xmm0-xmm7 hold up to eight blocks that are decrypted in parallel.
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xIV xmm8 ; CBC chaining value: the IV first, then the last ciphertext block of each batch
+%define xkey0 xmm9 ; xkey0..xkey10: even-numbered round keys, loaded once and kept in registers
+%define xkey2 xmm10
+%define xkey4 xmm11
+%define xkey6 xmm12
+%define xkey8 xmm13
+%define xkey10 xmm14
+%define xkeytmp xmm15 ; odd-numbered round keys are loaded into this register each time they are used
+;; Argument registers: rdi/rsi/rdx/rcx/r8 when LINUX is defined, rcx/rdx/r8/r9 otherwise.
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax ; not an argument register: filled from [rsp + 8*5] by the function entry code
+%endif
+;; General-purpose scratch register.
+%define tmp r10
+
+;; do_aes_load <num_blocks>
+;; Decrypt <num_blocks> blocks through do_aes with its load_keys flag set,
+;; i.e. the cached even round keys are (re)loaded from p_keys on the way.
+%macro do_aes_load 1
+%define %%num_blocks %1
+ do_aes %%num_blocks, 1
+%endmacro
+
+;; do_aes_noload <num_blocks>
+;; Decrypt <num_blocks> blocks through do_aes with its load_keys flag clear;
+;; the cached even round keys must already be in their registers.
+%macro do_aes_noload 1
+%define %%num_blocks %1
+ do_aes %%num_blocks, 0
+%endmacro
+
+; do_aes num_in_par load_keys -- AES-128 CBC decrypt num_in_par (1..8) consecutive blocks read at p_in and store them at p_out.
+; This increments p_in, but not p_out. xIV supplies the chaining value and is left holding the last ciphertext block read.
+%macro do_aes 2
+%define %%by %1 ; number of blocks handled by this expansion
+%define %%load_keys %2 ; non-zero: also load xkey0/2/4/6/8/10 from p_keys; zero: they must already hold those keys
+;; Round key 0. All key loads use vmovdqa (the aligned form), so p_keys must be 16-byte aligned.
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+;; Load the ciphertext blocks into xdata0 .. xdata<by-1>.
+%assign i 0
+%rep %%by
+ VMOVDQ CONCAT(xdata,i), [p_in + i*16]
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey2, [p_keys + 2*16]
+%endif
+%assign i 0
+%rep %%by
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 ; XOR with round key 0
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by ; p_in now points past this batch; later reads compensate with "- 16*%%by"
+;; Rounds 1..9: odd keys are loaded into xkeytmp on every use, even keys come from the cached registers.
+ vmovdqa xkeytmp, [p_keys + 1*16]
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp ; key 1
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey4, [p_keys + 4*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey2 ; key 2
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeytmp, [p_keys + 3*16]
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp ; key 3
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey6, [p_keys + 6*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeytmp, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp ; key 5
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey8, [p_keys + 8*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 ; key 6
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeytmp, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp ; key 7
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey10, [p_keys + 10*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeytmp, [p_keys + 9*16]
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp ; key 9
+%assign i (i+1)
+%endrep
+;; Final round uses key 10.
+%assign i 0
+%rep %%by
+ vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkey10
+%assign i (i+1)
+%endrep
+;; CBC chaining: block 0 is XORed with the incoming xIV, block i with ciphertext block i-1 re-read from the input.
+ vpxor xdata0, xdata0, xIV
+%assign i 1
+%if (%%by > 1)
+%rep (%%by - 1)
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] ; ciphertext block i-1 of this batch
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV
+%assign i (i+1)
+%endrep
+%endif
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] ; i == by here: xIV = last ciphertext block of this batch
+;; Stores come last, after every read of the input; p_out itself is left unchanged.
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+
+;;-----------------------------------------------------------------------
+;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;
+;; AES-128 CBC decrypt, up to 8 blocks per pass.
+;; In:    p_in, p_IV, p_keys, p_out, num_bytes (register map above; when
+;;        LINUX is not defined num_bytes is read from [rsp + 8*5]).
+;;        num_bytes is expected to be a multiple of 16; zero is accepted
+;;        and returns immediately. p_keys must be 16-byte aligned because
+;;        the round keys are fetched with vmovdqa.
+;; Out:   decrypted blocks stored at p_out. The IV buffer is not updated.
+;; Clobb: xmm0-xmm15, tmp (r10), p_in, p_out, num_bytes, flags
+;;-----------------------------------------------------------------------
+MKGLOBAL(aes_cbc_dec_128_avx,function,internal)
+aes_cbc_dec_128_avx:
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5]
+%endif
+
+ ;; Empty input: nothing to decrypt. Without this check a zero length
+ ;; reaches main_loop2, whose "sub / jne" exit test underflows the byte
+ ;; counter and keeps running far past the end of the buffers.
+ test num_bytes, num_bytes
+ jz do_return2
+
+ vmovdqu xIV, [p_IV]
+
+ ;; tmp = 16 * (number of whole blocks modulo 8): these leftover blocks
+ ;; are decrypted first so that the main loop only sees groups of 8.
+ mov tmp, num_bytes
+ and tmp, 7*16
+ jz mult_of_8_blks
+
+ ; tmp = 16 * n with 1 <= n <= 7; each cmp below feeds both a jg and a je
+ cmp tmp, 4*16
+ jg gt4
+ je eq4
+
+lt4:
+ cmp tmp, 2*16
+ jg eq3
+ je eq2
+
+ ;; Every eqN path: decrypt N blocks (this also loads the cached round
+ ;; keys), advance p_out, then round num_bytes down to a multiple of
+ ;; 8 blocks. ~(8*16 - 1) is the same value (-128) that the former
+ ;; spelling "~7*16" produced, because NASM applies unary ~ before *.
+eq1:
+ do_aes_load 1
+ add p_out, 1*16
+ and num_bytes, ~(8*16 - 1)
+ jz do_return2
+ jmp main_loop2
+
+eq2:
+ do_aes_load 2
+ add p_out, 2*16
+ and num_bytes, ~(8*16 - 1)
+ jz do_return2
+ jmp main_loop2
+
+eq3:
+ do_aes_load 3
+ add p_out, 3*16
+ and num_bytes, ~(8*16 - 1)
+ jz do_return2
+ jmp main_loop2
+
+eq4:
+ do_aes_load 4
+ add p_out, 4*16
+ and num_bytes, ~(8*16 - 1)
+ jz do_return2
+ jmp main_loop2
+
+gt4:
+ cmp tmp, 6*16
+ jg eq7
+ je eq6
+
+eq5:
+ do_aes_load 5
+ add p_out, 5*16
+ and num_bytes, ~(8*16 - 1)
+ jz do_return2
+ jmp main_loop2
+
+eq6:
+ do_aes_load 6
+ add p_out, 6*16
+ and num_bytes, ~(8*16 - 1)
+ jz do_return2
+ jmp main_loop2
+
+eq7:
+ do_aes_load 7
+ add p_out, 7*16
+ and num_bytes, ~(8*16 - 1)
+ jz do_return2
+ jmp main_loop2
+
+mult_of_8_blks:
+ ;; No leftover blocks were processed, so the cached even round keys
+ ;; that do_aes_noload relies on have to be loaded here.
+ vmovdqa xkey0, [p_keys + 0*16]
+ vmovdqa xkey2, [p_keys + 2*16]
+ vmovdqa xkey4, [p_keys + 4*16]
+ vmovdqa xkey6, [p_keys + 6*16]
+ vmovdqa xkey8, [p_keys + 8*16]
+ vmovdqa xkey10, [p_keys + 10*16]
+
+main_loop2:
+ ; num_bytes is a non-zero multiple of 8 blocks (8*16 bytes)
+ do_aes_noload 8
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ jne main_loop2
+
+do_return2:
+; Don't write back IV
+; vmovdqu [p_IV], xIV
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm
new file mode 100644
index 000000000..4d08bfde5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; Routine to compute CBC-MAC. It is based on the 128-bit AES CBC encrypt code:
+;;; this file only defines CBC_MAC and then includes that source.
+%define CBC_MAC 1 ; NOTE(review): presumably makes the included file build its CBC-MAC variant -- confirm in aes_cbc_enc_128_x8.asm
+%include "avx/aes_cbc_enc_128_x8.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm
new file mode 100644
index 000000000..d46a29192
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm
@@ -0,0 +1,606 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+%include "include/reg_sizes.asm"
+
+; routine to do AES128 CNTR enc/decrypt "by8"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+section .data
+default rel
+;; The MKGLOBAL declarations are skipped when CNTR_CCM_AVX is defined; the constants themselves are emitted either way.
+%ifndef CNTR_CCM_AVX
+MKGLOBAL(byteswap_const,data,internal)
+MKGLOBAL(set_byte15,data,internal)
+MKGLOBAL(ddq_add_1,data,internal)
+MKGLOBAL(ddq_add_2,data,internal)
+MKGLOBAL(ddq_add_3,data,internal)
+MKGLOBAL(ddq_add_4,data,internal)
+MKGLOBAL(ddq_add_5,data,internal)
+MKGLOBAL(ddq_add_6,data,internal)
+MKGLOBAL(ddq_add_7,data,internal)
+MKGLOBAL(ddq_add_8,data,internal)
+%endif ;; CNTR_CCM_AVX
+;; byteswap_const is fetched with vmovdqa, hence the 16-byte alignment; every entry below is 16 bytes long.
+align 16
+byteswap_const: ;DDQ 0x000102030405060708090A0B0C0D0E0F -- vpshufb mask that reverses the 16 bytes of a register
+ DQ 0x08090A0B0C0D0E0F, 0x0001020304050607
+set_byte15: DQ 0x0000000000000000, 0x0100000000000000 ; only byte 15 is 0x01; ORed into the CCM counter block
+;; ddq_add_N: the value N as a 128-bit little-endian integer; added to the byte-reversed counter.
+ddq_add_1: ;DDQ 0x00000000000000000000000000000001
+ DQ 0x0000000000000001, 0x0000000000000000
+ddq_add_2: ;DDQ 0x00000000000000000000000000000002
+ DQ 0x0000000000000002, 0x0000000000000000
+ddq_add_3: ;DDQ 0x00000000000000000000000000000003
+ DQ 0x0000000000000003, 0x0000000000000000
+ddq_add_4: ;DDQ 0x00000000000000000000000000000004
+ DQ 0x0000000000000004, 0x0000000000000000
+ddq_add_5: ;DDQ 0x00000000000000000000000000000005
+ DQ 0x0000000000000005, 0x0000000000000000
+ddq_add_6: ;DDQ 0x00000000000000000000000000000006
+ DQ 0x0000000000000006, 0x0000000000000000
+ddq_add_7: ;DDQ 0x00000000000000000000000000000007
+ DQ 0x0000000000000007, 0x0000000000000000
+ddq_add_8: ;DDQ 0x00000000000000000000000000000008
+ DQ 0x0000000000000008, 0x0000000000000000
+
+section .text
+
+%define CONCAT(a,b) a %+ b ; token-paste helper: CONCAT(xdata,i) expands to xdata<i>
+%define VMOVDQ vmovdqu ; unaligned move used for input loads and output stores
+;; XMM register map. Several names share one register; each sharing is noted on the second name.
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xpart xmm1 ; shares xmm1 with xdata1: receives the final partial input block
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xcounter xmm8
+%define xtmp xmm8 ; shares xmm8 with xcounter: scratch in the CNTR_BIT partial-byte handling
+%define xbyteswap xmm9
+%define xtmp2 xmm9 ; shares xmm9 with xbyteswap: bit mask in the CNTR_BIT partial-byte handling
+%define xkey0 xmm10
+%define xtmp3 xmm10 ; shares xmm10 with xkey0: scratch operand passed to XVPSLLB
+%define xkey3 xmm11 ; xkey0/3/6/9: round keys kept in registers for do_aes_noload
+%define xkey6 xmm12
+%define xkey9 xmm13
+%define xkeyA xmm14 ; xkeyA/xkeyB: round keys loaded on every use, then reused to hold input blocks
+%define xkeyB xmm15
+;; General-purpose register map; it depends on CNTR_CCM_AVX and on LINUX.
+%ifdef CNTR_CCM_AVX
+%ifdef LINUX
+%define job rdi ; the job pointer; DO_CNTR loads the other names from the job structure
+%define p_in rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define p_ivlen r9
+%else ;; LINUX
+%define job rcx ; the job pointer; DO_CNTR loads the other names from the job structure
+%define p_in rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10
+%define p_ivlen rax
+%endif ;; LINUX
+%define p_IV r11 ; same register as tmp and flags below
+%else ;; CNTR_CCM_AVX
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define num_bits r8 ; same register as num_bytes
+%define p_ivlen r9
+%else ;; LINUX
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10 ; filled from [rsp + 8*5] by DO_CNTR
+%define num_bits r10 ; same register as num_bytes
+%define p_ivlen qword [rsp + 8*6] ; a memory operand: the offset is relative to rsp as it is at function entry
+%endif ;; LINUX
+%endif ;; CNTR_CCM_AVX
+;; Scratch registers.
+%define tmp r11
+%define flags r11 ; same register as tmp
+;; r12-r14 are pushed and popped by DO_CNTR in its CNTR_BIT form only.
+%define r_bits r12 ; message length in bits, modulo 8
+%define tmp2 r13
+%define mask r14
+
+;; do_aes_load <num_blocks>, <cntr_type>
+;; Run do_aes on <num_blocks> blocks with its load_keys flag set, so the
+;; cached round keys are (re)loaded from p_keys on the way.
+%macro do_aes_load 2
+%define %%num_blocks %1
+%define %%mode %2
+ do_aes %%num_blocks, %%mode, 1
+%endmacro
+
+;; do_aes_noload <num_blocks>, <cntr_type>
+;; Run do_aes on <num_blocks> blocks with its load_keys flag clear; the
+;; cached round keys must already be in their registers.
+%macro do_aes_noload 2
+%define %%num_blocks %1
+%define %%mode %2
+ do_aes %%num_blocks, %%mode, 0
+%endmacro
+
+; do_aes num_in_par cntr_type load_keys -- counter-mode process num_in_par (1..8) blocks: out = in XOR AES-128(counter block).
+; This increments p_in, but not p_out. xcounter is advanced by num_in_par.
+%macro do_aes 3
+%define %%by %1 ; number of blocks handled by this expansion
+%define %%cntr_type %2 ; CNTR_BIT selects the 32-bit counter add and the partial-byte handling; other values take the %else path
+%define %%load_keys %3 ; non-zero: also load xkey0/3/6/9 from p_keys; zero: they must already hold those keys
+;; Round key 0. Key loads use vmovdqa (the aligned form), so p_keys must be 16-byte aligned.
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+;; Counter blocks: xdata0 = byte-swapped xcounter, xdata<i> = byte-swapped (xcounter + i), added per 32-bit lane.
+ vpshufb xdata0, xcounter, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+ vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)]
+ vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 1*16]
+;; XOR with round key 0, and advance xcounter by the batch size (32-bit lanes for CNTR_BIT, 64-bit lanes otherwise).
+ vpxor xdata0, xkey0
+%ifidn %%cntr_type, CNTR_BIT
+ vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
+%else
+ vpaddq xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
+%endif
+
+%assign i 1
+%rep (%%by - 1)
+ vpxor CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+;; Rounds 1..10: keys 3/6/9 come from the cached registers, the others are loaded into xkeyA/xkeyB just before use.
+ vmovdqa xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey3, [p_keys + 3*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by ; p_in now points past this batch; the input reads below compensate with "- 16*%%by"
+
+ vmovdqa xkeyB, [p_keys + 4*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey3 ; key 3
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 4
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey6, [p_keys + 6*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey6 ; key 6
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 8*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey9, [p_keys + 9*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 8
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey9 ; key 9
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10
+%assign i (i+1)
+%endrep
+;; XOR the keystream with the input, two blocks per step; xkeyA/xkeyB are free again and serve as load registers.
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ VMOVDQ xkeyB, [p_in + j*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+ vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] ; odd block count: one block is left over
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%endif
+
+%ifidn %%cntr_type, CNTR_BIT
+ ;; check if this is the end of the message
+ mov tmp, num_bytes
+ and tmp, ~(%%by*16) ; zero only when num_bytes has no bits set other than those of by*16
+ jnz %%skip_preserve
+ ;; Check if there is a partial byte
+ or r_bits, r_bits
+ jz %%skip_preserve
+ ;; From here on xtmp (= xcounter's register) and xtmp2 (= xbyteswap's register) are overwritten.
+%assign idx (%%by - 1)
+ ;; Load output to get last partial byte
+ vmovdqu xtmp, [p_out + idx * 16]
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx
+ mov mask, 0xff
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask
+ vpslldq xtmp2, 15 ; move the mask byte to byte 15, the last byte of the block
+ ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+ ;; in the partial byte
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear all bits from the input that are not to be ciphered
+ vpandn CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx)
+ vpor CONCAT(xdata,idx), xtmp
+
+%%skip_preserve:
+%endif
+;; Store the processed blocks; p_out itself is left unchanged.
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Macro performing AES-CTR: expands to one complete entry-point body (up to and including ret) for CNTR, CNTR_BIT or CCM.
+;; Flow: build the counter block, process the 1..7 leftover whole blocks, then 8 blocks per loop pass, then a final partial block.
+%macro DO_CNTR 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM)
+;; ---- CCM: every parameter is read from the job structure ----
+%ifidn %%CNTR_TYPE, CCM
+ mov p_in, [job + _src]
+ add p_in, [job + _cipher_start_src_offset_in_bytes]
+ mov p_ivlen, [job + _iv_len_in_bytes]
+ mov num_bytes, [job + _msg_len_to_cipher_in_bytes]
+ mov p_keys, [job + _aes_enc_key_expanded]
+ mov p_out, [job + _dst]
+
+ vmovdqa xbyteswap, [rel byteswap_const]
+ ;; Prepare IV: the 16-byte counter block is assembled in xcounter ;;
+
+ ;; Byte 0: flags with L'
+ ;; Calculate L' = 15 - Nonce length - 1 = 14 - IV length
+ mov flags, 14
+ sub flags, p_ivlen
+ vmovd xcounter, DWORD(flags) ; also zeroes the rest of xcounter
+ ;; Bytes 1 - 13: Nonce (7 - 13 bytes long)
+
+ ;; Bytes 1 - 7 are always copied (first 7 bytes)
+ mov p_IV, [job + _iv] ; p_IV shares r11 with flags, which is no longer needed
+ vpinsrb xcounter, [p_IV], 1
+ vpinsrw xcounter, [p_IV + 1], 1
+ vpinsrd xcounter, [p_IV + 3], 1
+
+ cmp p_ivlen, 7
+ je _finish_nonce_move
+
+ cmp p_ivlen, 8
+ je _iv_length_8
+ cmp p_ivlen, 9
+ je _iv_length_9
+ cmp p_ivlen, 10
+ je _iv_length_10
+ cmp p_ivlen, 11
+ je _iv_length_11
+ cmp p_ivlen, 12
+ je _iv_length_12
+ ;; NOTE: the _iv_length_* and _finish_nonce_move labels have no %% prefix, so the CCM form can be expanded only once per file.
+ ;; Bytes 8 - 13 (any other length falls through to the 13-byte case)
+_iv_length_13:
+ vpinsrb xcounter, [p_IV + 12], 13
+_iv_length_12:
+ vpinsrb xcounter, [p_IV + 11], 12
+_iv_length_11:
+ vpinsrd xcounter, [p_IV + 7], 2 ; nonce bytes 7..10 in one 4-byte insert
+ jmp _finish_nonce_move
+_iv_length_10:
+ vpinsrb xcounter, [p_IV + 9], 10
+_iv_length_9:
+ vpinsrb xcounter, [p_IV + 8], 9
+_iv_length_8:
+ vpinsrb xcounter, [p_IV + 7], 8
+
+_finish_nonce_move:
+ ; last byte = 1 (byte 15 of the block)
+ vpor xcounter, [rel set_byte15]
+%else ;; CNTR/CNTR_BIT
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5] ; arg5
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ push r12
+ push r13
+ push r14
+%endif
+
+ vmovdqa xbyteswap, [rel byteswap_const]
+%ifidn %%CNTR_TYPE, CNTR
+ test p_ivlen, 16 ; tests bit 4 of the length (not an equality compare): set -> 16-byte IV path
+ jnz %%iv_is_16_bytes
+ ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+ mov DWORD(tmp), 0x01000000 ; in memory order these are the bytes 00 00 00 01
+ vpinsrq xcounter, [p_IV], 0
+ vpinsrd xcounter, [p_IV + 8], 2
+ vpinsrd xcounter, DWORD(tmp), 3
+
+%else ;; CNTR_BIT
+ ; Read 16 byte IV: Nonce + 8-byte block counter (BE)
+ vmovdqu xcounter, [p_IV]
+%endif
+%endif ;; CNTR/CNTR_BIT/CCM
+%%bswap_iv:
+ vpshufb xcounter, xbyteswap ; xcounter stays byte-reversed from here on so that vpaddd/vpaddq can increment it
+
+ ;; calculate len
+ ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov r_bits, num_bits
+ add num_bits, 7
+ shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
+ and r_bits, 7 ; Check if there are remainder bits (0-7)
+%endif
+
+ mov tmp, num_bytes
+ and tmp, 7*16
+ jz %%chk ; no 1..7 leftover whole blocks: what remains is a multiple of 8 blocks and/or fewer than 16 bytes
+
+ ; tmp = 16 * n with 1 <= n <= 7
+ cmp tmp, 4*16
+ jg %%gt4
+ je %%eq4
+
+%%lt4:
+ cmp tmp, 2*16
+ jg %%eq3
+ je %%eq2
+%%eq1:
+ do_aes_load 1, %%CNTR_TYPE
+ add p_out, 1*16
+ jmp %%chk
+
+%%eq2:
+ do_aes_load 2, %%CNTR_TYPE
+ add p_out, 2*16
+ jmp %%chk
+
+%%eq3:
+ do_aes_load 3, %%CNTR_TYPE
+ add p_out, 3*16
+ jmp %%chk
+
+%%eq4:
+ do_aes_load 4, %%CNTR_TYPE
+ add p_out, 4*16
+ jmp %%chk
+
+%%gt4:
+ cmp tmp, 6*16
+ jg %%eq7
+ je %%eq6
+
+%%eq5:
+ do_aes_load 5, %%CNTR_TYPE
+ add p_out, 5*16
+ jmp %%chk
+
+%%eq6:
+ do_aes_load 6, %%CNTR_TYPE
+ add p_out, 6*16
+ jmp %%chk
+
+%%eq7:
+ do_aes_load 7, %%CNTR_TYPE
+ add p_out, 7*16
+ ; fall through to chk
+%%chk:
+ and num_bytes, ~(7*16) ; drop the leftover-block bits (4-6), keep all other bits
+ jz %%do_return2 ; nothing left to process
+
+ cmp num_bytes, 16
+ jb %%last ; only a partial block (fewer than 16 bytes) remains
+
+ ; process multiples of 8 blocks: preload the round keys that do_aes_noload expects in registers
+ vmovdqa xkey0, [p_keys + 0*16]
+ vmovdqa xkey3, [p_keys + 3*16]
+ vmovdqa xkey6, [p_keys + 6*16]
+ vmovdqa xkey9, [p_keys + 9*16]
+ jmp %%main_loop2
+
+align 32
+%%main_loop2:
+ ; num_bytes is a multiple of 8 blocks + partial bytes
+ do_aes_noload 8, %%CNTR_TYPE
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ cmp num_bytes, 8*16
+ jae %%main_loop2 ; at least 8 more blocks remain
+
+ ; Check if there is a partial block
+ or num_bytes, num_bytes
+ jnz %%last
+
+%%do_return2:
+%ifidn %%CNTR_TYPE, CCM
+ mov rax, job ; the job pointer is returned in rax
+ or dword [rax + _status], STS_COMPLETED_AES
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ pop r14
+ pop r13
+ pop r12
+%endif
+
+ ret
+
+%%last:
+ ;; NOTE(review): simd_load_avx_15_1 is defined in an included file; presumably it loads 1..15 bytes without over-reading -- confirm
+ ; load partial block into XMM register
+ simd_load_avx_15_1 xpart, p_in, num_bytes
+
+%%final_ctr_enc:
+ ; Encryption of a single partial block
+ vpshufb xcounter, xbyteswap ; undo the byte reversal applied at %%bswap_iv
+ vmovdqa xdata0, xcounter
+ vpxor xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 9
+ vaesenc xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+ ; created keystream
+ vaesenclast xdata0, [p_keys + 16*i]
+
+ ; xor keystream with the message (scratch)
+ vpxor xdata0, xpart
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; Check if there is a partial byte
+ or r_bits, r_bits
+ jz %%store_output
+
+ ;; Load output to get last partial byte
+ simd_load_avx_15_1 xtmp, p_out, num_bytes
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx
+ mov mask, 0xff
+%ifidn r_bits, rcx
+%error "r_bits cannot be mapped to rcx!"
+%endif
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask
+ ;; NOTE(review): XVPSLLB is defined in an included file; presumably it shifts xtmp2 left by tmp bytes using xtmp3/tmp2 as scratch -- confirm
+ ;; Get number of full bytes in last block of 16 bytes
+ mov tmp, num_bytes
+ dec tmp
+ XVPSLLB xtmp2, tmp, xtmp3, tmp2
+ ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+ ;; in the partial byte
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear the bits from the input that are not to be ciphered
+ vpandn xdata0, xtmp2, xdata0
+ vpor xdata0, xtmp
+%endif
+
+%%store_output:
+ ; copy result into the output buffer (tmp and rax are passed as extra operands, presumably scratch -- confirm)
+ simd_store_avx_15 p_out, xdata0, num_bytes, tmp, rax
+
+ jmp %%do_return2
+
+%%iv_is_16_bytes:
+ ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+ vmovdqu xcounter, [p_IV]
+ jmp %%bswap_iv
+%endmacro
+
+align 32
+%ifndef CNTR_CCM_AVX
+;; Plain counter-mode build: two entry points, each a full DO_CNTR expansion.
+;;
+;; aes_cntr_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes,
+;;                  UINT64 iv_len)
+;; Length is given in bytes.
+MKGLOBAL(aes_cntr_128_avx,function,internal)
+aes_cntr_128_avx:
+ DO_CNTR CNTR
+
+;; aes_cntr_bit_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits,
+;;                      UINT64 iv_len)
+;; Length is given in bits.
+MKGLOBAL(aes_cntr_bit_128_avx,function,internal)
+aes_cntr_bit_128_avx:
+ DO_CNTR CNTR_BIT
+%else ;; CNTR_CCM_AVX
+;; CCM build: a single job-based entry point.
+;;
+;; JOB_AES_HMAC * aes_cntr_ccm_128_avx(JOB_AES_HMAC *job)
+;; arg 1 : job
+MKGLOBAL(aes_cntr_ccm_128_avx,function,internal)
+aes_cntr_ccm_128_avx:
+ DO_CNTR CCM
+%endif ;; CNTR_CCM_AVX
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm
new file mode 100644
index 000000000..1a4c11602
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm
@@ -0,0 +1,32 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+; CCM wrapper: assemble the AES-128 CTR source with CNTR_CCM_AVX defined.
+%define CNTR_CCM_AVX ; makes the included file take its "%ifdef CNTR_CCM_AVX" branches
+%ifndef AES_CNTR_CCM_128 ; keep a symbol name supplied earlier, if any
+%define AES_CNTR_CCM_128 aes_cntr_ccm_128_avx
+%endif
+%include "avx/aes128_cntr_by8_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm
new file mode 100644
index 000000000..9952c2552
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm
@@ -0,0 +1,328 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES192 CBC decrypt "by8" (up to 8 blocks are decrypted in parallel)
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+%include "include/os.asm"
+
+%define CONCAT(a,b) a %+ b ; token paste, e.g. CONCAT(xdata,3) -> xdata3
+%define VMOVDQ vmovdqu ; unaligned move used for every input/output data access
+
+%define xdata0 xmm0 ; xdata0..xdata7: the (up to) 8 blocks being decrypted
+%define xdata1 xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xIV xmm8 ; chaining value: the IV first, then the previous ciphertext block
+%define xkey0 xmm9 ; round keys 0/3/6/9/12 are kept resident in registers
+%define xkey3 xmm10
+%define xkey6 xmm11
+%define xkey9 xmm12
+%define xkey12 xmm13
+%define xkeyA xmm14 ; xkeyA/xkeyB: scratch for the other round keys,
+%define xkeyB xmm15 ; reloaded from [p_keys] in every do_aes expansion
+
+%ifdef LINUX ; arguments arrive in rdi, rsi, rdx, rcx, r8
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else ; otherwise the first four arguments arrive in rcx, rdx, r8, r9
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax ; 5th argument: loaded from [rsp + 8*5] at function entry
+%endif
+
+%define tmp r10 ; scratch: holds (num_bytes & 7*16) for the block-count dispatch
+
+%macro do_aes_load 1 ; %1 = number of blocks; also loads the resident round keys
+ do_aes %1, 1
+%endmacro
+
+%macro do_aes_noload 1 ; %1 = number of blocks; resident round keys must already be loaded
+ do_aes %1, 0
+%endmacro
+
+; do_aes num_in_par load_keys: CBC-decrypt num_in_par consecutive 16-byte blocks from [p_in] to [p_out]; load_keys=1 also loads xkey0/3/6/9/12
+; This increments p_in, but not p_out. On exit xIV holds the last ciphertext block that was consumed.
+%macro do_aes 2
+%define %%by %1 ; number of blocks processed by this expansion
+%define %%load_keys %2 ; 1 = load the resident round keys, 0 = reuse them
+
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+
+%assign i 0
+%rep %%by
+ VMOVDQ CONCAT(xdata,i), [p_in + i*16] ; load ciphertext block i
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 1*16]
+
+%assign i 0
+%rep %%by
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 ; round 0: initial key XOR
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 2*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; round 1
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by ; advance now; later input reloads use negative offsets
+
+%if (%%load_keys)
+ vmovdqa xkey3, [p_keys + 3*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; round 2
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 4*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey3 ; round 3
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 5*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; round 4
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey6, [p_keys + 6*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; round 5
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 7*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 ; round 6
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 8*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; round 7
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey9, [p_keys + 9*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; round 8
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 10*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey9 ; round 9
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 11*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; round 10
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey12, [p_keys + 12*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; round 11
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; round 12 (final)
+%assign i (i+1)
+%endrep
+
+ vpxor xdata0, xdata0, xIV ; block 0 is chained with the incoming IV
+%assign i 1
+%if (%%by > 1)
+%rep (%%by - 1)
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] ; reload ciphertext block i-1 (p_in was already advanced)
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV ; block i is chained with ciphertext block i-1
+%assign i (i+1)
+%endrep
+%endif
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] ; i == %%by here: last ciphertext block becomes the next IV
+
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i) ; stores are issued only after every input reload above
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+;; AES-192 CBC decrypt. num_bytes == 0 is not special-cased: it takes the mult_of_8_blks path and enters main_loop2.
+;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+MKGLOBAL(aes_cbc_dec_192_avx,function,internal)
+aes_cbc_dec_192_avx:
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5] ; 5th argument is passed on the stack in the non-LINUX build
+%endif
+
+ vmovdqu xIV, [p_IV]
+
+ mov tmp, num_bytes
+ and tmp, 7*16 ; tmp = 16 * (number of blocks modulo 8)
+ jz mult_of_8_blks
+
+ ; 1 <= tmp/16 <= 7: decrypt that many leading blocks first
+ cmp tmp, 4*16
+ jg gt4
+ je eq4
+
+lt4:
+ cmp tmp, 2*16
+ jg eq3
+ je eq2
+eq1:
+ do_aes_load 1
+ add p_out, 1*16
+ and num_bytes, ~7*16 ; NASM evaluates ~7*16 as (~7)*16 = -128: keeps only whole groups of 8 blocks
+ jz do_return2
+ jmp main_loop2
+
+eq2:
+ do_aes_load 2
+ add p_out, 2*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+eq3:
+ do_aes_load 3
+ add p_out, 3*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+eq4:
+ do_aes_load 4
+ add p_out, 4*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+gt4:
+ cmp tmp, 6*16
+ jg eq7
+ je eq6
+
+eq5:
+ do_aes_load 5
+ add p_out, 5*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+eq6:
+ do_aes_load 6
+ add p_out, 6*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+eq7:
+ do_aes_load 7
+ add p_out, 7*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+mult_of_8_blks:
+ vmovdqa xkey0, [p_keys + 0*16] ; no do_aes_load ran on this path, so load the resident keys here
+ vmovdqa xkey3, [p_keys + 3*16]
+ vmovdqa xkey6, [p_keys + 6*16]
+ vmovdqa xkey9, [p_keys + 9*16]
+ vmovdqa xkey12, [p_keys + 12*16]
+
+main_loop2:
+ ; num_bytes is a multiple of 8 blocks (8*16 bytes) and >0
+ do_aes_noload 8
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ jne main_loop2 ; loop until the byte count reaches exactly zero
+
+do_return2:
+; Don't write back IV
+; vmovdqu [p_IV], xIV
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits ; GNU convention: this object does not need an executable stack
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm
new file mode 100644
index 000000000..e926b4413
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm
@@ -0,0 +1,504 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+%include "include/reg_sizes.asm"
+
+; routine to do AES192 CNTR enc/decrypt "by8" (up to 8 counter blocks are processed in parallel)
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+extern byteswap_const ; vpshufb control used to byte-swap the counter block (defined elsewhere)
+extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 ; ddq_add_N: added to xcounter to form counter+N
+extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 ; (presumably the constant N; values are defined elsewhere)
+
+%define CONCAT(a,b) a %+ b ; token paste, e.g. CONCAT(xdata,3) -> xdata3
+%define VMOVDQ vmovdqu ; unaligned move used for input/output data accesses
+
+%define xdata0 xmm0 ; xdata0..xdata7: counter blocks -> keystream -> output blocks
+%define xdata1 xmm1
+%define xpart xmm1 ; alias of xdata1: holds the final partial input block
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xcounter xmm8 ; running counter block, kept byte-swapped between uses
+%define xtmp xmm8 ; alias of xcounter: used for the partial-byte merge
+%define xbyteswap xmm9 ; byteswap_const
+%define xtmp2 xmm9 ; alias of xbyteswap: used for the partial-byte merge
+%define xkey0 xmm10 ; round keys 0/4/8/12 are kept resident in registers
+%define xtmp3 xmm10 ; alias of xkey0: scratch passed to XVPSLLB
+%define xkey4 xmm11
+%define xkey8 xmm12
+%define xkey12 xmm13
+%define xkeyA xmm14 ; xkeyA/xkeyB: scratch for the other round keys (and for input blocks)
+%define xkeyB xmm15
+
+%ifdef LINUX ; arguments arrive in rdi, rsi, rdx, rcx, r8, r9
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define num_bits r8 ; same register as num_bytes (bit-length variant)
+%define p_ivlen r9
+%else ; otherwise the first four arguments arrive in rcx, rdx, r8, r9
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10 ; 5th argument: loaded from [rsp + 8*5] by DO_CNTR
+%define num_bits r10 ; same register as num_bytes (bit-length variant)
+%define p_ivlen qword [rsp + 8*6] ; 6th argument, read directly from the stack
+%endif
+
+%define tmp r11 ; general scratch
+
+%define r_bits r12 ; number of bits (0-7) in the last, partial byte (CNTR_BIT only)
+%define tmp2 r13 ; scratch passed to XVPSLLB (CNTR_BIT only)
+%define mask r14 ; bit mask for the last, partial byte (CNTR_BIT only)
+
+%macro do_aes_load 2 ; %1 = number of blocks, %2 = counter type; also loads the resident round keys
+ do_aes %1, %2, 1
+%endmacro
+
+%macro do_aes_noload 2 ; %1 = number of blocks, %2 = counter type; resident round keys must already be loaded
+ do_aes %1, %2, 0
+%endmacro
+
+; do_aes num_in_par cntr_type load_keys: CTR-process num_in_par 16-byte blocks from [p_in] to [p_out] and advance xcounter by num_in_par
+; This increments p_in, but not p_out
+%macro do_aes 3
+%define %%by %1 ; number of blocks processed by this expansion
+%define %%cntr_type %2 ; counter type; only CNTR_BIT is treated specially here
+%define %%load_keys %3 ; 1 = load xkey0/4/8/12, 0 = reuse them
+
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+
+ vpshufb xdata0, xcounter, xbyteswap ; block 0 = byte-swapped current counter
+%assign i 1
+%rep (%%by - 1)
+ vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)] ; block i = counter + ddq_add_i (32-bit lane add)
+ vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 1*16]
+
+ vpxor xdata0, xkey0 ; round 0 for block 0
+%ifidn %%cntr_type, CNTR_BIT
+ vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] ; CNTR_BIT: advance the counter with a 32-bit lane add
+%else
+ vpaddq xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] ; otherwise: advance the counter with a 64-bit lane add
+%endif
+
+%assign i 1
+%rep (%%by - 1)
+ vpxor CONCAT(xdata,i), xkey0 ; round 0 for the remaining blocks
+%assign i (i + 1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 3*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by ; advance now; the input loads below use negative offsets
+
+%if (%%load_keys)
+ vmovdqa xkey4, [p_keys + 4*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 3
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 6*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 6
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey8, [p_keys + 8*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 9*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 9
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 11*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey12, [p_keys + 12*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 11
+%assign i (i+1)
+%endrep
+
+
+%assign i 0
+%rep %%by
+ vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; key 12
+%assign i (i+1)
+%endrep
+
+; XOR the keystream with the input, two blocks per step (xkeyA/xkeyB are free to hold input now)
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ VMOVDQ xkeyB, [p_in + j*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+ vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by) ; odd block count: one block is left over
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%endif
+
+%ifidn %%cntr_type, CNTR_BIT
+ ;; check if this is the end of the message
+ mov tmp, num_bytes
+ and tmp, ~(%%by*16) ; non-zero if anything remains beyond the %%by blocks handled here
+ jnz %%skip_preserve
+ ;; Check if there is a partial byte
+ or r_bits, r_bits ; sets ZF when r_bits == 0
+ jz %%skip_preserve
+
+%assign idx (%%by - 1)
+ ;; Load output to get last partial byte
+ vmovdqu xtmp, [p_out + idx * 16] ; xtmp aliases xcounter: the counter value is overwritten here
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx ; rcx is an argument register in both builds; cl is needed as the shift count
+ mov mask, 0xff
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask ; xtmp2 aliases xbyteswap: the shuffle mask is overwritten here
+ vpslldq xtmp2, 15 ; move the mask byte to byte 15, the last byte of the block
+ ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+ ;; in the partial byte
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear all bits from the input that are not to be ciphered
+ vpandn CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx)
+ vpor CONCAT(xdata,idx), xtmp ; merge the preserved output bits into the last block
+
+%%skip_preserve:
+%endif
+
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+;; Macro performing AES-CTR.
+;; Expands to a complete function body (including ret); reads in/IV/keys/out/length from the argument registers.
+%macro DO_CNTR 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM)
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5] ; 5th argument is passed on the stack in the non-LINUX build
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ push r12 ; r12/r13/r14 (r_bits, tmp2, mask) are used only by the bit-length variant
+ push r13
+ push r14
+%endif
+
+ vmovdqa xbyteswap, [rel byteswap_const]
+%ifidn %%CNTR_TYPE, CNTR
+ test p_ivlen, 16 ; bit 4 of iv_len set -> the IV is a full 16-byte counter block
+ jnz %%iv_is_16_bytes
+ ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+ mov DWORD(tmp), 0x01000000 ; stored little-endian this gives bytes 00 00 00 01
+ vpinsrq xcounter, [p_IV], 0
+ vpinsrd xcounter, [p_IV + 8], 2
+ vpinsrd xcounter, DWORD(tmp), 3
+
+%else ;; CNTR_BIT (and any other type that is not CNTR)
+ ; Read 16 byte IV: Nonce + 8-byte block counter (BE)
+ vmovdqu xcounter, [p_IV]
+%endif
+%%bswap_iv:
+ vpshufb xcounter, xbyteswap ; keep the counter byte-swapped so the vpadd* increments apply
+
+ ;; calculate len
+ ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov r_bits, num_bits
+ add num_bits, 7
+ shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
+ and r_bits, 7 ; Check if there are remainder bits (0-7)
+%endif
+
+ mov tmp, num_bytes
+ and tmp, 7*16 ; tmp = 16 * (number of full blocks modulo 8)
+ jz %%chk ; no 1..7 block remainder: only groups of 8 blocks and/or a partial block remain
+
+ ; 1 <= tmp/16 <= 7: process that many leading blocks first
+ cmp tmp, 4*16
+ jg %%gt4
+ je %%eq4
+
+%%lt4:
+ cmp tmp, 2*16
+ jg %%eq3
+ je %%eq2
+%%eq1:
+ do_aes_load 1, %%CNTR_TYPE
+ add p_out, 1*16
+ jmp %%chk
+
+%%eq2:
+ do_aes_load 2, %%CNTR_TYPE
+ add p_out, 2*16
+ jmp %%chk
+
+%%eq3:
+ do_aes_load 3, %%CNTR_TYPE
+ add p_out, 3*16
+ jmp %%chk
+
+%%eq4:
+ do_aes_load 4, %%CNTR_TYPE
+ add p_out, 4*16
+ jmp %%chk
+
+%%gt4:
+ cmp tmp, 6*16
+ jg %%eq7
+ je %%eq6
+
+%%eq5:
+ do_aes_load 5, %%CNTR_TYPE
+ add p_out, 5*16
+ jmp %%chk
+
+%%eq6:
+ do_aes_load 6, %%CNTR_TYPE
+ add p_out, 6*16
+ jmp %%chk
+
+%%eq7:
+ do_aes_load 7, %%CNTR_TYPE
+ add p_out, 7*16
+ ; fall through to chk
+%%chk:
+ and num_bytes, ~(7*16) ; clear bits 4-6: the 1..7 leading blocks are accounted for
+ jz %%do_return2 ; nothing left
+
+ cmp num_bytes, 16
+ jb %%last ; fewer than 16 bytes left: only a partial block remains
+
+ ; process multiples of 8 blocks
+ vmovdqa xkey0, [p_keys + 0*16] ; load the resident keys for the do_aes_noload loop
+ vmovdqa xkey4, [p_keys + 4*16]
+ vmovdqa xkey8, [p_keys + 8*16]
+ vmovdqa xkey12, [p_keys + 12*16]
+ jmp %%main_loop2
+
+align 32
+%%main_loop2:
+ ; num_bytes is a multiple of 8 blocks + partial bytes
+ do_aes_noload 8, %%CNTR_TYPE
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ cmp num_bytes, 8*16
+ jae %%main_loop2 ; at least one more full group of 8 blocks
+
+ ; Check if there is a partial block
+ or num_bytes, num_bytes ; sets ZF when num_bytes == 0
+ jnz %%last
+
+%%do_return2:
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ pop r14 ; restore in reverse order of the pushes at entry
+ pop r13
+ pop r12
+%endif
+
+ ret
+
+%%last:
+
+ ; load partial block into XMM register
+ simd_load_avx_15_1 xpart, p_in, num_bytes ; macro is not defined in this file (presumably from include/memcpy.asm)
+
+%%final_ctr_enc:
+ ; Encryption of a single partial block
+ vpshufb xcounter, xbyteswap ; swap the counter back before it is encrypted
+ vmovdqa xdata0, xcounter
+ vpxor xdata0, [p_keys + 16*0] ; round 0
+%assign i 1
+%rep 11
+ vaesenc xdata0, [p_keys + 16*i] ; rounds 1..11, keys read straight from memory
+%assign i (i+1)
+%endrep
+ ; created keystream
+ vaesenclast xdata0, [p_keys + 16*i] ; i == 12 here: final round
+
+ ; xor keystream with the message (scratch)
+ vpxor xdata0, xpart
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; Check if there is a partial byte
+ or r_bits, r_bits ; sets ZF when r_bits == 0
+ jz %%store_output
+
+ ;; Load output to get last partial byte
+ simd_load_avx_15_1 xtmp, p_out, num_bytes ; xtmp aliases xcounter, which is no longer needed
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx ; rcx is an argument register in both builds; cl is needed as the shift count
+ mov mask, 0xff
+%ifidn r_bits, rcx
+%error "r_bits cannot be mapped to rcx!"
+%endif
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask ; xtmp2 aliases xbyteswap, which is no longer needed
+
+ ;; Get number of full bytes in last block of 16 bytes
+ mov tmp, num_bytes
+ dec tmp
+ XVPSLLB xtmp2, tmp, xtmp3, tmp2 ; macro not defined in this file; presumably shifts xtmp2 left by tmp bytes
+ ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+ ;; in the partial byte
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear the bits from the input that are not to be ciphered
+ vpandn xdata0, xtmp2, xdata0
+ vpor xdata0, xtmp ; merge the preserved output bits into the result
+%endif
+
+%%store_output:
+ ; copy result into the output buffer
+ simd_store_avx_15 p_out, xdata0, num_bytes, tmp, rax ; tmp and rax are handed to the macro (presumably as scratch)
+
+ jmp %%do_return2
+
+%%iv_is_16_bytes:
+ ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+ vmovdqu xcounter, [p_IV] ; only the CNTR expansion jumps here
+ jmp %%bswap_iv
+%endmacro
+
+align 32 ; 32-byte align the first entry point emitted below
+%ifdef CNTR_CCM_AVX ; CCM build: emit only the CCM entry point
+; JOB_AES_HMAC * aes_cntr_ccm_192_avx(JOB_AES_HMAC *job)
+; arg 1 : job -- NOTE(review): DO_CNTR in this file reads in/IV/keys/out/length from the argument registers for every type; confirm this prototype
+MKGLOBAL(aes_cntr_ccm_192_avx,function,internal)
+aes_cntr_ccm_192_avx:
+ DO_CNTR CCM ; the function body, including ret, is the DO_CNTR expansion
+%else ; regular build: emit the byte-length and the bit-length entry points
+;; aes_cntr_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes,
+;; UINT64 iv_len)
+MKGLOBAL(aes_cntr_192_avx,function,internal)
+aes_cntr_192_avx:
+ DO_CNTR CNTR ; the function body, including ret, is the DO_CNTR expansion
+
+;; aes_cntr_bit_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits,
+;; UINT64 iv_len)
+MKGLOBAL(aes_cntr_bit_192_avx,function,internal)
+aes_cntr_bit_192_avx:
+ DO_CNTR CNTR_BIT ; length is in bits; this expansion saves and restores r12-r14
+%endif ;; CNTR_CCM_AVX
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits ; GNU convention: this object does not need an executable stack
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm
new file mode 100644
index 000000000..6a8f100ec
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm
@@ -0,0 +1,344 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES256 CBC decrypt "by8" (up to 8 blocks are decrypted in parallel)
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+%include "include/os.asm"
+
+%define CONCAT(a,b) a %+ b ; token paste, e.g. CONCAT(xdata,3) -> xdata3
+%define VMOVDQ vmovdqu ; unaligned move used for every input/output data access
+
+%define xdata0 xmm0 ; xdata0..xdata7: the (up to) 8 blocks being decrypted
+%define xdata1 xmm1
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xIV xmm8 ; chaining value: the IV first, then the previous ciphertext block
+%define xkey0 xmm9 ; round keys 0/3/6/9/12 are kept resident in registers
+%define xkey3 xmm10
+%define xkey6 xmm11
+%define xkey9 xmm12
+%define xkey12 xmm13
+%define xkeyA xmm14 ; xkeyA/xkeyB: scratch for the other round keys,
+%define xkeyB xmm15 ; reloaded from [p_keys] in every do_aes expansion
+
+%ifdef LINUX ; arguments arrive in rdi, rsi, rdx, rcx, r8
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else ; otherwise the first four arguments arrive in rcx, rdx, r8, r9
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax ; 5th argument: loaded from [rsp + 8*5] at function entry
+%endif
+
+%define tmp r10 ; scratch: holds (num_bytes & 7*16) for the block-count dispatch
+
+%macro do_aes_load 1 ; %1 = number of blocks; also loads the resident round keys
+ do_aes %1, 1
+%endmacro
+
+%macro do_aes_noload 1 ; %1 = number of blocks; resident round keys must already be loaded
+ do_aes %1, 0
+%endmacro
+
+; do_aes num_in_par load_keys: CBC-decrypt num_in_par consecutive 16-byte blocks from [p_in] to [p_out]; load_keys=1 also loads xkey0/3/6/9/12
+; This increments p_in, but not p_out. On exit xIV holds the last ciphertext block that was consumed.
+%macro do_aes 2
+%define %%by %1 ; number of blocks processed by this expansion
+%define %%load_keys %2 ; 1 = load the resident round keys, 0 = reuse them
+
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+
+%assign i 0
+%rep %%by
+ VMOVDQ CONCAT(xdata,i), [p_in + i*16] ; load ciphertext block i
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 1*16]
+
+%assign i 0
+%rep %%by
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 ; round 0: initial key XOR
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 2*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; round 1
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by ; advance now; later input reloads use negative offsets
+
+%if (%%load_keys)
+ vmovdqa xkey3, [p_keys + 3*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; round 2
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 4*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey3 ; round 3
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 5*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; round 4
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey6, [p_keys + 6*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; round 5
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 7*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 ; round 6
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 8*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; round 7
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey9, [p_keys + 9*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; round 8
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 10*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey9 ; round 9
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 11*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; round 10
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey12, [p_keys + 12*16]
+%endif
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; round 11
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 13*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; round 12
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 14*16]
+
+%assign i 0
+%rep %%by
+ vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; round 13
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; round 14 (final)
+%assign i (i+1)
+%endrep
+
+ vpxor xdata0, xdata0, xIV ; block 0 is chained with the incoming IV
+%assign i 1
+%if (%%by > 1)
+%rep (%%by - 1)
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] ; reload ciphertext block i-1 (p_in was already advanced)
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV ; block i is chained with ciphertext block i-1
+%assign i (i+1)
+%endrep
+%endif
+ VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] ; i == %%by here: last ciphertext block becomes the next IV
+
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i) ; stores are issued only after every input reload above
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+;; AES-256 CBC decrypt. num_bytes == 0 is not special-cased: it takes the mult_of_8_blks path and enters main_loop2.
+;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+MKGLOBAL(aes_cbc_dec_256_avx,function,internal)
+aes_cbc_dec_256_avx:
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5] ; 5th argument is passed on the stack in the non-LINUX build
+%endif
+
+ vmovdqu xIV, [p_IV]
+
+ mov tmp, num_bytes
+ and tmp, 7*16 ; tmp = 16 * (number of blocks modulo 8)
+ jz mult_of_8_blks
+
+ ; 1 <= tmp/16 <= 7: decrypt that many leading blocks first
+ cmp tmp, 4*16
+ jg gt4
+ je eq4
+
+lt4:
+ cmp tmp, 2*16
+ jg eq3
+ je eq2
+eq1:
+ do_aes_load 1
+ add p_out, 1*16
+ and num_bytes, ~7*16 ; NASM evaluates ~7*16 as (~7)*16 = -128: keeps only whole groups of 8 blocks
+ jz do_return2
+ jmp main_loop2
+
+eq2:
+ do_aes_load 2
+ add p_out, 2*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+eq3:
+ do_aes_load 3
+ add p_out, 3*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+eq4:
+ do_aes_load 4
+ add p_out, 4*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+gt4:
+ cmp tmp, 6*16
+ jg eq7
+ je eq6
+
+eq5:
+ do_aes_load 5
+ add p_out, 5*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+eq6:
+ do_aes_load 6
+ add p_out, 6*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+eq7:
+ do_aes_load 7
+ add p_out, 7*16
+ and num_bytes, ~7*16 ; mask is -128, see eq1
+ jz do_return2
+ jmp main_loop2
+
+mult_of_8_blks:
+ vmovdqa xkey0, [p_keys + 0*16] ; no do_aes_load ran on this path, so load the resident keys here
+ vmovdqa xkey3, [p_keys + 3*16]
+ vmovdqa xkey6, [p_keys + 6*16]
+ vmovdqa xkey9, [p_keys + 9*16]
+ vmovdqa xkey12, [p_keys + 12*16]
+
+main_loop2:
+ ; num_bytes is a multiple of 8 blocks (8*16 bytes) and >0
+ do_aes_noload 8
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ jne main_loop2 ; loop until the byte count reaches exactly zero
+
+do_return2:
+; Don't write back IV
+; vmovdqu [p_IV], xIV
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits ; GNU convention: this object does not need an executable stack
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm
new file mode 100644
index 000000000..e201339da
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm
@@ -0,0 +1,516 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+%include "include/reg_sizes.asm"
+
+; routine to do AES256 CNTR enc/decrypt "by8"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+extern byteswap_const
+extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
+extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8
+
+%define CONCAT(a,b) a %+ b ; token-paste; used to form xdata<i> and ddq_add_<i>
+%define VMOVDQ vmovdqu
+;; Register aliases. Several XMM registers carry two names; see the notes on each line.
+%define xdata0 xmm0
+%define xdata1 xmm1
+%define xpart xmm1 ; same register as xdata1; holds the partial input block in the tail path
+%define xdata2 xmm2
+%define xdata3 xmm3
+%define xdata4 xmm4
+%define xdata5 xmm5
+%define xdata6 xmm6
+%define xdata7 xmm7
+%define xcounter xmm8
+%define xtmp xmm8 ; same register as xcounter
+%define xbyteswap xmm9
+%define xtmp2 xmm9 ; same register as xbyteswap
+%define xkey0 xmm10
+%define xtmp3 xmm10 ; same register as xkey0
+%define xkey4 xmm11
+%define xkey8 xmm12
+%define xkey12 xmm13
+%define xkeyA xmm14 ; rotating round-key register (also reused as input scratch in do_aes)
+%define xkeyB xmm15 ; rotating round-key register (also reused as input scratch in do_aes)
+
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%define num_bits r8 ; same register as num_bytes
+%define p_ivlen r9
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes r10 ; loaded from [rsp + 8*5] at the start of DO_CNTR
+%define num_bits r10 ; same register as num_bytes
+%define p_ivlen qword [rsp + 8*6] ; memory operand: only valid while rsp is at its entry value
+%endif
+
+%define tmp r11
+;; r12-r14 are used (and pushed/popped by DO_CNTR) only for the CNTR_BIT type
+%define r_bits r12
+%define tmp2 r13
+%define mask r14
+
+%macro do_aes_load 2 ; args: num_in_par, cntr_type
+ do_aes %1, %2, 1 ; load_keys = 1: do_aes also loads xkey0/xkey4/xkey8/xkey12
+%endmacro
+
+%macro do_aes_noload 2 ; args: num_in_par, cntr_type
+ do_aes %1, %2, 0 ; load_keys = 0: xkey0/xkey4/xkey8/xkey12 must already be loaded
+%endmacro
+
+; do_aes num_in_par, cntr_type, load_keys : CTR-process num_in_par blocks (14 AES rounds, keys at p_keys)
+; This increments p_in, but not p_out; xcounter is advanced with the ddq_add_<num_in_par> constant
+%macro do_aes 3
+%define %%by %1 ; number of blocks processed in parallel
+%define %%cntr_type %2 ; compared against CNTR_BIT below
+%define %%load_keys %3 ; non-zero: (re)load xkey0/xkey4/xkey8/xkey12 from p_keys
+
+%if (%%load_keys)
+ vmovdqa xkey0, [p_keys + 0*16]
+%endif
+;; Build the counter blocks: block i = byte-swapped (xcounter + ddq_add_<i>)
+ vpshufb xdata0, xcounter, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+ vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)]
+ vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 1*16]
+
+ vpxor xdata0, xkey0 ; round 0 (AddRoundKey) for block 0
+%ifidn %%cntr_type, CNTR_BIT
+ vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
+%else
+ vpaddq xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)]
+%endif
+;; NOTE(review): per-block counters above always use vpaddd, the running counter uses vpaddq for non-CNTR_BIT types - confirm intended counter width
+%assign i 1
+%rep (%%by - 1)
+ vpxor CONCAT(xdata,i), xkey0 ; round 0 for the remaining blocks
+%assign i (i + 1)
+%endrep
+;; Rounds 1-13: each round key is loaded one step ahead of the round that uses it
+ vmovdqa xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 3*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2
+%assign i (i+1)
+%endrep
+
+ add p_in, 16*%%by ; input is read later with a negative offset from the advanced p_in
+
+%if (%%load_keys)
+ vmovdqa xkey4, [p_keys + 4*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 3
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 6*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 6
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey8, [p_keys + 8*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 9*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 9
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 11*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+ vmovdqa xkey12, [p_keys + 12*16]
+%endif
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 11
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyA, [p_keys + 13*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; key 12
+%assign i (i+1)
+%endrep
+
+ vmovdqa xkeyB, [p_keys + 14*16]
+%assign i 0
+%rep %%by
+ vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 13
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+ vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 14
+%assign i (i+1)
+%endrep
+;; XOR the keystream with the input, two blocks at a time; xkeyA/xkeyB are reused as input scratch
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ VMOVDQ xkeyB, [p_in + j*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+ vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by) ; odd block count: one block is left over
+ VMOVDQ xkeyA, [p_in + i*16 - 16*%%by]
+ vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
+%endif
+
+%ifidn %%cntr_type, CNTR_BIT
+ ;; check if this is the end of the message
+ mov tmp, num_bytes
+ and tmp, ~(%%by*16) ; zero only if num_bytes has no bits set outside those of by*16
+ jnz %%skip_preserve
+ ;; Check if there is a partial byte
+ or r_bits, r_bits
+ jz %%skip_preserve
+ ;; From here on xtmp/xtmp2 overwrite xcounter/xbyteswap (same registers)
+%assign idx (%%by - 1)
+ ;; Load output to get last partial byte
+ vmovdqu xtmp, [p_out + idx * 16]
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx
+ mov mask, 0xff
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask
+ vpslldq xtmp2, 15 ; move the mask byte to byte 15, the last byte of the block
+ ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+ ;; in the partial byte
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear all bits from the input that are not to be ciphered
+ vpandn CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx)
+ vpor CONCAT(xdata,idx), xtmp
+
+%%skip_preserve:
+%endif
+;; Store all processed blocks (unaligned stores); p_out is advanced by the caller of this macro
+%assign i 0
+%rep %%by
+ VMOVDQ [p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+;; Macro performing AES-CTR.
+;; Emits a complete function body (including ret): 1-7 leading blocks, 8-block loop, partial tail.
+%macro DO_CNTR 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM)
+
+%ifndef LINUX
+ mov num_bytes, [rsp + 8*5] ; read before any push, so the offset is relative to entry rsp
+%endif
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ push r12 ; r_bits
+ push r13 ; tmp2
+ push r14 ; mask
+%endif
+
+ vmovdqa xbyteswap, [rel byteswap_const]
+%ifidn %%CNTR_TYPE, CNTR
+ test p_ivlen, 16 ; bit 4 of iv_len; nothing was pushed for CNTR, so the non-LINUX [rsp + 8*6] operand is valid
+ jnz %%iv_is_16_bytes
+ ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+ mov DWORD(tmp), 0x01000000
+ vpinsrq xcounter, [p_IV], 0
+ vpinsrd xcounter, [p_IV + 8], 2
+ vpinsrd xcounter, DWORD(tmp), 3
+
+%else ;; any type other than CNTR (CNTR_BIT, CCM)
+ ; Read 16 byte IV: Nonce + 8-byte block counter (BE)
+ vmovdqu xcounter, [p_IV]
+%endif
+%%bswap_iv:
+ vpshufb xcounter, xbyteswap ; xcounter is kept byte-swapped; do_aes swaps each block back
+
+ ;; calculate len
+ ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov r_bits, num_bits
+ add num_bits, 7
+ shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same
+ and r_bits, 7 ; Check if there are remainder bits (0-7)
+%endif
+
+ mov tmp, num_bytes
+ and tmp, 7*16 ; tmp = (full blocks mod 8) * 16
+ jz %%chk ; no 1..7 leading blocks: only groups of 8 blocks and/or a partial (<16 byte) block remain
+
+ ; 1*16 <= tmp <= 7*16
+ cmp tmp, 4*16
+ jg %%gt4
+ je %%eq4
+
+%%lt4:
+ cmp tmp, 2*16
+ jg %%eq3
+ je %%eq2
+%%eq1:
+ do_aes_load 1, %%CNTR_TYPE
+ add p_out, 1*16
+ jmp %%chk
+
+%%eq2:
+ do_aes_load 2, %%CNTR_TYPE
+ add p_out, 2*16
+ jmp %%chk
+
+%%eq3:
+ do_aes_load 3, %%CNTR_TYPE
+ add p_out, 3*16
+ jmp %%chk
+
+%%eq4:
+ do_aes_load 4, %%CNTR_TYPE
+ add p_out, 4*16
+ jmp %%chk
+
+%%gt4:
+ cmp tmp, 6*16
+ jg %%eq7
+ je %%eq6
+
+%%eq5:
+ do_aes_load 5, %%CNTR_TYPE
+ add p_out, 5*16
+ jmp %%chk
+
+%%eq6:
+ do_aes_load 6, %%CNTR_TYPE
+ add p_out, 6*16
+ jmp %%chk
+
+%%eq7:
+ do_aes_load 7, %%CNTR_TYPE
+ add p_out, 7*16
+ ; fall through to chk
+%%chk:
+ and num_bytes, ~(7*16) ; drop the leading blocks handled above
+ jz %%do_return2
+
+ cmp num_bytes, 16
+ jb %%last ; fewer than 16 bytes left: only a partial block
+
+ ; process multiples of 8 blocks
+ vmovdqa xkey0, [p_keys + 0*16]
+ vmovdqa xkey4, [p_keys + 4*16]
+ vmovdqa xkey8, [p_keys + 8*16]
+ vmovdqa xkey12, [p_keys + 12*16]
+ jmp %%main_loop2
+
+align 32
+%%main_loop2:
+ ; num_bytes is a multiple of 8 blocks + partial bytes
+ do_aes_noload 8, %%CNTR_TYPE
+ add p_out, 8*16
+ sub num_bytes, 8*16
+ cmp num_bytes, 8*16
+ jae %%main_loop2 ; unsigned compare: loop while at least 8 full blocks remain
+
+ ; Check if there is a partial block
+ or num_bytes, num_bytes
+ jnz %%last
+
+%%do_return2:
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ pop r14
+ pop r13
+ pop r12
+%endif
+
+ ret
+
+%%last:
+ ; reached with num_bytes < 16 (see the checks above)
+ ; load partial block into XMM register
+ simd_load_avx_15_1 xpart, p_in, num_bytes
+
+%%final_ctr_enc:
+ ; Encryption of a single partial block
+ vpshufb xcounter, xbyteswap
+ vmovdqa xdata0, xcounter
+ vpxor xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 13
+ vaesenc xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+ ; created keystream
+ vaesenclast xdata0, [p_keys + 16*i] ; i == 14 here: final round key
+
+ ; xor keystream with the message (scratch)
+ vpxor xdata0, xpart
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; Check if there is a partial byte
+ or r_bits, r_bits
+ jz %%store_output
+
+ ;; Load output to get last partial byte
+ simd_load_avx_15_1 xtmp, p_out, num_bytes
+
+ ;; Save RCX in temporary GP register
+ mov tmp, rcx
+ mov mask, 0xff
+%ifidn r_bits, rcx
+%error "r_bits cannot be mapped to rcx!"
+%endif
+ mov cl, BYTE(r_bits)
+ shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, tmp
+
+ vmovq xtmp2, mask
+
+ ;; Get number of full bytes in last block of 16 bytes
+ mov tmp, num_bytes
+ dec tmp
+ XVPSLLB xtmp2, tmp, xtmp3, tmp2
+ ;; At this point, xtmp2 contains a mask with all 0s, but with some ones
+ ;; in the partial byte
+
+ ;; Clear all the bits that do not need to be preserved from the output
+ vpand xtmp, xtmp, xtmp2
+
+ ;; Clear the bits from the input that are not to be ciphered
+ vpandn xdata0, xtmp2, xdata0
+ vpor xdata0, xtmp
+%endif
+
+%%store_output:
+ ; copy result into the output buffer
+ simd_store_avx_15 p_out, xdata0, num_bytes, tmp, rax
+
+ jmp %%do_return2
+
+%%iv_is_16_bytes:
+ ; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+ vmovdqu xcounter, [p_IV]
+ jmp %%bswap_iv
+%endmacro
+
+align 32
+%ifdef CNTR_CCM_AVX
+; JOB_AES_HMAC * aes_cntr_ccm_256_avx(JOB_AES_HMAC *job)
+; arg 1 : job
+MKGLOBAL(aes_cntr_ccm_256_avx,function,internal)
+aes_cntr_ccm_256_avx:
+ DO_CNTR CCM ; NOTE(review): DO_CNTR in this file has no CCM-specific code and reads p_in/p_IV/p_keys/p_out/num_bytes from argument registers - confirm against the job-pointer signature above
+%else
+;; aes_cntr_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes,
+;; UINT64 iv_len)
+MKGLOBAL(aes_cntr_256_avx,function,internal)
+aes_cntr_256_avx:
+ DO_CNTR CNTR ; the macro expands to the whole function body, including ret
+
+;; aes_cntr_bit_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits,
+;; UINT64 iv_len)
+MKGLOBAL(aes_cntr_bit_256_avx,function,internal)
+aes_cntr_bit_256_avx:
+ DO_CNTR CNTR_BIT ; length is given in bits; this variant also saves/restores r12-r14
+%endif ;; CNTR_CCM_AVX
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm
new file mode 100644
index 000000000..745a8e4d4
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm
@@ -0,0 +1,494 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do a 128 bit CBC AES encrypt and CBC MAC
+
+;; clobbers all registers except for ARG1 and rbp (CBC_MAC builds also preserve rbx, r12-r15 and, on non-LINUX, rsi/rdi)
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+%macro VPXOR2 2 ; two-operand XOR: %1 = %1 xor %2
+ vpxor %1, %1, %2
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS {
+;; void* in[8];
+;; void* out[8];
+;; UINT128* keys[8];
+;; UINT128 IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_128_x8(AES_ARGS *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS structure
+;; arg 2: LEN : len (in units of bytes)
+
+struc STACK
+_gpr_save: resq 8 ; save slots for rbp, rbx, r12-r15, rsi, rdi (slots 1-7 are used only when CBC_MAC is defined)
+_len: resq 1 ; copy of LEN, needed because the LEN register is reused as IN6
+endstruc
+
+%define GPR_SAVE_AREA rsp + _gpr_save
+%define LEN_AREA rsp + _len
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx ; arg3/arg4 are only used as scratch (KEYS0/KEYS1); the function takes two arguments
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi ; scratch (KEYS0); saved/restored only when CBC_MAC is defined
+%define arg4 rsi ; scratch (KEYS1); saved/restored only when CBC_MAC is defined
+%endif
+
+%define ARG arg1
+%define LEN arg2
+
+%define IDX rax ; byte offset of the current block within each lane's buffer
+%define TMP rbx ; scratch pointer for the odd input lanes and for all output lanes
+;; One key-schedule pointer per lane
+%define KEYS0 arg3
+%define KEYS1 arg4
+%define KEYS2 rbp
+%define KEYS3 r8
+%define KEYS4 r9
+%define KEYS5 r10
+%define KEYS6 r11
+%define KEYS7 r12
+;; Only the even input lanes have a dedicated register; odd lanes are reloaded through TMP
+%define IN0 r13
+%define IN2 r14
+%define IN4 r15
+%define IN6 LEN ; reuses the LEN register, so LEN is spilled to LEN_AREA first
+
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XDATA4 xmm4
+%define XDATA5 xmm5
+%define XDATA6 xmm6
+%define XDATA7 xmm7
+;; XKEYn_r caches round key r of lane n
+%define XKEY0_3 xmm8
+%define XKEY1_4 xmm9
+%define XKEY2_5 xmm10
+%define XKEY3_6 xmm11
+%define XKEY4_7 xmm12
+%define XKEY5_8 xmm13
+%define XKEY6_9 xmm14
+%define XTMP xmm15
+
+section .text
+%ifdef CBC_MAC
+MKGLOBAL(aes128_cbc_mac_x8,function,internal)
+aes128_cbc_mac_x8:
+%else
+MKGLOBAL(aes_cbc_enc_128_x8,function,internal)
+aes_cbc_enc_128_x8:
+%endif
+ sub rsp, STACK_size
+ mov [GPR_SAVE_AREA + 8*0], rbp
+%ifdef CBC_MAC
+ mov [GPR_SAVE_AREA + 8*1], rbx ; without CBC_MAC these registers are written but not saved here
+ mov [GPR_SAVE_AREA + 8*2], r12
+ mov [GPR_SAVE_AREA + 8*3], r13
+ mov [GPR_SAVE_AREA + 8*4], r14
+ mov [GPR_SAVE_AREA + 8*5], r15
+%ifndef LINUX
+ mov [GPR_SAVE_AREA + 8*6], rsi
+ mov [GPR_SAVE_AREA + 8*7], rdi
+%endif
+%endif
+
+ mov IDX, 16 ; the first block is handled before main_loop, so IDX starts at the second block
+ mov [LEN_AREA], LEN ; spill LEN: its register is about to be reused as IN6
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov IN0, [ARG + _aesarg_in + 8*0]
+ mov IN2, [ARG + _aesarg_in + 8*2]
+ mov IN4, [ARG + _aesarg_in + 8*4]
+ mov IN6, [ARG + _aesarg_in + 8*6]
+
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VMOVDQ XDATA0, [IN0] ; load first block of plain text
+ VMOVDQ XDATA1, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VMOVDQ XDATA2, [IN2] ; load first block of plain text
+ VMOVDQ XDATA3, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VMOVDQ XDATA4, [IN4] ; load first block of plain text
+ VMOVDQ XDATA5, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VMOVDQ XDATA6, [IN6] ; load first block of plain text
+ VMOVDQ XDATA7, [TMP] ; load first block of plain text
+
+
+ VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+ VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+ VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+ VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+ VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV
+ VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV
+ VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV
+ VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV
+
+ mov KEYS0, [ARG + _aesarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesarg_keys + 8*3]
+ mov KEYS4, [ARG + _aesarg_keys + 8*4]
+ mov KEYS5, [ARG + _aesarg_keys + 8*5]
+ mov KEYS6, [ARG + _aesarg_keys + 8*6]
+ mov KEYS7, [ARG + _aesarg_keys + 8*7]
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0] ; fetch lane 0 output pointer ahead of the write-back
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+%ifndef CBC_MAC
+ VMOVDQ [TMP], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP], XDATA7 ; write back ciphertext
+%endif
+ cmp [LEN_AREA], IDX
+ je done ; LEN == 16: the single block is already done
+;; Loop ends only when IDX == LEN exactly, so LEN must be a non-zero multiple of 16
+main_loop:
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VPXOR2 XDATA0, [IN0 + IDX] ; previous ciphertext XOR next block of plain text
+ VPXOR2 XDATA1, [TMP + IDX] ; previous ciphertext XOR next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VPXOR2 XDATA2, [IN2 + IDX] ; previous ciphertext XOR next block of plain text
+ VPXOR2 XDATA3, [TMP + IDX] ; previous ciphertext XOR next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VPXOR2 XDATA4, [IN4 + IDX] ; previous ciphertext XOR next block of plain text
+ VPXOR2 XDATA5, [TMP + IDX] ; previous ciphertext XOR next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VPXOR2 XDATA6, [IN6 + IDX] ; previous ciphertext XOR next block of plain text
+ VPXOR2 XDATA7, [TMP + IDX] ; previous ciphertext XOR next block of plain text
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0] ; fetch lane 0 output pointer ahead of the write-back
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+%ifndef CBC_MAC
+ ;; no ciphertext write back for CBC-MAC
+ VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext
+%endif
+ add IDX, 16
+ cmp [LEN_AREA], IDX
+ jne main_loop
+
+done:
+ ;; update IV for AES128-CBC / store digest for CBC-MAC
+ vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0 ; aligned store: the IV fields must be 16-byte aligned
+ vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1
+ vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2
+ vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3
+ vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4
+ vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5
+ vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6
+ vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7
+
+ ;; update IN and OUT
+ vmovd xmm0, [LEN_AREA] ; loads only the low 32 bits of the saved length
+ vpshufd xmm0, xmm0, 0x44 ; replicate the low qword so both 64-bit lanes hold the length
+ vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0] ; each 16-byte access covers two lane pointers
+ vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1]
+ vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2]
+ vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3]
+ vmovdqa [ARG + _aesarg_in + 16*0], xmm1
+ vmovdqa [ARG + _aesarg_in + 16*1], xmm2
+ vmovdqa [ARG + _aesarg_in + 16*2], xmm3
+ vmovdqa [ARG + _aesarg_in + 16*3], xmm4
+%ifndef CBC_MAC
+ vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0]
+ vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1]
+ vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2]
+ vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3]
+ vmovdqa [ARG + _aesarg_out + 16*0], xmm5
+ vmovdqa [ARG + _aesarg_out + 16*1], xmm6
+ vmovdqa [ARG + _aesarg_out + 16*2], xmm7
+ vmovdqa [ARG + _aesarg_out + 16*3], xmm8
+%endif
+
+ ;; XMMs are saved at a higher level
+ mov rbp, [GPR_SAVE_AREA + 8*0]
+%ifdef CBC_MAC
+ mov rbx, [GPR_SAVE_AREA + 8*1]
+ mov r12, [GPR_SAVE_AREA + 8*2]
+ mov r13, [GPR_SAVE_AREA + 8*3]
+ mov r14, [GPR_SAVE_AREA + 8*4]
+ mov r15, [GPR_SAVE_AREA + 8*5]
+%ifndef LINUX
+ mov rsi, [GPR_SAVE_AREA + 8*6]
+ mov rdi, [GPR_SAVE_AREA + 8*7]
+%endif
+%endif
+
+ add rsp, STACK_size
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm
new file mode 100644
index 000000000..e446f13c3
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm
@@ -0,0 +1,501 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+
+;; clobbers all registers except for ARG1 and rbp
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned (unaligned move is used for all caller data buffers)
+
+%macro VPXOR2 2 ;; VPXOR2 dst, src : two-operand XOR, dst = dst XOR src
+ vpxor %1, %1, %2 ;; expands to the three-operand VEX form with dst as first source
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS {
+;; void* in[8];
+;; void* out[8];
+;; UINT128* keys[8];
+;; UINT128 IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_192_x8(AES_ARGS *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS structure
+;; arg 2: LEN : len (in units of bytes)
+
+struc STACK ; local stack frame, allocated with "sub rsp, STACK_size"
+_gpr_save: resq 1 ; slot for the caller's rbp (rbp is reused as KEYS2)
+_len: resq 1 ; copy of LEN (the LEN register is reused as IN6)
+endstruc
+
+%define GPR_SAVE_AREA rsp + _gpr_save ; address expression of the rbp save slot
+%define LEN_AREA rsp + _len ; address expression of the saved length
+
+%ifdef LINUX ; Linux build: first two integer args arrive in rdi, rsi
+%define ARG rdi ; arg 1: AES_ARGS pointer
+%define LEN rsi ; arg 2: length in bytes
+%define REG3 rcx ; free register, aliased below as KEYS0
+%define REG4 rdx ; free register, aliased below as KEYS1
+%else ; non-Linux build: first two integer args arrive in rcx, rdx
+%define ARG rcx ; arg 1: AES_ARGS pointer
+%define LEN rdx ; arg 2: length in bytes
+%define REG3 rsi ; aliased below as KEYS0 (overwritten, not restored by this routine)
+%define REG4 rdi ; aliased below as KEYS1 (overwritten, not restored by this routine)
+%endif
+
+%define IDX rax ; byte offset of the current block within every buffer
+%define TMP rbx ; scratch pointer: odd-lane input pointers and all output pointers
+
+%define KEYS0 REG3 ; KEYSn = pointer to the expanded key schedule of lane n
+%define KEYS1 REG4
+%define KEYS2 rbp ; rbp is saved on entry and restored before ret
+%define KEYS3 r8
+%define KEYS4 r9
+%define KEYS5 r10
+%define KEYS6 r11
+%define KEYS7 r12
+
+%define IN0 r13 ; input pointers of the even lanes are kept in registers;
+%define IN2 r14 ; odd-lane input pointers are reloaded from AES_ARGS via TMP
+%define IN4 r15
+%define IN6 LEN ; reuses the LEN register; the length itself lives in LEN_AREA
+
+%define XDATA0 xmm0 ; XDATAn = current block of lane n (plaintext in, ciphertext out)
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XDATA4 xmm4
+%define XDATA5 xmm5
+%define XDATA6 xmm6
+%define XDATA7 xmm7
+
+%define XKEY0_3 xmm8 ; XKEYn_r = round key r of lane n, cached in a register
+%define XKEY1_4 xmm9 ; during the first block and reused by the main loop
+%define XKEY2_5 xmm10
+%define XKEY3_6 xmm11
+%define XKEY4_7 xmm12
+%define XKEY5_8 xmm13
+%define XKEY6_9 xmm14
+%define XTMP xmm15 ; not referenced by the routine in this file
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_enc_192_x8(AES_ARGS *args, UINT64 len)
+;;
+;; CBC-encrypts 8 independent buffers ("lanes") side by side: every pass
+;; handles one 16-byte block per lane with round keys 0..12 of that lane.
+;;
+;; In:  ARG = args, LEN = number of bytes to process in every lane.
+;;      LEN must be a non-zero multiple of 16: the first block is
+;;      processed unconditionally and the loop only terminates when the
+;;      running offset IDX (stepping by 16) becomes exactly equal to LEN.
+;; Out: args->IV[n]  = last ciphertext block of lane n
+;;      args->in[n]  += LEN, args->out[n] += LEN   (n = 0..7)
+;;
+;; Alignment: vmovdqa is used on the in/out/IV arrays of AES_ARGS and on
+;; the key schedules of lanes 0-6, so those must be 16-byte aligned.
+;; The data buffers are only touched through VMOVDQ (vmovdqu) and VEX
+;; memory operands and may be unaligned.
+;;
+;; Clobbers: rax, rbx, REG3, REG4, r8-r15, the LEN register, xmm0-xmm14
+;; and flags. Only rbp is saved and restored here; rbx, r12-r15 (and
+;; rsi/rdi in the non-Linux build) are overwritten and NOT restored, so
+;; the caller has to preserve whatever it still needs.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_enc_192_x8,function,internal)
+aes_cbc_enc_192_x8:
+
+ sub rsp, STACK_size
+ mov [GPR_SAVE_AREA + 8*0], rbp ; rbp is about to become KEYS2
+
+ mov IDX, 16 ; offset of the block after the first one
+ mov [LEN_AREA], LEN ; keep len in memory, the LEN register becomes IN6
+
+ mov IN0, [ARG + _aesarg_in + 8*0]
+ mov IN2, [ARG + _aesarg_in + 8*2]
+ mov IN4, [ARG + _aesarg_in + 8*4]
+ mov IN6, [ARG + _aesarg_in + 8*6] ; LEN register is overwritten here
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; First block of every lane. Same round sequence as main_loop, but
+ ;; the chaining value comes from args->IV and the cached round keys
+ ;; XKEY0_3 .. XKEY6_9 are loaded on the way.
+
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VMOVDQ XDATA0, [IN0] ; load first block of plain text
+ VMOVDQ XDATA1, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VMOVDQ XDATA2, [IN2] ; load first block of plain text
+ VMOVDQ XDATA3, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VMOVDQ XDATA4, [IN4] ; load first block of plain text
+ VMOVDQ XDATA5, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VMOVDQ XDATA6, [IN6] ; load first block of plain text
+ VMOVDQ XDATA7, [TMP] ; load first block of plain text
+
+
+ VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+ VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+ VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+ VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+ VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV
+ VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV
+ VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV
+ VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV
+
+ ;; key-schedule pointers stay in registers for the whole call
+ mov KEYS0, [ARG + _aesarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesarg_keys + 8*3]
+ mov KEYS4, [ARG + _aesarg_keys + 8*4]
+ mov KEYS5, [ARG + _aesarg_keys + 8*5]
+ mov KEYS6, [ARG + _aesarg_keys + 8*6]
+ mov KEYS7, [ARG + _aesarg_keys + 8*7]
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key (lane 0), kept for main_loop
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key (lane 1), kept for main_loop
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key (lane 2), kept for main_loop
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key (lane 3), kept for main_loop
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key (lane 4), kept for main_loop
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key (lane 5), kept for main_loop
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key (lane 6), kept for main_loop
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0] ; lane 0 output pointer, fetched early for the store below
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+ vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC
+ vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC
+ vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC
+ vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC
+
+
+ vaesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA4, [KEYS4 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA5, [KEYS5 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA6, [KEYS6 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA7, [KEYS7 + 16*12] ; 12. ENC (final round)
+
+ VMOVDQ [TMP], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP], XDATA7 ; write back ciphertext
+
+ cmp [LEN_AREA], IDX ; len == 16: nothing left after the first block
+ je done
+
+ ;; Invariant at main_loop: XDATAn still holds the ciphertext block that
+ ;; lane n just produced; it is the chaining value for the block at IDX.
+main_loop:
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VPXOR2 XDATA0, [IN0 + IDX] ; previous ciphertext XOR next plaintext
+ VPXOR2 XDATA1, [TMP + IDX] ; previous ciphertext XOR next plaintext
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VPXOR2 XDATA2, [IN2 + IDX] ; previous ciphertext XOR next plaintext
+ VPXOR2 XDATA3, [TMP + IDX] ; previous ciphertext XOR next plaintext
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VPXOR2 XDATA4, [IN4 + IDX] ; previous ciphertext XOR next plaintext
+ VPXOR2 XDATA5, [TMP + IDX] ; previous ciphertext XOR next plaintext
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VPXOR2 XDATA6, [IN6 + IDX] ; previous ciphertext XOR next plaintext
+ VPXOR2 XDATA7, [TMP + IDX] ; previous ciphertext XOR next plaintext
+
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC (cached key)
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC (cached key)
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC (cached key)
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC (cached key)
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC (cached key)
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC (cached key)
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0] ; lane 0 output pointer, fetched early for the store below
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC (cached key)
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+ vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC
+ vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC
+ vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC
+ vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC
+
+ vaesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA4, [KEYS4 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA5, [KEYS5 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA6, [KEYS6 + 16*12] ; 12. ENC (final round)
+ vaesenclast XDATA7, [KEYS7 + 16*12] ; 12. ENC (final round)
+
+
+ VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext
+
+ add IDX, 16
+ cmp [LEN_AREA], IDX ; exact 64-bit match ends the loop
+ jne main_loop
+
+done:
+ ;; update IV: the last ciphertext block of each lane is stored so a
+ ;; later call can continue the chain. Must happen before xmm0-xmm8
+ ;; are reused for the pointer update below.
+ vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0
+ vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1
+ vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2
+ vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3
+ vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4
+ vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5
+ vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6
+ vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7
+
+ ;; update IN and OUT: advance all 16 pointers by len, two 64-bit
+ ;; pointers per 128-bit add
+ vmovq xmm0, [LEN_AREA] ; full 64-bit len, matching the qword store and compare above
+ vpshufd xmm0, xmm0, 0x44 ; replicate the low qword: xmm0 = {len, len}
+ vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0]
+ vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1]
+ vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2]
+ vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3]
+ vmovdqa [ARG + _aesarg_in + 16*0], xmm1
+ vmovdqa [ARG + _aesarg_in + 16*1], xmm2
+ vmovdqa [ARG + _aesarg_in + 16*2], xmm3
+ vmovdqa [ARG + _aesarg_in + 16*3], xmm4
+ vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0]
+ vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1]
+ vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2]
+ vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3]
+ vmovdqa [ARG + _aesarg_out + 16*0], xmm5
+ vmovdqa [ARG + _aesarg_out + 16*1], xmm6
+ vmovdqa [ARG + _aesarg_out + 16*2], xmm7
+ vmovdqa [ARG + _aesarg_out + 16*3], xmm8
+
+;; XMMs are saved at a higher level
+ mov rbp, [GPR_SAVE_AREA + 8*0] ; the only GPR this routine restores
+
+ add rsp, STACK_size
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm
new file mode 100644
index 000000000..75cf285d9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm
@@ -0,0 +1,536 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+
+;; clobbers all registers except for ARG1 and rbp
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned (unaligned move is used for all caller data buffers)
+
+%macro VPXOR2 2 ;; VPXOR2 dst, src : two-operand XOR, dst = dst XOR src
+ vpxor %1, %1, %2 ;; expands to the three-operand VEX form with dst as first source
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_ARGS {
+;; void* in[8];
+;; void* out[8];
+;; UINT128* keys[8];
+;; UINT128 IV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_256_x8(AES_ARGS *args, UINT64 len);
+;; arg 1: ARG : addr of AES_ARGS structure
+;; arg 2: LEN : len (in units of bytes)
+
+struc STACK ; local stack frame, allocated with "sub rsp, STACK_size"
+_gpr_save: resq 1 ; slot for the caller's rbp (rbp is reused as KEYS2)
+_len: resq 1 ; copy of LEN (the LEN register is reused as IN6)
+endstruc
+
+%define GPR_SAVE_AREA rsp + _gpr_save ; address expression of the rbp save slot
+%define LEN_AREA rsp + _len ; address expression of the saved length
+
+%ifdef LINUX ; Linux build: first two integer args arrive in rdi, rsi
+%define ARG rdi ; arg 1: AES_ARGS pointer
+%define LEN rsi ; arg 2: length in bytes
+%define REG3 rcx ; free register, aliased below as KEYS0
+%define REG4 rdx ; free register, aliased below as KEYS1
+%else ; non-Linux build: first two integer args arrive in rcx, rdx
+%define ARG rcx ; arg 1: AES_ARGS pointer
+%define LEN rdx ; arg 2: length in bytes
+%define REG3 rsi ; aliased below as KEYS0
+%define REG4 rdi ; aliased below as KEYS1
+%endif
+
+%define IDX rax ; byte offset of the current block within every buffer
+%define TMP rbx ; scratch pointer: odd-lane input pointers and all output pointers
+
+%define KEYS0 REG3 ; KEYSn = pointer to the expanded key schedule of lane n
+%define KEYS1 REG4
+%define KEYS2 rbp ; rbp is stored to GPR_SAVE_AREA on entry
+%define KEYS3 r8
+%define KEYS4 r9
+%define KEYS5 r10
+%define KEYS6 r11
+%define KEYS7 r12
+
+%define IN0 r13 ; input pointers of the even lanes are kept in registers;
+%define IN2 r14 ; odd-lane input pointers are reloaded from AES_ARGS via TMP
+%define IN4 r15
+%define IN6 LEN ; reuses the LEN register; the length itself lives in LEN_AREA
+
+%define XDATA0 xmm0 ; XDATAn = current block of lane n (plaintext in, ciphertext out)
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XDATA4 xmm4
+%define XDATA5 xmm5
+%define XDATA6 xmm6
+%define XDATA7 xmm7
+
+%define XKEY0_3 xmm8 ; XKEYn_r = round key r of lane n, cached in a register
+%define XKEY1_4 xmm9 ; during the first block and reused by the main loop
+%define XKEY2_5 xmm10
+%define XKEY3_6 xmm11
+%define XKEY4_7 xmm12
+%define XKEY5_8 xmm13
+%define XKEY6_9 xmm14
+%define XTMP xmm15 ; NOTE(review): no use seen in the part of the routine reviewed - confirm before removing
+
+section .text
+MKGLOBAL(aes_cbc_enc_256_x8,function,internal)
+aes_cbc_enc_256_x8:
+
+ sub rsp, STACK_size
+ mov [GPR_SAVE_AREA + 8*0], rbp
+
+ mov IDX, 16
+ mov [LEN_AREA], LEN
+
+ mov IN0, [ARG + _aesarg_in + 8*0]
+ mov IN2, [ARG + _aesarg_in + 8*2]
+ mov IN4, [ARG + _aesarg_in + 8*4]
+ mov IN6, [ARG + _aesarg_in + 8*6]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VMOVDQ XDATA0, [IN0] ; load first block of plain text
+ VMOVDQ XDATA1, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VMOVDQ XDATA2, [IN2] ; load first block of plain text
+ VMOVDQ XDATA3, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VMOVDQ XDATA4, [IN4] ; load first block of plain text
+ VMOVDQ XDATA5, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VMOVDQ XDATA6, [IN6] ; load first block of plain text
+ VMOVDQ XDATA7, [TMP] ; load first block of plain text
+
+
+ VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV
+ VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV
+ VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV
+ VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV
+ VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV
+ VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV
+ VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV
+ VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV
+
+ mov KEYS0, [ARG + _aesarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesarg_keys + 8*3]
+ mov KEYS4, [ARG + _aesarg_keys + 8*4]
+ mov KEYS5, [ARG + _aesarg_keys + 8*5]
+ mov KEYS6, [ARG + _aesarg_keys + 8*6]
+ mov KEYS7, [ARG + _aesarg_keys + 8*7]
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0]
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+ vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC
+ vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC
+ vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC
+ vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC
+
+
+ vaesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC
+ vaesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC
+ vaesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC
+ vaesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC
+ vaesenc XDATA4, [KEYS4 + 16*12] ; 12. ENC
+ vaesenc XDATA5, [KEYS5 + 16*12] ; 12. ENC
+ vaesenc XDATA6, [KEYS6 + 16*12] ; 12. ENC
+ vaesenc XDATA7, [KEYS7 + 16*12] ; 12. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC
+ vaesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC
+ vaesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC
+ vaesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC
+ vaesenc XDATA4, [KEYS4 + 16*13] ; 13. ENC
+ vaesenc XDATA5, [KEYS5 + 16*13] ; 13. ENC
+ vaesenc XDATA6, [KEYS6 + 16*13] ; 13. ENC
+ vaesenc XDATA7, [KEYS7 + 16*13] ; 13. ENC
+
+ vaesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*14] ; 14. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*14] ; 14. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*14] ; 14. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*14] ; 14. ENC
+
+ VMOVDQ [TMP], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP], XDATA7 ; write back ciphertext
+
+ cmp [LEN_AREA], IDX
+ je done
+
+main_loop:
+ mov TMP, [ARG + _aesarg_in + 8*1]
+ VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text
+ VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*3]
+ VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text
+ VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*5]
+ VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text
+ VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesarg_in + 8*7]
+ VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text
+ VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text
+
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ mov TMP, [ARG + _aesarg_out + 8*0]
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC
+ vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC
+ vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC
+ vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC
+ vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC
+ vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC
+ vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC
+ vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC
+ vaesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC
+ vaesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC
+ vaesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC
+ vaesenc XDATA4, [KEYS4 + 16*12] ; 12. ENC
+ vaesenc XDATA5, [KEYS5 + 16*12] ; 12. ENC
+ vaesenc XDATA6, [KEYS6 + 16*12] ; 12. ENC
+ vaesenc XDATA7, [KEYS7 + 16*12] ; 12. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC
+ vaesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC
+ vaesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC
+ vaesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC
+ vaesenc XDATA4, [KEYS4 + 16*13] ; 13. ENC
+ vaesenc XDATA5, [KEYS5 + 16*13] ; 13. ENC
+ vaesenc XDATA6, [KEYS6 + 16*13] ; 13. ENC
+ vaesenc XDATA7, [KEYS7 + 16*13] ; 13. ENC
+
+ vaesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*14] ; 14. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*14] ; 14. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*14] ; 14. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*14] ; 14. ENC
+
+
+ VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*1]
+ VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*2]
+ VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*3]
+ VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*4]
+ VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*5]
+ VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*6]
+ VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext
+ mov TMP, [ARG + _aesarg_out + 8*7]
+ VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext
+
+ add IDX, 16
+ cmp [LEN_AREA], IDX
+ jne main_loop
+
+done:
+ ;; update IV
+ vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0
+ vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1
+ vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2
+ vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3
+ vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4
+ vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5
+ vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6
+ vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7
+
+ ;; update IN and OUT
+ vmovd xmm0, [LEN_AREA]
+ vpshufd xmm0, xmm0, 0x44
+ vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0]
+ vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1]
+ vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2]
+ vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3]
+ vmovdqa [ARG + _aesarg_in + 16*0], xmm1
+ vmovdqa [ARG + _aesarg_in + 16*1], xmm2
+ vmovdqa [ARG + _aesarg_in + 16*2], xmm3
+ vmovdqa [ARG + _aesarg_in + 16*3], xmm4
+ vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0]
+ vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1]
+ vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2]
+ vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3]
+ vmovdqa [ARG + _aesarg_out + 16*0], xmm5
+ vmovdqa [ARG + _aesarg_out + 16*1], xmm6
+ vmovdqa [ARG + _aesarg_out + 16*2], xmm7
+ vmovdqa [ARG + _aesarg_out + 16*3], xmm8
+
+;; XMMs are saved at a higher level
+ mov rbp, [GPR_SAVE_AREA + 8*0]
+
+ add rsp, STACK_size
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm
new file mode 100644
index 000000000..34d03bb99
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm
@@ -0,0 +1,165 @@
+;;
+;; Copyright (c) 2018-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/memcpy.asm"
+%include "include/clear_regs.asm"
+
+;;; Routine to do 128 bit CFB AES encrypt/decrypt operations on one block only.
+;;; It processes only one buffer at a time.
+;;; It is designed to manage partial blocks of DOCSIS 3.1 SEC BPI
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX R9 R10 R11
+;; Windows preserves: RBX RCX RDX RBP RSI RDI R8 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX R9 R10
+;; Linux preserves: RBX RCX RDX RBP RSI RDI R8 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;;
+;; Linux/Windows clobbers: xmm0, xmm1
+;;
+
+;; Argument locations.
+;; LINUX branch: arguments 1-5 arrive in rdi, rsi, rdx, rcx, r8.
+;; Other branch: arguments 1-4 arrive in rcx, rdx, r8, r9 and argument 5 is
+;; read from the stack. The [rsp + 5*8] operand is relative to rsp at function
+;; entry (return address + 4 register home slots below it), so it must be
+;; read before rsp is modified.
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define arg5 r8
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define arg5 [rsp + 5*8]
+%endif
+
+;; Symbolic names for the function arguments.
+%define OUT arg1
+%define IN arg2
+%define IV arg3
+%define KEYS arg4
+%ifdef LINUX
+%define LEN arg5
+%else
+;; LEN2 is the stack operand; the function copies it into r11 (LEN) first.
+%define LEN2 arg5
+%define LEN r11
+%endif
+
+;; General purpose temporaries handed to simd_store_avx.
+%define TMP0 rax
+%define TMP1 r10
+
+;; XDATA: IV block while it is being encrypted, then the output block.
+;; XIN : input text loaded from IN.
+%define XDATA xmm0
+%define XIN xmm1
+
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cfb_128_one(void *out, void *in, void *iv, void *keys, uint64_t len)
+;; arg 1: OUT : destination address for the clear/cipher text
+;; arg 2: IN : source address of the cipher/clear text
+;; arg 3: IV : initialization vector
+;; arg 4: KEYS: pointer to expanded keys structure (16 byte aligned)
+;; arg 5: LEN : number of bytes to encrypt/decrypt (valid range is 0 to 16)
+;;
+;; Single block AES CFB128 encrypt/decrypt:
+;; OUT = IN xor AES-128-encrypt(IV)
+;; The IV is not updated; the only result is the data written to OUT.
+;;
+;; It is primarily designed to process the partial block of
+;; DOCSIS 3.1 AES Packet PDU Encryption (I.10)
+;;
+;; It processes up to one block only (up to 16 bytes).
+;;
+;; It makes sure not to read more than LEN bytes from IN and
+;; not to store more than LEN bytes to OUT.
+;;
+;; The same code is exported under the avx, avx2 and avx512 symbol names.
+MKGLOBAL(aes_cfb_128_one_avx,function,)
+MKGLOBAL(aes_cfb_128_one_avx2,function,)
+MKGLOBAL(aes_cfb_128_one_avx512,function,)
+align 32
+aes_cfb_128_one_avx:
+aes_cfb_128_one_avx2:
+aes_cfb_128_one_avx512:
+%ifndef LINUX
+ ;; 5th argument lives on the stack: fetch it into a register first
+ mov LEN, LEN2
+%endif
+%ifdef SAFE_PARAM
+ ;; IV and KEYS are always dereferenced, reject NULL for both
+ test IV, IV
+ jz exit_cfb
+
+ test KEYS, KEYS
+ jz exit_cfb
+
+ ;; OUT and IN are only validated when there is data to process
+ test LEN, LEN
+ jz skip_in_out_check
+
+ test OUT, OUT
+ jz exit_cfb
+
+ test IN, IN
+ jz exit_cfb
+
+skip_in_out_check:
+%endif
+ ;; XIN = LEN bytes of input text
+ simd_load_avx_16 XIN, IN, LEN
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; XDATA = AES-128 encryption of the IV (or next to last block)
+ vmovdqu XDATA, [IV]
+ vpxor XDATA, XDATA, [KEYS + 16*0] ; round 0: add round key
+%assign cfb_round 1
+%rep 9
+ vaesenc XDATA, XDATA, [KEYS + 16*cfb_round] ; rounds 1 to 9
+%assign cfb_round (cfb_round + 1)
+%endrep
+%undef cfb_round
+ vaesenclast XDATA, XDATA, [KEYS + 16*10] ; round 10: final round
+
+ ;; plaintext/ciphertext XOR block cipher encryption
+ vpxor XDATA, XDATA, XIN
+
+ ;; write LEN bytes of the result to OUT
+ simd_store_avx OUT, XDATA, LEN, TMP0, TMP1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifdef SAFE_DATA
+ ;; XDATA and XIN are the only scratch SIMD registers used
+ clear_xmms_avx XDATA, XIN
+ clear_scratch_gps_asm
+%endif
+exit_cfb:
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm
new file mode 100644
index 000000000..d71bd8c46
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm
@@ -0,0 +1,654 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES ECB encrypt/decrypt on 16n bytes doing AES by 4
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_ecb_x_y_avx(void *in,
+; UINT128 keys[],
+; void *out,
+; UINT64 len_bytes);
+;
+; x = direction (enc/dec)
+; y = key size (128/192/256)
+; arg 1: IN: pointer to input (plain/cipher text)
+; arg 2: KEYS: pointer to keys
+; arg 3: OUT: pointer to output (cipher/plain text)
+; arg 4: LEN: length in bytes (multiple of 16)
+;
+
+%include "include/os.asm"
+
+;; Default names of the six exported entry points. They are only defined
+;; here when AES_ECB_ENC_128 has not already been defined, so the names can
+;; be overridden before this point.
+%ifndef AES_ECB_ENC_128
+%define AES_ECB_ENC_128 aes_ecb_enc_128_avx
+%define AES_ECB_ENC_192 aes_ecb_enc_192_avx
+%define AES_ECB_ENC_256 aes_ecb_enc_256_avx
+%define AES_ECB_DEC_128 aes_ecb_dec_128_avx
+%define AES_ECB_DEC_192 aes_ecb_dec_192_avx
+%define AES_ECB_DEC_256 aes_ecb_dec_256_avx
+%endif
+
+;; Function arguments (in, keys, out, len_bytes) in the first four
+;; integer argument registers of each supported ABI.
+%ifdef LINUX
+%define IN rdi
+%define KEYS rsi
+%define OUT rdx
+%define LEN rcx
+%else
+%define IN rcx
+%define KEYS rdx
+%define OUT r8
+%define LEN r9
+%endif
+
+;; IDX: byte offset of the next block to process.
+;; TMP shares rax with IDX; it is only used for the initial block-count
+;; dispatch, before IDX receives its first value.
+%define IDX rax
+%define TMP IDX
+;; Up to four data blocks processed in parallel.
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+;; Round keys 0, 2, 4 and 6: loaded once by the initial section and reused
+;; by the main loop.
+%define XKEY0 xmm4
+%define XKEY2 xmm5
+%define XKEY4 xmm6
+%define XKEY6 xmm7
+;; Round key 10: only loaded by the 1-block and 2-block initial paths.
+%define XKEY10 xmm8
+;; Scratch registers for the remaining round keys, reloaded as needed.
+%define XKEY_A xmm9
+%define XKEY_B xmm10
+
+section .text
+
+;; AES_ECB nrounds, dir
+;;
+;; Body of an AES ECB function, processing up to 4 blocks in parallel.
+;; %1 : number of AES rounds (10, 12 or 14)
+;; %2 : ENC selects vaesenc/vaesenclast, anything else selects
+;; vaesdec/vaesdeclast
+;;
+;; Inputs (see register defines above): IN, KEYS, OUT, LEN.
+;; LEN is expected to be a multiple of 16; a LEN of zero returns at once.
+;;
+;; Flow:
+;; 1. An initial section handles (LEN / 16) mod 4 blocks (4 blocks when
+;; that remainder is zero), caches round keys 0, 2, 4 and 6 in
+;; XKEY0/XKEY2/XKEY4/XKEY6 and sets IDX to the bytes consumed so far.
+;; 2. The main loop then handles 4 blocks per iteration until IDX == LEN.
+;;
+;; Clobbers: rax (IDX/TMP), xmm0-xmm10, flags.
+;; The expansion ends with a ret, so it must be the last thing in the
+;; function that uses it.
+%macro AES_ECB 2
+%define %%NROUNDS %1 ; [in] Number of AES rounds, numerical value
+%define %%DIR %2 ; [in] Direction (encrypt/decrypt)
+
+%ifidn %%DIR, ENC
+%define AES vaesenc
+%define AES_LAST vaesenclast
+%else ; DIR = DEC
+%define AES vaesdec
+%define AES_LAST vaesdeclast
+%endif
+ ;; Nothing to do for an empty buffer. Without this check a zero
+ ;; length would take the %%initial_4 path and process 64 bytes,
+ ;; then enter the main loop whose IDX == LEN exit would not match.
+ test LEN, LEN
+ jz %%done
+
+ ;; dispatch on the number of blocks modulo 4
+ mov TMP, LEN
+ and TMP, 3*16
+ jz %%initial_4
+ cmp TMP, 2*16
+ jb %%initial_1
+ ja %%initial_3
+
+ ;; fall through: two leading blocks
+%%initial_2:
+ ; load plain/cipher text
+ vmovdqu XDATA0, [IN + 0*16]
+ vmovdqu XDATA1, [IN + 1*16]
+
+ vmovdqa XKEY0, [KEYS + 0*16]
+
+ vpxor XDATA0, XKEY0 ; 0. ARK
+ vpxor XDATA1, XKEY0
+
+ vmovdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, [KEYS + 1*16] ; 1. ENC
+ AES XDATA1, [KEYS + 1*16]
+
+ mov IDX, 2*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+
+ vmovdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, [KEYS + 3*16] ; 3. ENC
+ AES XDATA1, [KEYS + 3*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+
+ vmovdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, [KEYS + 5*16] ; 5. ENC
+ AES XDATA1, [KEYS + 5*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+
+ vmovdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, [KEYS + 7*16] ; 7. ENC
+ AES XDATA1, [KEYS + 7*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+
+ vmovdqa XKEY10, [KEYS + 10*16]
+
+ AES XDATA0, [KEYS + 9*16] ; 9. ENC
+ AES XDATA1, [KEYS + 9*16]
+
+%if %%NROUNDS >= 12
+ AES XDATA0, XKEY10 ; 10. ENC
+ AES XDATA1, XKEY10
+
+ AES XDATA0, [KEYS + 11*16] ; 11. ENC
+ AES XDATA1, [KEYS + 11*16]
+%endif
+
+%if %%NROUNDS == 14
+ AES XDATA0, [KEYS + 12*16] ; 12. ENC
+ AES XDATA1, [KEYS + 12*16]
+
+ AES XDATA0, [KEYS + 13*16] ; 13. ENC
+ AES XDATA1, [KEYS + 13*16]
+%endif
+
+%if %%NROUNDS == 10
+ AES_LAST XDATA0, XKEY10 ; 10. ENC
+ AES_LAST XDATA1, XKEY10
+%elif %%NROUNDS == 12
+ AES_LAST XDATA0, [KEYS + 12*16] ; 12. ENC
+ AES_LAST XDATA1, [KEYS + 12*16]
+%else
+ AES_LAST XDATA0, [KEYS + 14*16] ; 14. ENC
+ AES_LAST XDATA1, [KEYS + 14*16]
+%endif
+ vmovdqu [OUT + 0*16], XDATA0
+ vmovdqu [OUT + 1*16], XDATA1
+
+ cmp LEN, 2*16
+ je %%done
+ jmp %%main_loop
+
+
+ align 16
+ ;; one leading block
+%%initial_1:
+ ; load plain/cipher text
+ vmovdqu XDATA0, [IN + 0*16]
+
+ vmovdqa XKEY0, [KEYS + 0*16]
+
+ vpxor XDATA0, XKEY0 ; 0. ARK
+
+ vmovdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, [KEYS + 1*16] ; 1. ENC
+
+ mov IDX, 1*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+
+ vmovdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, [KEYS + 3*16] ; 3. ENC
+
+ AES XDATA0, XKEY4 ; 4. ENC
+
+ vmovdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, [KEYS + 5*16] ; 5. ENC
+
+ AES XDATA0, XKEY6 ; 6. ENC
+
+ vmovdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, [KEYS + 7*16] ; 7. ENC
+
+ AES XDATA0, XKEY_B ; 8. ENC
+
+ vmovdqa XKEY10, [KEYS + 10*16]
+
+ AES XDATA0, [KEYS + 9*16] ; 9. ENC
+
+%if %%NROUNDS >= 12
+ AES XDATA0, XKEY10 ; 10. ENC
+
+ AES XDATA0, [KEYS + 11*16] ; 11. ENC
+%endif
+
+%if %%NROUNDS == 14
+ AES XDATA0, [KEYS + 12*16] ; 12. ENC
+
+ AES XDATA0, [KEYS + 13*16] ; 13. ENC
+%endif
+
+%if %%NROUNDS == 10
+
+ AES_LAST XDATA0, XKEY10 ; 10. ENC
+%elif %%NROUNDS == 12
+ AES_LAST XDATA0, [KEYS + 12*16] ; 12. ENC
+%else
+ AES_LAST XDATA0, [KEYS + 14*16] ; 14. ENC
+%endif
+
+ vmovdqu [OUT + 0*16], XDATA0
+
+ cmp LEN, 1*16
+ je %%done
+ jmp %%main_loop
+
+
+ ;; three leading blocks
+%%initial_3:
+ ; load plain/cipher text
+ vmovdqu XDATA0, [IN + 0*16]
+ vmovdqu XDATA1, [IN + 1*16]
+ vmovdqu XDATA2, [IN + 2*16]
+
+ vmovdqa XKEY0, [KEYS + 0*16]
+
+ vmovdqa XKEY_A, [KEYS + 1*16]
+
+ vpxor XDATA0, XKEY0 ; 0. ARK
+ vpxor XDATA1, XKEY0
+ vpxor XDATA2, XKEY0
+
+ vmovdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, XKEY_A ; 1. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 3*16]
+ mov IDX, 3*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+ AES XDATA2, XKEY2
+
+ vmovdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, XKEY_A ; 3. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 5*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+ AES XDATA2, XKEY4
+
+ vmovdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, XKEY_A ; 5. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 7*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+ AES XDATA2, XKEY6
+
+ vmovdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, XKEY_A ; 7. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 9*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 10*16]
+
+ AES XDATA0, XKEY_A ; 9. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+%if %%NROUNDS >= 12
+ vmovdqa XKEY_A, [KEYS + 11*16]
+
+ AES XDATA0, XKEY_B ; 10. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 12*16]
+
+ AES XDATA0, XKEY_A ; 11. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+%endif
+
+%if %%NROUNDS == 14
+ vmovdqa XKEY_A, [KEYS + 13*16]
+
+ AES XDATA0, XKEY_B ; 12. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 14*16]
+
+ AES XDATA0, XKEY_A ; 13. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+%endif
+
+ ;; XKEY_B holds the last round key for the selected key size here
+ AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size)
+ AES_LAST XDATA1, XKEY_B
+ AES_LAST XDATA2, XKEY_B
+
+ vmovdqu [OUT + 0*16], XDATA0
+ vmovdqu [OUT + 1*16], XDATA1
+ vmovdqu [OUT + 2*16], XDATA2
+
+ cmp LEN, 3*16
+ je %%done
+ jmp %%main_loop
+
+
+ align 16
+ ;; block count is a non-zero multiple of four: four leading blocks
+%%initial_4:
+ ; load plain/cipher text
+ vmovdqu XDATA0, [IN + 0*16]
+ vmovdqu XDATA1, [IN + 1*16]
+ vmovdqu XDATA2, [IN + 2*16]
+ vmovdqu XDATA3, [IN + 3*16]
+
+ vmovdqa XKEY0, [KEYS + 0*16]
+
+ vmovdqa XKEY_A, [KEYS + 1*16]
+
+ vpxor XDATA0, XKEY0 ; 0. ARK
+ vpxor XDATA1, XKEY0
+ vpxor XDATA2, XKEY0
+ vpxor XDATA3, XKEY0
+
+ vmovdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, XKEY_A ; 1. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 3*16]
+
+ mov IDX, 4*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+ AES XDATA2, XKEY2
+ AES XDATA3, XKEY2
+
+ vmovdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, XKEY_A ; 3. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 5*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+ AES XDATA2, XKEY4
+ AES XDATA3, XKEY4
+
+ vmovdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, XKEY_A ; 5. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 7*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+ AES XDATA2, XKEY6
+ AES XDATA3, XKEY6
+
+ vmovdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, XKEY_A ; 7. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 9*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 10*16]
+
+ AES XDATA0, XKEY_A ; 9. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+%if %%NROUNDS >= 12
+ vmovdqa XKEY_A, [KEYS + 11*16]
+
+ AES XDATA0, XKEY_B ; 10. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 12*16]
+
+ AES XDATA0, XKEY_A ; 11. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+%if %%NROUNDS == 14
+ vmovdqa XKEY_A, [KEYS + 13*16]
+
+ AES XDATA0, XKEY_B ; 12. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 14*16]
+
+ AES XDATA0, XKEY_A ; 13. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+ ;; XKEY_B holds the last round key for the selected key size here
+ AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size)
+ AES_LAST XDATA1, XKEY_B
+ AES_LAST XDATA2, XKEY_B
+ AES_LAST XDATA3, XKEY_B
+
+ vmovdqu [OUT + 0*16], XDATA0
+ vmovdqu [OUT + 1*16], XDATA1
+ vmovdqu [OUT + 2*16], XDATA2
+ vmovdqu [OUT + 3*16], XDATA3
+
+ cmp LEN, 4*16
+ jz %%done
+ jmp %%main_loop
+
+ align 16
+ ;; Four blocks per iteration. On entry IDX is the number of bytes
+ ;; already processed and XKEY0/XKEY2/XKEY4/XKEY6 hold round keys
+ ;; 0, 2, 4 and 6 loaded by the initial section.
+%%main_loop:
+ ; load plain/cipher text
+ vmovdqu XDATA0, [IN + IDX + 0*16]
+ vmovdqu XDATA1, [IN + IDX + 1*16]
+ vmovdqu XDATA2, [IN + IDX + 2*16]
+ vmovdqu XDATA3, [IN + IDX + 3*16]
+
+ vmovdqa XKEY_A, [KEYS + 1*16]
+
+ vpxor XDATA0, XKEY0 ; 0. ARK
+ vpxor XDATA1, XKEY0
+ vpxor XDATA2, XKEY0
+ vpxor XDATA3, XKEY0
+
+ ;; IDX is advanced early; the stores below compensate with - 4*16
+ add IDX, 4*16
+
+ AES XDATA0, XKEY_A ; 1. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 3*16]
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+ AES XDATA2, XKEY2
+ AES XDATA3, XKEY2
+
+ AES XDATA0, XKEY_A ; 3. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 5*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+ AES XDATA2, XKEY4
+ AES XDATA3, XKEY4
+
+ AES XDATA0, XKEY_A ; 5. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 7*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+ AES XDATA2, XKEY6
+ AES XDATA3, XKEY6
+
+ vmovdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, XKEY_A ; 7. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ vmovdqa XKEY_A, [KEYS + 9*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 10*16]
+
+ AES XDATA0, XKEY_A ; 9. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+%if %%NROUNDS >= 12
+ vmovdqa XKEY_A, [KEYS + 11*16]
+
+ AES XDATA0, XKEY_B ; 10. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 12*16]
+
+ AES XDATA0, XKEY_A ; 11. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+%if %%NROUNDS == 14
+ vmovdqa XKEY_A, [KEYS + 13*16]
+
+ AES XDATA0, XKEY_B ; 12. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ vmovdqa XKEY_B, [KEYS + 14*16]
+
+ AES XDATA0, XKEY_A ; 13. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+ ;; XKEY_B holds the last round key for the selected key size here
+ AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size)
+ AES_LAST XDATA1, XKEY_B
+ AES_LAST XDATA2, XKEY_B
+ AES_LAST XDATA3, XKEY_B
+
+ vmovdqu [OUT + IDX + 0*16 - 4*16], XDATA0
+ vmovdqu [OUT + IDX + 1*16 - 4*16], XDATA1
+ vmovdqu [OUT + IDX + 2*16 - 4*16], XDATA2
+ vmovdqu [OUT + IDX + 3*16 - 4*16], XDATA3
+
+ cmp IDX, LEN
+ jne %%main_loop
+
+%%done:
+
+ ret
+
+%endmacro
+
+;; AES_ECB_ENTRY symbol, nrounds, dir
+;;
+;; Emits one 16-byte aligned entry point named <symbol>, exported through
+;; MKGLOBAL with "internal" scope, whose body is the AES_ECB macro expanded
+;; for <nrounds> AES rounds in direction <dir> (ENC or DEC).
+%macro AES_ECB_ENTRY 3
+align 16
+MKGLOBAL(%1,function,internal)
+%1:
+
+ AES_ECB %2, %3
+
+%endmacro
+
+;; encrypt: 128, 192 and 256 bit keys (10, 12 and 14 rounds)
+AES_ECB_ENTRY AES_ECB_ENC_128, 10, ENC
+AES_ECB_ENTRY AES_ECB_ENC_192, 12, ENC
+AES_ECB_ENTRY AES_ECB_ENC_256, 14, ENC
+
+;; decrypt: 128, 192 and 256 bit keys (10, 12 and 14 rounds)
+AES_ECB_ENTRY AES_ECB_DEC_128, 10, DEC
+AES_ECB_ENTRY AES_ECB_DEC_192, 12, DEC
+AES_ECB_ENTRY AES_ECB_DEC_256, 14, DEC
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm
new file mode 100644
index 000000000..615e19050
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm
@@ -0,0 +1,418 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do 128 bit AES XCBC
+
+;; clobbers all registers except for ARG1 and rbp
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+;; VPXOR2 dst, src
+;; Two-operand XOR helper: dst = dst XOR src
+%macro VPXOR2 2
+ vpxor %1, %1, %2
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_XCBC_ARGS_x8 {
+;; void* in[8];
+;; UINT128* keys[8];
+;; UINT128 ICV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_xcbc_mac_128_x8(AES_XCBC_ARGS_x8 *args, UINT64 len);
+;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure
+;; arg 2: LEN : len (in units of bytes)
+
+;; Local stack frame of aes_xcbc_mac_128_x8: two quadwords (16 bytes).
+struc STACK
+_gpr_save: resq 1 ; slot where rbp is saved on entry
+_len: resq 1 ; slot where the LEN argument is kept
+endstruc
+
+;; Addresses of the STACK fields, relative to rsp after the frame is allocated.
+%define GPR_SAVE_AREA rsp + _gpr_save
+%define LEN_AREA rsp + _len
+
+;; ARG/LEN are the two function arguments; REG3/REG4 are two further
+;; general purpose registers, chosen per ABI, used as key pointers below.
+%ifdef LINUX
+%define ARG rdi
+%define LEN rsi
+%define REG3 rcx
+%define REG4 rdx
+%else
+%define ARG rcx
+%define LEN rdx
+%define REG3 rsi
+%define REG4 rdi
+%endif
+
+;; IDX: byte offset of the current block within every input buffer.
+;; TMP: scratch pointer, used for the odd-numbered input buffers.
+%define IDX rax
+%define TMP rbx
+
+;; Expanded key pointers, one per lane.
+;; KEYS2 lives in rbp, which is why rbp is saved to GPR_SAVE_AREA on entry.
+%define KEYS0 REG3
+%define KEYS1 REG4
+%define KEYS2 rbp
+%define KEYS3 r8
+%define KEYS4 r9
+%define KEYS5 r10
+%define KEYS6 r11
+%define KEYS7 r12
+
+;; Input pointers of the even-numbered lanes are kept in registers.
+;; IN6 reuses the LEN register: LEN is stored to LEN_AREA before IN6 is loaded.
+%define IN0 r13
+%define IN2 r14
+%define IN4 r15
+%define IN6 LEN
+
+;; Running state of the eight lanes.
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XDATA4 xmm4
+%define XDATA5 xmm5
+%define XDATA6 xmm6
+%define XDATA7 xmm7
+
+;; XKEYn_r caches round key r of lane n in a register, so that one AES
+;; instruction per round takes a register operand instead of a memory operand.
+%define XKEY0_3 xmm8
+%define XKEY1_4 xmm9
+%define XKEY2_5 xmm10
+%define XKEY3_6 xmm11
+%define XKEY4_7 xmm12
+%define XKEY5_8 xmm13
+%define XKEY6_9 xmm14
+%define XTMP xmm15
+
+section .text
+MKGLOBAL(aes_xcbc_mac_128_x8,function,internal)
+aes_xcbc_mac_128_x8:
+
+ sub rsp, STACK_size
+ mov [GPR_SAVE_AREA + 8*0], rbp
+
+ mov IDX, 16
+ mov [LEN_AREA], LEN
+
+ mov IN0, [ARG + _aesxcbcarg_in + 8*0]
+ mov IN2, [ARG + _aesxcbcarg_in + 8*2]
+ mov IN4, [ARG + _aesxcbcarg_in + 8*4]
+ mov IN6, [ARG + _aesxcbcarg_in + 8*6]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ mov TMP, [ARG + _aesxcbcarg_in + 8*1]
+ VMOVDQ XDATA0, [IN0] ; load first block of plain text
+ VMOVDQ XDATA1, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*3]
+ VMOVDQ XDATA2, [IN2] ; load first block of plain text
+ VMOVDQ XDATA3, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*5]
+ VMOVDQ XDATA4, [IN4] ; load first block of plain text
+ VMOVDQ XDATA5, [TMP] ; load first block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*7]
+ VMOVDQ XDATA6, [IN6] ; load first block of plain text
+ VMOVDQ XDATA7, [TMP] ; load first block of plain text
+
+
+ VPXOR2 XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV
+ VPXOR2 XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV
+ VPXOR2 XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV
+ VPXOR2 XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV
+ VPXOR2 XDATA4, [ARG + _aesxcbcarg_ICV + 16*4] ; plaintext XOR ICV
+ VPXOR2 XDATA5, [ARG + _aesxcbcarg_ICV + 16*5] ; plaintext XOR ICV
+ VPXOR2 XDATA6, [ARG + _aesxcbcarg_ICV + 16*6] ; plaintext XOR ICV
+ VPXOR2 XDATA7, [ARG + _aesxcbcarg_ICV + 16*7] ; plaintext XOR ICV
+
+ mov KEYS0, [ARG + _aesxcbcarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesxcbcarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesxcbcarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesxcbcarg_keys + 8*3]
+ mov KEYS4, [ARG + _aesxcbcarg_keys + 8*4]
+ mov KEYS5, [ARG + _aesxcbcarg_keys + 8*5]
+ mov KEYS6, [ARG + _aesxcbcarg_keys + 8*6]
+ mov KEYS7, [ARG + _aesxcbcarg_keys + 8*7]
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+ vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ cmp [LEN_AREA], IDX
+ je done
+
+main_loop:
+ mov TMP, [ARG + _aesxcbcarg_in + 8*1]
+ VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text
+ VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*3]
+ VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text
+ VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*5]
+ VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text
+ VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text
+ mov TMP, [ARG + _aesxcbcarg_in + 8*7]
+ VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text
+ VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text
+
+
+ VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK
+ VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK
+ VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK
+ VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK
+ VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK
+
+ vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+ vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC
+ vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC
+ vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC
+ vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+ vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC
+ vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC
+ vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC
+ vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC
+
+ vaesenc XDATA0, XKEY0_3 ; 3. ENC
+ vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC
+ vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC
+ vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC
+ vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC
+ vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC
+ vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC
+ vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ vaesenc XDATA1, XKEY1_4 ; 4. ENC
+ vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+ vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC
+ vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC
+ vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC
+ vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ vaesenc XDATA2, XKEY2_5 ; 5. ENC
+ vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+ vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC
+ vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC
+ vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC
+ vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC
+ vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC
+ vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC
+ vaesenc XDATA3, XKEY3_6 ; 6. ENC
+ vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC
+ vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC
+ vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC
+ vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+ vaesenc XDATA4, XKEY4_7 ; 7. ENC
+ vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC
+ vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC
+ vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+ vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC
+ vaesenc XDATA5, XKEY5_8 ; 8. ENC
+ vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC
+ vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC
+
+ vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC
+ vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC
+ vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC
+ vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC
+ vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC
+ vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC
+ vaesenc XDATA6, XKEY6_9 ; 9. ENC
+ vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC
+
+
+ vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+ vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC
+ vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC
+ vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC
+ vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC
+
+ add IDX, 16
+ cmp [LEN_AREA], IDX
+ jne main_loop
+
+done:
+ ;; update ICV
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*0], XDATA0
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*1], XDATA1
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*2], XDATA2
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*3], XDATA3
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*4], XDATA4
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*5], XDATA5
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*6], XDATA6
+ vmovdqa [ARG + _aesxcbcarg_ICV + 16*7], XDATA7
+
+ ;; update IN
+ vmovd xmm0, [LEN_AREA]
+ vpshufd xmm0, xmm0, 0x44
+ vpaddq xmm1, xmm0, [ARG + _aesxcbcarg_in + 16*0]
+ vpaddq xmm2, xmm0, [ARG + _aesxcbcarg_in + 16*1]
+ vpaddq xmm3, xmm0, [ARG + _aesxcbcarg_in + 16*2]
+ vpaddq xmm4, xmm0, [ARG + _aesxcbcarg_in + 16*3]
+ vmovdqa [ARG + _aesxcbcarg_in + 16*0], xmm1
+ vmovdqa [ARG + _aesxcbcarg_in + 16*1], xmm2
+ vmovdqa [ARG + _aesxcbcarg_in + 16*2], xmm3
+ vmovdqa [ARG + _aesxcbcarg_in + 16*3], xmm4
+
+;; XMMs are saved at a higher level
+ mov rbp, [GPR_SAVE_AREA + 8*0]
+
+ add rsp, STACK_size
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm
new file mode 100644
index 000000000..1bb601e4f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1 ; selects the _128 FN_NAME / NROUNDS variant inside the shared source included next
+%include "avx/gcm_avx_gen2.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm
new file mode 100644
index 000000000..4de59d5bf
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM192_MODE 1 ; selects the _192 FN_NAME / NROUNDS variant inside the shared source included next
+%include "avx/gcm_avx_gen2.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm
new file mode 100644
index 000000000..de8eadf4c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm
@@ -0,0 +1,30 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2018 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define GCM256_MODE 1 ; selects the _256 FN_NAME / NROUNDS variant inside the shared source included next
+%include "avx/gcm_avx_gen2.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm
new file mode 100644
index 000000000..2aa3a162d
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm
@@ -0,0 +1,2515 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; By the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any aadLen.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
+;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "include/clear_regs.asm"
+%include "include/gcm_defines.asm"
+%include "include/gcm_keys_sse_avx.asm"
+%include "include/memcpy.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx_gen2.asm!"
+%endif
+%endif
+%endif
+; Per-mode settings: FN_NAME(x,y) token-pastes to aes_gcm_<x>_<keybits><y>avx_gen2, NROUNDS is set per key size.
+%ifdef GCM128_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2
+%define NROUNDS 9 ; NOTE(review): presumably the AES rounds before the final one for a 128-bit key - confirm at use sites
+%endif
+
+%ifdef GCM192_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2
+%define NROUNDS 11 ; NOTE(review): presumably the AES rounds before the final one for a 192-bit key - confirm at use sites
+%endif
+
+%ifdef GCM256_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2
+%define NROUNDS 13 ; NOTE(review): presumably the AES rounds before the final one for a 256-bit key - confirm at use sites
+%endif
+
+default rel
+; stack space taken by the 4 registers (8 bytes each) that are pushed so they can be preserved
+%define STACK_OFFSET 8*4
+; Offsets of the 16-byte scratch slots below; NOTE(review): the base register they are added to is not visible here
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7 ; total size of the seven 16-byte slots TMP2..TMP8 above
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10 ; room for ten 16-byte XMM registers; presumably xmm6-xmm15 (callee-saved on Win64) - confirm at save site
+%else
+ %define XMM_STORAGE 0 ; no XMM save area on other output formats
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE ; NOTE: expands to an unparenthesised sum - wrap in () if used inside a larger expression
+
+section .text
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes (xmm): in = multiplicand A, out = reduced product
+%define %%HK %2 ; 16 Bytes (xmm): multiplicand B, only read
+%define %%T1 %3 ; xmm temp (clobbered): high 128 bits of the 256-bit product
+%define %%T2 %4 ; xmm temp (clobbered): Karatsuba middle term, then reduction scratch
+%define %%T3 %5 ; xmm temp (clobbered)
+%define %%T4 %6 ; xmm temp (clobbered): used by the reduction only
+%define %%T5 %7 ; xmm temp (clobbered): carries first-phase bits into the second phase
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba: three carry-less multiplies instead of four
+ vpshufd %%T2, %%GH, 01001110b ; %%T2 = %%GH with its 64-bit halves swapped
+ vpshufd %%T3, %%HK, 01001110b ; %%T3 = %%HK with its 64-bit halves swapped
+ vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0)
+ vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0)
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ vpxor %%T2, %%T2, %%GH
+ vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ vpslldq %%T3, %%T2, 8 ; %%T3 = %%T2 shifted left 2 DWs (part of the middle term that lands in the low 128 bits)
+ vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs (part of the middle term that lands in the high 128 bits)
+ vpxor %%GH, %%GH, %%T3
+ vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK
+
+ ;first phase of the reduction
+ vpslld %%T2, %%GH, 31 ; per-DW left shift << 31 (register view; the data is bit-reflected)
+ vpslld %%T3, %%GH, 30 ; per-DW left shift << 30
+ vpslld %%T4, %%GH, 25 ; per-DW left shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T5, %%T2, 4 ; %%T5 = %%T2 shifted right 1 DW (saved for the second phase)
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%GH,1 ; per-DW right shift >> 1 (register view; the data is bit-reflected)
+ vpsrld %%T3,%%GH,2 ; per-DW right shift >> 2
+ vpsrld %%T4,%%GH,7 ; per-DW right shift >> 7
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpxor %%T2, %%T2, %%T5 ; fold in the bits saved from the first phase
+ vpxor %%GH, %%GH, %%T2
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1 ; base address of the key data that receives HashKey_k and HashKey_2..8 / HashKey_2_k..8_k
+%define %%HK %2 ; xmm: hash key fed to every GHASH_MUL below (only read)
+%define %%T1 %3 ; xmm temp (clobbered)
+%define %%T2 %4 ; xmm temp (clobbered)
+%define %%T3 %5 ; xmm temp (clobbered)
+%define %%T4 %6 ; xmm temp (clobbered)
+%define %%T5 %7 ; xmm temp (clobbered): running power of the hash key
+%define %%T6 %8 ; xmm temp (clobbered)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
+ vmovdqa %%T5, %%HK ; start the running power at the hash key itself
+
+ vpshufd %%T1, %%T5, 01001110b ; %%T1 = %%T5 with its 64-bit halves swapped
+ vpxor %%T1, %%T5 ; each half of %%T1 = high half ^ low half
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2 ; address of the first input byte (only read)
+%define %%LENGTH %3 ; number of input bytes (only read)
+%define %%END_READ_LOCATION %4 ; This and the following arguments are temp registers (clobbered)
+%define %%COUNTER %5 ; temp: bytes still to be read one at a time
+%define %%TMP1 %6 ; temp: 64-bit accumulator for the byte-wise reads
+
+ vpxor %%OUTPUT, %%OUTPUT ; start from all zeros so unread bytes stay 0
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH ; one past the last input byte; bytes are read backwards from here
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2 ; fewer than 8 bytes (signed compare): read everything byte by byte
+ vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists
+ je %%_done ; vpinsrq leaves flags untouched, so this still tests the cmp above (exactly 8 bytes)
+
+ sub %%COUNTER, 8 ; bytes that remain for the upper quadword
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] ; presumably BYTE() names the low byte of the register - confirm in reg_sizes.asm
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1 ; accumulated tail bytes become the upper quadword
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0 ; also covers LENGTH == 0: leave with %%OUTPUT = 0 and no memory read
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0 ; accumulated bytes become the lower quadword
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 15
+%define %%A_IN %1 ; address of the AAD (copied to %%T1, not written)
+%define %%A_LEN %2 ; AAD length in bytes (copied to %%T2, not written)
+%define %%AAD_HASH %3 ; xmm: output hash, zeroed on entry to the macro
+%define %%GDATA_KEY %4 ; base address of the key data holding HashKey and HashKey_1..8
+%define %%XTMP0 %5 ; xmm temp reg 0
+%define %%XTMP1 %6 ; xmm temp reg 1
+%define %%XTMP2 %7 ; xmm temp reg 2
+%define %%XTMP3 %8 ; xmm temp reg 3
+%define %%XTMP4 %9 ; xmm temp reg 4
+%define %%XTMP5 %10 ; xmm temp reg 5
+%define %%T1 %11 ; temp reg 1
+%define %%T2 %12 ; temp reg 2
+%define %%T3 %13 ; temp reg 3
+%define %%T4 %14 ; temp reg 4
+%define %%T5 %15 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+%%_get_AAD_loop128:
+ cmp %%T2, 128
+ jl %%_exit_AAD_loop128 ; fewer than 128 bytes left (signed compare)
+
+ vmovdqu %%XTMP0, [%%T1 + 16*0]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vpxor %%XTMP0, %%AAD_HASH ; fold the running hash into the first block
+
+ vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
+ vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = a1*b1
+ vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = a0*b0
+ vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = a1*b0
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%XTMP4 = a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%XTMP3 = a1*b0 + a0*b1
+
+%assign i 1
+%assign j 7
+%rep 7
+ vmovdqu %%XTMP0, [%%T1 + 16*i]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = %%XTMP1 + a1*b1
+ vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = %%XTMP2 + a0*b0
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = %%XTMP3 + a1*b0 + a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+%assign i (i + 1)
+%assign j (j - 1)
+%endrep
+
+ vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
+ vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+ vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%XTMP1(high):%%XTMP2(low)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%XTMP5, [rel POLY2]
+ vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+ vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L %%XTMP0 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+ vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+ vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%AAD_HASH
+
+ sub %%T2, 128
+ je %%_CALC_AAD_done ; nothing left after this 128-byte chunk
+
+ add %%T1, 128
+ jmp %%_get_AAD_loop128
+
+%%_exit_AAD_loop128:
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block ; less than one full block left
+
+ ;; calculate hash_key position to start with
+ mov %%T3, %%T2
+ and %%T3, -16 ; 1 to 7 blocks possible here
+ neg %%T3
+ add %%T3, HashKey_1 + 16 ; %%T3 = (HashKey_1 + 16) - 16*blocks; NOTE(review): assumes the HashKey_n entries are 16 bytes apart with higher powers at lower offsets - confirm in gcm_keys_sse_avx.asm
+ lea %%T3, [%%GDATA_KEY + %%T3]
+
+ vmovdqu %%XTMP0, [%%T1]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vpxor %%XTMP0, %%AAD_HASH ; fold the running hash into the first block
+
+ vmovdqu %%XTMP5, [%%T3]
+ vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = a1*b1
+ vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = a0*b0
+ vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = a1*b0
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%XTMP4 = a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%XTMP3 = a1*b0 + a0*b1
+
+ add %%T3, 16 ; move to next hashkey
+ add %%T1, 16 ; move to next data block
+ sub %%T2, 16
+ cmp %%T2, 16
+ jl %%_AAD_reduce
+
+%%_AAD_blocks:
+ vmovdqu %%XTMP0, [%%T1]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vmovdqu %%XTMP5, [%%T3]
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = %%XTMP1 + a1*b1
+ vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = %%XTMP2 + a0*b0
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = %%XTMP3 + a1*b0 + a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+
+ add %%T3, 16 ; move to next hashkey
+ add %%T1, 16 ; move to next data block
+ sub %%T2, 16
+ cmp %%T2, 16
+ jl %%_AAD_reduce
+ jmp %%_AAD_blocks
+
+%%_AAD_reduce:
+ vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
+ vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+ vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%XTMP1(high):%%XTMP2(low)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%XTMP5, [rel POLY2]
+ vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+ vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L %%XTMP0 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+ vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+ vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%AAD_HASH
+
+ or %%T2, %%T2 ; sets ZF when no trailing bytes (fewer than 16) remain
+ je %%_CALC_AAD_done
+
+%%_get_small_AAD_block:
+ vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 ; %%XTMP1 = remaining %%T2 bytes at %%T1, zero padded
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 ; %%AAD_HASH = %%AAD_HASH * [HashKey], reduced
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; If the context holds a partial block (PBlockLen != 0), the new input is combined with the
+; saved PBlockEncKey, folded into the running hash and written out; otherwise the macro
+; falls straight through to its end label.
+; Requires the input data be at least 1 byte long.
+; Input:
+; GDATA_KEY - struct gcm_key_data *
+; GDATA_CTX - struct gcm_context_data *
+; CYPH_PLAIN_OUT - output buffer (bytes are written at CYPH_PLAIN_OUT + DATA_OFFSET)
+; PLAIN_CYPH_IN - input text
+; PLAIN_CYPH_LEN - input text length
+; DATA_OFFSET - the current data offset (advanced by the number of bytes written)
+; AAD_HASH - xmm register with the running hash; updated and stored to GDATA_CTX + AadHash
+; ENC_DEC - whether encoding or decoding
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+ mov r13, [%%GDATA_CTX + PBlockLen] ; r13 = bytes already held in the partial block
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If at least 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 ; xmm1 = short input; rax, r12, r15 passed as scratch
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ ; NOTE(review): the flags from this cmp are overwritten by the add below before any
+ ; conditional instruction reads them, so the compare has no effect.
+ cmp r13, rax
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1 ; keep a copy of the input ciphertext for the hash update
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 < 0 here: move the mask pointer forward by the shortfall
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1 ; apply the same mask to the saved ciphertext
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3 ; fold the ciphertext bytes into the running hash
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1 ; block still not full: defer the multiply
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax ; partial block consumed: PBlockLen = 0
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ ; NOTE(review): on win64 the length is staged through rax, presumably because
+ ; %%PLAIN_CYPH_LEN is a memory operand there - confirm against the caller's defines.
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], rax
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_dec_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 < 0 here: move the mask pointer forward by the shortfall
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9 ; fold the new ciphertext bytes into the running hash
+
+ cmp r15,0
+ jl %%_partial_incomplete_2 ; block still not full: defer the multiply
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax ; partial block consumed: PBlockLen = 0
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ ; NOTE(review): on win64 the length is staged through rax, presumably because
+ ; %%PLAIN_CYPH_LEN is a memory operand there - confirm against the caller's defines.
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], rax
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_encode_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ ; r13 = number of bytes to emit: 16 - PBlockLen when the block was completed (r15 >= 0),
+ ; otherwise the whole input length.
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9 ; rax = low 8 output bytes
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax ; more than 8 bytes: store the low qword at once
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9 ; rax = remaining high bytes
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ ; write the remaining r13 bytes one at a time, lowest byte of rax first
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; INITIAL_BLOCKS:
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; %%DATA_OFFSET is advanced by 16 for every block processed here.
+; If %%LENGTH >= 128, the first full group of 8 blocks is also encrypted/decrypted and left,
+; byte-swapped, in %%XMM1 - %%XMM8 (hash value already folded into %%XMM1) for later GHASH.
+; Updated AAD_HASH is returned in %%T3
+; NOTE(review): reg(n) is defined outside this macro; it presumably expands to xmm<n>, so the
+; working registers reg(9-n) .. reg(8) are expected to alias the tail of %%XMM1 - %%XMM8 - confirm.
+
+%macro INITIAL_BLOCKS 24
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %8
+%define %%HASH_KEY %9
+%define %%T3 %10
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+
+%assign i (8-%%num_initial_blocks)
+ vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+
+ ; build one byte-swapped counter block per initial block in reg(9-n) .. reg(8)
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+ ; AES round 0: xor with the first round key
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+ ; AES rounds 1 .. NROUNDS, one round key load per round shared by all blocks
+%assign j 1
+%rep NROUNDS
+ vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep ; NROUNDS
+
+
+ ; final AES round (j == NROUNDS+1 here)
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+ ; xor the key stream with the input, store the result and keep the ciphertext for GHASH
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1 ; when decrypting, the input (not the output) is the ciphertext to hash
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+ ; chain the hash: reg(j) = (reg(j) xor reg(j-1)) * HASH_KEY, starting from the saved AAD_HASH
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; fewer than 8 full blocks in %%LENGTH: skip the 8-block stage below
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Prepare 8 counter blocks and perform rounds of AES cipher on them, load plain/cipher text and
+; store cipher/plain text.
+; Keep 8 cipher text blocks for further GHASH computations (XMM1 - XMM8)
+; - combine current GHASH value into block 0 (XMM1)
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ ; AES round 0 for all 8 blocks
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+ vpxor %%XMM1, %%T_key
+ vpxor %%XMM2, %%T_key
+ vpxor %%XMM3, %%T_key
+ vpxor %%XMM4, %%T_key
+ vpxor %%XMM5, %%T_key
+ vpxor %%XMM6, %%T_key
+ vpxor %%XMM7, %%T_key
+ vpxor %%XMM8, %%T_key
+
+
+ ; AES rounds 1 .. NROUNDS for all 8 blocks
+%assign i 1
+%rep NROUNDS
+ vmovdqu %%T_key, [%%GDATA_KEY+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ ; final AES round (i == NROUNDS+1 here)
+ vmovdqu %%T_key, [%%GDATA_KEY+16*i]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ ; per block: load input, xor with key stream, store output; when decrypting keep the
+ ; input ciphertext in the block register for the later hash computation
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128 ; 8 blocks of 16 bytes consumed
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+; GHASH_8_ENCRYPT_8_PARALLEL:
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; The AES rounds for the new blocks are interleaved with the carry-less multiplies for the
+; previous 8 blocks (passed in %%XMM1 - %%XMM8; blocks 2-8 are spilled to [rsp + TMP2..TMP8]).
+; On exit %%XMM1 - %%XMM8 hold the new byte-swapped ciphertext blocks with the reduced hash
+; value xored into %%XMM1, and %%CTR holds the last counter value used.
+; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value (read only here; not advanced by this macro)
+; %%loop_idx selects the counter increment: in_order adds [ONE] and byte-swaps each block,
+; any other value adds [ONEf] without the swap.
+; NOTE(review): reg(j) in the final-round loop is defined outside this macro and is
+; presumably xmm<j>, i.e. expected to be the same registers as %%XMM1 - %%XMM8 - confirm.
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ ; save the previous 8 blocks: block 1 stays in %%T2, blocks 2-8 go to the stack
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+ ; generate the next 8 counter blocks
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; AES round 0
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%T1
+ vpxor %%XMM2, %%T1
+ vpxor %%XMM3, %%T1
+ vpxor %%XMM4, %%T1
+ vpxor %%XMM5, %%T1
+ vpxor %%XMM6, %%T1
+ vpxor %%XMM7, %%T1
+ vpxor %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; AES round 1
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ ; AES round 2
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; GHASH, previous block 1 (in %%T2) times HashKey_8.
+ ; Accumulators: %%T4 = high products, %%T7 = low products, %%T6 = middle products.
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+
+ vpshufd %%T6, %%T2, 01001110b ; swap the two qwords
+ vpxor %%T6, %%T2 ; %%T6 = a1 xor a0 in both qwords
+
+ ; NOTE(review): this multiply uses selector 0x00 while the remaining *_k multiplies below
+ ; use 0x10; presumably equivalent because both qwords of the stored *_k values are the
+ ; same - confirm against the hash key precompute.
+ vmovdqu %%T5, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%T6, %%T6, %%T5, 0x00 ;
+
+
+ ; AES round 3
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ; GHASH, previous block 2 times HashKey_7
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ; AES round 4
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; GHASH, previous block 3 times HashKey_6
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ; AES round 5
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ ; GHASH, previous block 4 times HashKey_5
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ; AES round 6
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ; GHASH, previous block 5 times HashKey_4
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+
+ ; AES round 7
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ; GHASH, previous block 6 times HashKey_3
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ; AES round 8
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ; GHASH, previous block 7 times HashKey_2
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ; AES round 9
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ ; GHASH, previous block 8 times HashKey
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ; Karatsuba middle term: %%T6 = %%T6 xor high xor low
+ vpxor %%T6, %%T4
+ vpxor %%T6, %%T7
+
+ ; remaining AES rounds depend on the key size; each branch leaves the key for the
+ ; final round in %%T5
+%ifdef GCM128_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+%endif
+%ifdef GCM192_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+%endif
+%ifdef GCM256_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+%endif
+
+ ; Final AES round fused with the data xor: the input block is xored into the last round
+ ; key (%%T5) first, so vaesenclast directly yields key stream xor input.
+%assign i 0
+%assign j 1
+%rep 8
+
+%ifidn %%ENC_DEC, ENC
+%ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+ vpxor %%T2, %%T2, %%T5
+%else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+%endif ; NT_LD
+ vaesenclast reg(j), reg(j), %%T2 ; reg(j) = ciphertext block (stored after the first reduction phase)
+%else
+ VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+ vpxor %%T2, %%T2, %%T5
+ vaesenclast %%T3, reg(j), %%T2 ; %%T3 = plaintext block
+ vpxor reg(j), %%T2, %%T5 ; undo the key xor: reg(j) = input ciphertext, kept for GHASH
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3
+%endif ; %%ENC_DEC
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+ vpslldq %%T3, %%T6, 8 ; %%T3 = %%T6 shifted left by 2 DWs
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T3
+ vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7
+
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed left shift of each DW: << 31
+ vpslld %%T3, %%T7, 30 ; packed left shift of each DW: << 30
+ vpslld %%T4, %%T7, 25 ; packed left shift of each DW: << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; %%T1 = %%T2 shifted right by 1 DW (used in the second phase)
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed right shift of each DW: >> 1
+ vpsrld %%T3,%%T7,2 ; packed right shift of each DW: >> 2
+ vpsrld %%T4,%%T7,7 ; packed right shift of each DW: >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK]
+ vpshufb %%XMM3, [SHUF_MASK]
+ vpshufb %%XMM4, [SHUF_MASK]
+ vpshufb %%XMM5, [SHUF_MASK]
+ vpshufb %%XMM6, [SHUF_MASK]
+ vpshufb %%XMM7, [SHUF_MASK]
+ vpshufb %%XMM8, [SHUF_MASK]
+
+
+ vpxor %%XMM1, %%T6 ; fold the reduced hash into the first new block for the next pass
+
+%endmacro
+
+
+; GHASH_LAST_8: GHASH the last 8 ciphertext blocks.
+; Block k (%%XMM1 .. %%XMM8) is multiplied by HashKey_8 .. HashKey respectively, the products
+; are accumulated and then reduced in two phases.
+; %%GDATA is GCM key data
+; %%T1 - %%T7 are xmm temporaries; %%XMM1 is overwritten (used as the middle-term accumulator)
+; Output: the reduced hash value in %%T6
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+ ;; Accumulators: %%T6 = high products, %%T7 = low products, %%XMM1 = middle products
+
+
+ ; block 1 * HashKey_8
+ vpshufd %%T2, %%XMM1, 01001110b ; swap the two qwords
+ vpxor %%T2, %%XMM1 ; %%T2 = a1 xor a0 in both qwords
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 ; high qwords
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 ; low qwords
+
+ vmovdqu %%T3, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 ; middle term; block 1 itself is no longer needed
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ; block 2 * HashKey_7
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpxor %%T2, %%XMM2
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ; block 3 * HashKey_6
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpxor %%T2, %%XMM3
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ; block 4 * HashKey_5
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpxor %%T2, %%XMM4
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ ; block 5 * HashKey_4
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpxor %%T2, %%XMM5
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ ; block 6 * HashKey_3
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpxor %%T2, %%XMM6
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ ; block 7 * HashKey_2
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpxor %%T2, %%XMM7
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ ; block 8 * HashKey
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpxor %%T2, %%XMM8
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ ; finish the middle term: %%T2 = middle xor high xor low
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ ; split the middle term across the high (%%T6) and low (%%T7) halves
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T4
+ vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed left shift of each DW: << 31
+ vpslld %%T3, %%T7, 30 ; packed left shift of each DW: << 30
+ vpslld %%T4, %%T7, 25 ; packed left shift of each DW: << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; %%T1 = %%T2 shifted right by 1 DW (used in the second phase)
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed right shift of each DW: >> 1
+ vpsrld %%T3,%%T7,2 ; packed right shift of each DW: >> 2
+ vpsrld %%T4,%%T7,7 ; packed right shift of each DW: >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+%endmacro
+
+
+; ENCRYPT_SINGLE_BLOCK: AES-encrypt one 16-byte block in place.
+; %1 - pointer to the GCM key data; keys are read from offsets 16*0 .. 16*(NROUNDS+1)
+; %2 - xmm register holding the block on entry and the encrypted block on exit
+; Uses the assembly-time counter i (left at NROUNDS+1).
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%KEYS %1
+%define %%BLOCK %2
+
+ ; round 0: xor with the first key
+ vpxor %%BLOCK, %%BLOCK, [%%KEYS + 16*0]
+
+ ; rounds 1 .. NROUNDS
+%assign i 1
+%rep NROUNDS
+ vaesenc %%BLOCK, %%BLOCK, [%%KEYS + 16*i]
+%assign i (i+1)
+%endrep ; NROUNDS
+
+ ; last round, i == NROUNDS+1
+ vaesenclast %%BLOCK, %%BLOCK, [%%KEYS + 16*i]
+%endmacro
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+ ;; Prologue used by the Update / GCM_ENC style entry points.
+ ;; Saves r12-r15, remembers the incoming rsp in r14, then carves out a
+ ;; 64-byte aligned frame of at least VARIABLE_OFFSET bytes.
+ ;; The number of pushes must equal STACK_OFFSET.
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp ; r14 = rsp after the pushes; FUNC_RESTORE reloads rsp from it
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, -64 ; round rsp down to a 64-byte boundary
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; Windows: xmm6 - xmm15 have to be preserved, spill them to the local frame
+ vmovdqu [rsp + LOCAL_STORAGE + 16*9], xmm15
+ vmovdqu [rsp + LOCAL_STORAGE + 16*8], xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 16*7], xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 16*6], xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 16*5], xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 16*4], xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 16*3], xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 16*2], xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 16*1], xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 16*0], xmm6
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+ ;; Epilogue matching FUNC_SAVE: optionally scrub scratch registers, reload the
+ ;; Windows callee-saved xmm registers, then unwind the frame and the pushes.
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_xmms_avx_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ ; reload xmm6 - xmm15 from the slots written by FUNC_SAVE
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 16*0]
+ vmovdqu xmm7, [rsp + LOCAL_STORAGE + 16*1]
+ vmovdqu xmm8, [rsp + LOCAL_STORAGE + 16*2]
+ vmovdqu xmm9, [rsp + LOCAL_STORAGE + 16*3]
+ vmovdqu xmm10, [rsp + LOCAL_STORAGE + 16*4]
+ vmovdqu xmm11, [rsp + LOCAL_STORAGE + 16*5]
+ vmovdqu xmm12, [rsp + LOCAL_STORAGE + 16*6]
+ vmovdqu xmm13, [rsp + LOCAL_STORAGE + 16*7]
+ vmovdqu xmm14, [rsp + LOCAL_STORAGE + 16*8]
+ vmovdqu xmm15, [rsp + LOCAL_STORAGE + 16*9]
+%endif
+
+ ;; Required for Update/GCM_ENC
+ mov rsp, r14 ; back to the stack pointer recorded right after the pushes
+ pop r15 ; pops mirror the push order in FUNC_SAVE
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX),
+; IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA_CTX: AadHash = hash of A_IN, AadLen = A_LEN, InLen = 0,
+; PBlockLen = 0, PBlockEncKey = 0, OrigIV = IV padded with the [ONEf] constant,
+; CurCount = byte-swapped OrigIV.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%IV %3
+%define %%A_IN %4
+%define %%A_LEN %5
+%define %%AAD_HASH xmm0
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ ; xmm2 and xmm3 were scratch registers of CALC_AAD_HASH and hold leftovers of the hash
+ ; computation. Zero xmm2 explicitly so that the value stored to PBlockEncKey below really
+ ; is 0 and no intermediate hash data ends up in the context.
+ vpxor xmm2, xmm2, xmm2
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
+ vpinsrq xmm2, [r10], 0 ; IV bytes 0-7 into the low qword
+ vpinsrd xmm2, [r10+8], 2 ; IV bytes 8-11 into dword 2; dword 3 keeps the ONEf padding
+ vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [rel SHUF_MASK] ; byte-swap the padded IV for use as the counter
+
+ vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX),
+; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + InLen], rax ; Update length of data processed
+%else
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ; Update length of data processed
+%endif
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA_CTX + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA_CTX + PBlockLen], r13 ; ctx_data.partial_blck_length = r13
+ ; handle the last <16 Byte block seperately
+
+ vpaddd xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm2
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm9
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX),
+; destination buffer for the tag (AUTH_TAG), requested tag length in bytes
+; (AUTH_TAG_LEN) and whether encoding or decoding (ENC_DEC).
+; Output: AUTH_TAG_LEN bytes of the authentication tag written to AUTH_TAG; the updated
+; hash is stored back to GDATA_CTX when a partial block was pending.
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9-xmm11, xmm13-xmm15
+; (xmm13 is loaded with HashKey here; xmm10 is handed to GHASH_MUL as an operand).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%PLAIN_CYPH_LEN rax
+
+        mov     r12, [%%GDATA_CTX + PBlockLen]
+        vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+        cmp     r12, 0
+
+        je      %%_partial_done                 ; no pending partial block to hash
+
+        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+        vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+        mov     r12, [%%GDATA_CTX + AadLen]     ; r12 = aadLen (number of bytes)
+        mov     %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+        shl     r12, 3                          ; convert into number of bits
+        ; Move the full 64-bit bit count: a 32-bit move would silently truncate
+        ; len(A) once the AAD reaches 2^29 bytes and corrupt the tag.
+        vmovq   xmm15, r12                      ; len(A) in xmm15
+
+        shl     %%PLAIN_CYPH_LEN, 3             ; len(C) in bits (*8)
+        vmovq   xmm1, %%PLAIN_CYPH_LEN
+        vpslldq xmm15, xmm15, 8                 ; xmm15 = len(A)|| 0x0000000000000000
+        vpxor   xmm15, xmm1                     ; xmm15 = len(A)||len(C)
+
+        vpxor   xmm14, xmm15
+        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+        vpshufb xmm14, [SHUF_MASK]              ; perform a 16Byte swap
+
+        vmovdqu xmm9, [%%GDATA_CTX + OrigIV]    ; xmm9 = Y0
+
+        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9  ; E(K, Y0)
+
+        vpxor   xmm9, xmm14                     ; xmm9 = tag = E(K, Y0) XOR GHASH
+
+
+%%_return_T:
+        mov     r10, %%AUTH_TAG                 ; r10 = authTag
+        mov     r11, %%AUTH_TAG_LEN             ; r11 = auth_tag_len
+
+        ; fast paths for 16/12/8-byte tags; any other length uses simd_store_avx
+        cmp     r11, 16
+        je      %%_T_16
+
+        cmp     r11, 12
+        je      %%_T_12
+
+        cmp     r11, 8
+        je      %%_T_8
+
+        simd_store_avx r10, xmm9, r11, r12, rax
+        jmp     %%_return_T_done
+%%_T_8:
+        vmovq   rax, xmm9
+        mov     [r10], rax
+        jmp     %%_return_T_done
+%%_T_12:
+        vmovq   rax, xmm9
+        mov     [r10], rax                      ; tag bytes 0..7
+        vpsrldq xmm9, xmm9, 8
+        vmovd   eax, xmm9
+        mov     [r10 + 8], eax                  ; tag bytes 8..11
+        jmp     %%_return_T_done
+%%_T_16:
+        vmovdqu [r10], xmm9
+
+%%_return_T_done:
+
+%ifdef SAFE_DATA
+        ;; Clear sensitive data from context structure
+        vpxor   xmm0, xmm0
+        vmovdqu [%%GDATA_CTX + AadHash], xmm0
+        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
+%endif
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx_gen2
+; (struct gcm_key_data *key_data);
+;
+; Derives the GHASH key material from the AES key schedule in key_data:
+; encrypts an all-zero block, byte-shuffles the result, computes
+; HashKey<<1 mod poly, stores it at [key_data + HashKey] and then expands
+; PRECOMPUTE on it.
+; NOTE(review): the exported symbol is built by FN_NAME; the "128" above is
+; presumably just the 128-bit instance of the name - confirm.
+; With SAFE_PARAM a NULL key_data returns immediately.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(precomp,_),function,)
+FN_NAME(precomp,_):
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ ;; (nothing has been pushed yet, so exit_precomp can return directly)
+ cmp arg1, 0
+ jz exit_precomp
+%endif
+
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp ; remember the unaligned stack pointer for the epilogue
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6 ; xmm6 = all-zero block
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ ; 128-bit left shift by one, built from two 64-bit shifts plus the carried bits
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, 1
+ vpsrlq xmm2, 63 ; xmm2 = top bit of each qword
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8 ; carry of the low qword moved into the high qword
+ vpsrldq xmm1, xmm1, 8 ; bit shifted out of the high qword
+ vpor xmm6, xmm2
+ ;reduction
+ ; the shifted-out bit selects (via compare against TWOONE) whether POLY is XORed in
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, [POLY]
+ vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ ; NOTE(review): presumably derives the remaining HashKey-based table entries
+ ; in key_data - confirm against the PRECOMPUTE macro.
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14 ; drop the aligned scratch area
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_xmms_avx_asm
+%endif
+exit_precomp:
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;
+; Initialises context_data for a new message by expanding GCM_INIT with the
+; key data, IV and AAD supplied by the caller.
+; With SAFE_PARAM the call is a no-op when key_data, context_data or iv is
+; NULL, or when aad is NULL while aad_len is non-zero.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(init,_),function,)
+FN_NAME(init,_):
+        push    r12
+        push    r13
+%ifidn __OUTPUT_FORMAT__, win64
+        push    r14
+        push    r15
+        mov     r14, rsp
+        ; xmm6:xmm15 are callee-saved on Windows; only xmm6 is preserved here.
+        ; The VEX-encoded move matches the rest of this AVX file and avoids
+        ; mixing legacy-SSE encodings with VEX code.
+        sub     rsp, 1*16
+        vmovdqu [rsp + 0*16], xmm6
+%endif
+
+%ifdef SAFE_PARAM
+        ;; Check key_data != NULL
+        cmp     arg1, 0
+        jz      exit_init
+
+        ;; Check context_data != NULL
+        cmp     arg2, 0
+        jz      exit_init
+
+        ;; Check IV != NULL
+        cmp     arg3, 0
+        jz      exit_init
+
+        ;; Check if aad_len == 0
+        cmp     arg5, 0
+        jz      skip_aad_check_init
+
+        ;; Check aad != NULL (aad_len != 0)
+        cmp     arg4, 0
+        jz      exit_init
+
+skip_aad_check_init:
+%endif
+        GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifdef SAFE_DATA
+        clear_scratch_gps_asm
+        clear_scratch_xmms_avx_asm
+%endif
+exit_init:
+        ; common epilogue: also reached from the SAFE_PARAM rejections above,
+        ; which branch here only after the prologue has completed
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm6, [rsp + 0*16]
+        mov     rsp, r14
+        pop     r15
+        pop     r14
+%endif
+        pop     r13
+        pop     r12
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;
+; Encrypts plaintext_len bytes from in to out by expanding GCM_ENC_DEC in ENC
+; mode on an already initialised context.
+; With SAFE_PARAM the call is a no-op when key_data or context_data is NULL,
+; or when in/out is NULL while plaintext_len is non-zero.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_update_),function,)
+FN_NAME(enc,_update_):
+
+ ; FUNC_SAVE runs before the parameter checks so that every exit path,
+ ; including the early ones, can share the FUNC_RESTORE at exit_update_enc.
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_enc
+
+ ;; Check if plaintext_len == 0
+ ;; (in/out are not validated for an empty update)
+ cmp arg5, 0
+ jz skip_in_out_check_update_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_enc
+
+skip_in_out_check_update_enc:
+%endif
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+exit_update_enc:
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;
+; Decrypts plaintext_len bytes from in to out by expanding GCM_ENC_DEC in DEC
+; mode on an already initialised context.
+; With SAFE_PARAM the call is a no-op when key_data or context_data is NULL,
+; or when in/out is NULL while plaintext_len is non-zero.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_update_),function,)
+FN_NAME(dec,_update_):
+
+ ; FUNC_SAVE runs before the parameter checks so that every exit path,
+ ; including the early ones, can share the FUNC_RESTORE at exit_update_dec.
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_dec
+
+ ;; Check if plaintext_len == 0
+ ;; (in/out are not validated for an empty update)
+ cmp arg5, 0
+ jz skip_in_out_check_update_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_dec
+
+skip_in_out_check_update_dec:
+%endif
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+exit_update_dec:
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; Completes an encryption stream: expands GCM_COMPLETE, which writes
+; auth_tag_len bytes of the authentication tag to auth_tag.
+; With SAFE_PARAM the call is a no-op when any pointer is NULL or when
+; auth_tag_len is 0 or greater than 16.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_finalize_),function,)
+FN_NAME(enc,_finalize_):
+
+%ifdef SAFE_PARAM
+        ;; Parameter checks run before anything is pushed, so exit_enc_fin
+        ;; can return directly.
+
+        ;; Check key_data != NULL
+        cmp     arg1, 0
+        jz      exit_enc_fin
+
+        ;; Check context_data != NULL
+        cmp     arg2, 0
+        jz      exit_enc_fin
+
+        ;; Check auth_tag != NULL
+        cmp     arg3, 0
+        jz      exit_enc_fin
+
+        ;; Check auth_tag_len == 0 or > 16
+        cmp     arg4, 0
+        jz      exit_enc_fin
+
+        cmp     arg4, 16
+        ja      exit_enc_fin
+%endif
+        push    r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ; xmm6:xmm15 are callee-saved on Windows. Preserve every one of them
+        ; that appears in the GCM_COMPLETE invocation below: xmm13 is loaded
+        ; with HashKey there and xmm10 is passed to GHASH_MUL, in addition to
+        ; xmm6, xmm9, xmm11, xmm14 and xmm15.
+        sub     rsp, 7*16
+        vmovdqu [rsp + 0*16], xmm6
+        vmovdqu [rsp + 1*16], xmm9
+        vmovdqu [rsp + 2*16], xmm10
+        vmovdqu [rsp + 3*16], xmm11
+        vmovdqu [rsp + 4*16], xmm13
+        vmovdqu [rsp + 5*16], xmm14
+        vmovdqu [rsp + 6*16], xmm15
+%endif
+        GCM_COMPLETE arg1, arg2, arg3, arg4, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm15, [rsp + 6*16]
+        vmovdqu xmm14, [rsp + 5*16]
+        vmovdqu xmm13, [rsp + 4*16]
+        vmovdqu xmm11, [rsp + 3*16]
+        vmovdqu xmm10, [rsp + 2*16]
+        vmovdqu xmm9, [rsp + 1*16]
+        vmovdqu xmm6, [rsp + 0*16]
+        add     rsp, 7*16
+%endif
+
+        pop     r12
+
+%ifdef SAFE_DATA
+        clear_scratch_gps_asm
+        clear_scratch_xmms_avx_asm
+%endif
+exit_enc_fin:
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; Completes a decryption stream: expands GCM_COMPLETE, which writes
+; auth_tag_len bytes of the computed authentication tag to auth_tag.
+; With SAFE_PARAM the call is a no-op when any pointer is NULL or when
+; auth_tag_len is 0 or greater than 16.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_finalize_),function,)
+FN_NAME(dec,_finalize_):
+
+%ifdef SAFE_PARAM
+        ;; Parameter checks run before anything is pushed, so exit_dec_fin
+        ;; can return directly.
+
+        ;; Check key_data != NULL
+        cmp     arg1, 0
+        jz      exit_dec_fin
+
+        ;; Check context_data != NULL
+        cmp     arg2, 0
+        jz      exit_dec_fin
+
+        ;; Check auth_tag != NULL
+        cmp     arg3, 0
+        jz      exit_dec_fin
+
+        ;; Check auth_tag_len == 0 or > 16
+        cmp     arg4, 0
+        jz      exit_dec_fin
+
+        cmp     arg4, 16
+        ja      exit_dec_fin
+%endif
+
+        push    r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+        ; xmm6:xmm15 are callee-saved on Windows. Preserve every one of them
+        ; that appears in the GCM_COMPLETE invocation below: xmm13 is loaded
+        ; with HashKey there and xmm10 is passed to GHASH_MUL, in addition to
+        ; xmm6, xmm9, xmm11, xmm14 and xmm15.
+        sub     rsp, 7*16
+        vmovdqu [rsp + 0*16], xmm6
+        vmovdqu [rsp + 1*16], xmm9
+        vmovdqu [rsp + 2*16], xmm10
+        vmovdqu [rsp + 3*16], xmm11
+        vmovdqu [rsp + 4*16], xmm13
+        vmovdqu [rsp + 5*16], xmm14
+        vmovdqu [rsp + 6*16], xmm15
+%endif
+        GCM_COMPLETE arg1, arg2, arg3, arg4, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+        vmovdqu xmm15, [rsp + 6*16]
+        vmovdqu xmm14, [rsp + 5*16]
+        vmovdqu xmm13, [rsp + 4*16]
+        vmovdqu xmm11, [rsp + 3*16]
+        vmovdqu xmm10, [rsp + 2*16]
+        vmovdqu xmm9, [rsp + 1*16]
+        vmovdqu xmm6, [rsp + 0*16]
+        add     rsp, 7*16
+%endif
+
+        pop     r12
+
+%ifdef SAFE_DATA
+        clear_scratch_gps_asm
+        clear_scratch_xmms_avx_asm
+%endif
+exit_dec_fin:
+        ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; One-shot encryption: expands GCM_INIT, GCM_ENC_DEC (ENC) and GCM_COMPLETE
+; back to back on the caller's context.
+; With SAFE_PARAM the call is a no-op when key_data, context_data, iv or
+; auth_tag is NULL, when auth_tag_len is 0 or greater than 16, when in/out is
+; NULL with a non-zero plaintext_len, or when aad is NULL with a non-zero
+; aad_len.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_),function,)
+FN_NAME(enc,_):
+
+ ; FUNC_SAVE runs first so that all exits share the FUNC_RESTORE at exit_enc
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_enc
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_enc
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_enc
+
+ cmp arg10, 16
+ ja exit_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_enc
+
+skip_in_out_check_enc:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_enc
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_enc
+
+skip_aad_check_enc:
+%endif
+ ; context setup from IV and AAD
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ ; bulk encryption of the payload
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ ; tag generation
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
+
+exit_enc:
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; One-shot decryption: expands GCM_INIT, GCM_ENC_DEC (DEC) and GCM_COMPLETE
+; back to back on the caller's context. The computed tag is written to
+; auth_tag; no tag comparison is performed in this function.
+; With SAFE_PARAM the call is a no-op when key_data, context_data, iv or
+; auth_tag is NULL, when auth_tag_len is 0 or greater than 16, when in/out is
+; NULL with a non-zero plaintext_len, or when aad is NULL with a non-zero
+; aad_len.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_),function,)
+FN_NAME(dec,_):
+
+ ; FUNC_SAVE runs first so that all exits share the FUNC_RESTORE at exit_dec
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_dec
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_dec
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_dec
+
+ cmp arg10, 16
+ ja exit_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_dec
+
+skip_in_out_check_dec:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_dec
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_dec
+
+skip_aad_check_dec:
+%endif
+
+ ; context setup from IV and AAD
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ ; bulk decryption of the payload
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ ; tag generation
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
+
+exit_dec:
+ FUNC_RESTORE
+
+ ret
+
+; Linux builds only: emit an empty, non-executable .note.GNU-stack section so
+; that this object does not ask the linker for an executable stack.
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/kasumi_avx.c b/src/spdk/intel-ipsec-mb/avx/kasumi_avx.c
new file mode 100644
index 000000000..4739191ac
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/kasumi_avx.c
@@ -0,0 +1,386 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <limits.h>
+
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
+
+#include "include/save_xmms.h"
+#include "include/kasumi_internal.h"
+#include "include/save_xmms.h"
+#include "include/clear_regs_mem.h"
+
+#define SAVE_XMMS save_xmms_avx
+#define RESTORE_XMMS restore_xmms_avx
+
+/*
+ * AVX-flavoured entry point for KASUMI F8 on one buffer whose length is given
+ * in bytes; the work itself is delegated to kasumi_f8_1_buffer().
+ *
+ * Non-Linux builds bracket the call with SAVE_XMMS/RESTORE_XMMS. SAFE_PARAM
+ * builds return early (without reaching RESTORE_XMMS) on a NULL pointer or a
+ * length outside 1 .. KASUMI_MAX_LEN / CHAR_BIT. SAFE_DATA builds clear the
+ * scratch registers after the call.
+ */
+void
+kasumi_f8_1_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+                       const void *pBufferIn, void *pBufferOut,
+                       const uint32_t cipherLengthInBytes)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t saved_xmms[10], 16);
+
+        SAVE_XMMS(saved_xmms);
+#endif
+#ifdef SAFE_PARAM
+        /* Every pointer argument must be non-NULL */
+        if (pCtx == NULL)
+                return;
+        if (pBufferIn == NULL)
+                return;
+        if (pBufferOut == NULL)
+                return;
+
+        /* Length must be non-zero and within the supported maximum */
+        if (cipherLengthInBytes == 0)
+                return;
+        if (cipherLengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+#endif
+        kasumi_f8_1_buffer(pCtx, IV, pBufferIn, pBufferOut,
+                           cipherLengthInBytes);
+#ifdef SAFE_DATA
+        /* Scrub registers that may still hold key or data material */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(saved_xmms);
+#endif
+}
+
+/*
+ * AVX-flavoured entry point for KASUMI F8 on one buffer whose length and
+ * starting offset are given in bits; delegates to kasumi_f8_1_buffer_bit().
+ *
+ * Non-Linux builds bracket the call with SAVE_XMMS/RESTORE_XMMS. SAFE_PARAM
+ * builds return early (without reaching RESTORE_XMMS) on a NULL pointer or a
+ * bit length outside 1 .. KASUMI_MAX_LEN; offsetInBits is not validated here.
+ * SAFE_DATA builds clear the scratch registers after the call.
+ */
+void
+kasumi_f8_1_buffer_bit_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+                           const void *pBufferIn,
+                           void *pBufferOut,
+                           const uint32_t cipherLengthInBits,
+                           const uint32_t offsetInBits)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t saved_xmms[10], 16);
+
+        SAVE_XMMS(saved_xmms);
+#endif
+#ifdef SAFE_PARAM
+        /* Every pointer argument must be non-NULL */
+        if (pCtx == NULL)
+                return;
+        if (pBufferIn == NULL)
+                return;
+        if (pBufferOut == NULL)
+                return;
+
+        /* Bit length must be non-zero and within the supported maximum */
+        if (cipherLengthInBits == 0)
+                return;
+        if (cipherLengthInBits > KASUMI_MAX_LEN)
+                return;
+#endif
+        kasumi_f8_1_buffer_bit(pCtx, IV, pBufferIn, pBufferOut,
+                               cipherLengthInBits, offsetInBits);
+#ifdef SAFE_DATA
+        /* Scrub registers that may still hold key or data material */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(saved_xmms);
+#endif
+}
+
+/*
+ * AVX-flavoured entry point for KASUMI F8 on two buffers that share one key
+ * schedule but carry their own IV and byte length; delegates to
+ * kasumi_f8_2_buffer().
+ *
+ * Non-Linux builds bracket the call with SAVE_XMMS/RESTORE_XMMS. SAFE_PARAM
+ * builds return early (without reaching RESTORE_XMMS) on a NULL pointer or
+ * when either length is outside 1 .. KASUMI_MAX_LEN / CHAR_BIT. SAFE_DATA
+ * builds clear the scratch registers after the call.
+ */
+void
+kasumi_f8_2_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
+                       const uint64_t IV2, const void *pBufferIn1,
+                       void *pBufferOut1, const uint32_t lengthInBytes1,
+                       const void *pBufferIn2, void *pBufferOut2,
+                       const uint32_t lengthInBytes2)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t saved_xmms[10], 16);
+
+        SAVE_XMMS(saved_xmms);
+#endif
+#ifdef SAFE_PARAM
+        /* Key schedule and all four data pointers must be non-NULL */
+        if (pCtx == NULL ||
+            pBufferIn1 == NULL || pBufferOut1 == NULL ||
+            pBufferIn2 == NULL || pBufferOut2 == NULL)
+                return;
+
+        /* Both lengths must be non-zero and within the supported maximum */
+        if (lengthInBytes1 == 0 ||
+            lengthInBytes1 > (KASUMI_MAX_LEN / CHAR_BIT) ||
+            lengthInBytes2 == 0 ||
+            lengthInBytes2 > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+#endif
+        kasumi_f8_2_buffer(pCtx, IV1, IV2,
+                           pBufferIn1, pBufferOut1, lengthInBytes1,
+                           pBufferIn2, pBufferOut2, lengthInBytes2);
+#ifdef SAFE_DATA
+        /* Scrub registers that may still hold key or data material */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(saved_xmms);
+#endif
+}
+
+/*
+ * AVX-flavoured entry point for KASUMI F8 on three buffers that share one key
+ * schedule and one byte length but carry their own IV; delegates to
+ * kasumi_f8_3_buffer().
+ *
+ * Non-Linux builds bracket the call with SAVE_XMMS/RESTORE_XMMS. SAFE_PARAM
+ * builds return early (without reaching RESTORE_XMMS) on a NULL pointer or a
+ * length outside 1 .. KASUMI_MAX_LEN / CHAR_BIT. SAFE_DATA builds clear the
+ * scratch registers after the call.
+ */
+void
+kasumi_f8_3_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV1,
+                       const uint64_t IV2, const uint64_t IV3,
+                       const void *pBufferIn1, void *pBufferOut1,
+                       const void *pBufferIn2, void *pBufferOut2,
+                       const void *pBufferIn3, void *pBufferOut3,
+                       const uint32_t lengthInBytes)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t saved_xmms[10], 16);
+
+        SAVE_XMMS(saved_xmms);
+#endif
+#ifdef SAFE_PARAM
+        /* Key schedule and all six data pointers must be non-NULL */
+        if (pCtx == NULL ||
+            pBufferIn1 == NULL || pBufferOut1 == NULL ||
+            pBufferIn2 == NULL || pBufferOut2 == NULL ||
+            pBufferIn3 == NULL || pBufferOut3 == NULL)
+                return;
+
+        /* Shared length must be non-zero and within the supported maximum */
+        if (lengthInBytes == 0)
+                return;
+        if (lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+#endif
+        kasumi_f8_3_buffer(pCtx, IV1, IV2, IV3,
+                           pBufferIn1, pBufferOut1,
+                           pBufferIn2, pBufferOut2,
+                           pBufferIn3, pBufferOut3, lengthInBytes);
+#ifdef SAFE_DATA
+        /* Scrub registers that may still hold key or data material */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(saved_xmms);
+#endif
+}
+
+/*
+ * AVX-flavoured entry point for KASUMI F8 on four buffers that share one key
+ * schedule and one byte length but carry their own IV; delegates to
+ * kasumi_f8_4_buffer().
+ *
+ * Non-Linux builds bracket the call with SAVE_XMMS/RESTORE_XMMS. SAFE_PARAM
+ * builds return early (without reaching RESTORE_XMMS) on a NULL pointer or a
+ * length outside 1 .. KASUMI_MAX_LEN / CHAR_BIT. SAFE_DATA builds clear the
+ * scratch registers after the call.
+ */
+void
+kasumi_f8_4_buffer_avx(const kasumi_key_sched_t *pCtx,
+                       const uint64_t IV1, const uint64_t IV2,
+                       const uint64_t IV3, const uint64_t IV4,
+                       const void *pBufferIn1, void *pBufferOut1,
+                       const void *pBufferIn2, void *pBufferOut2,
+                       const void *pBufferIn3, void *pBufferOut3,
+                       const void *pBufferIn4, void *pBufferOut4,
+                       const uint32_t lengthInBytes)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t saved_xmms[10], 16);
+
+        SAVE_XMMS(saved_xmms);
+#endif
+#ifdef SAFE_PARAM
+        /* Key schedule and all eight data pointers must be non-NULL */
+        if (pCtx == NULL ||
+            pBufferIn1 == NULL || pBufferOut1 == NULL ||
+            pBufferIn2 == NULL || pBufferOut2 == NULL ||
+            pBufferIn3 == NULL || pBufferOut3 == NULL ||
+            pBufferIn4 == NULL || pBufferOut4 == NULL)
+                return;
+
+        /* Shared length must be non-zero and within the supported maximum */
+        if (lengthInBytes == 0)
+                return;
+        if (lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+#endif
+        kasumi_f8_4_buffer(pCtx, IV1, IV2, IV3, IV4,
+                           pBufferIn1, pBufferOut1,
+                           pBufferIn2, pBufferOut2,
+                           pBufferIn3, pBufferOut3,
+                           pBufferIn4, pBufferOut4,
+                           lengthInBytes);
+#ifdef SAFE_DATA
+        /* Scrub registers that may still hold key or data material */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(saved_xmms);
+#endif
+}
+
+/*
+ * AVX-flavoured entry point for KASUMI F8 on an arbitrary number of buffers.
+ * The parallel arrays IV, pDataIn, pDataOut and dataLen each hold dataCount
+ * entries; they are handed to kasumi_f8_n_buffer() in consecutive batches of
+ * at most 16 buffers.
+ *
+ * Non-Linux builds bracket the work with SAVE_XMMS/RESTORE_XMMS. SAFE_PARAM
+ * builds return early (without reaching RESTORE_XMMS) when an array pointer
+ * or any per-buffer pointer is NULL, or when any length is outside
+ * 1 .. KASUMI_MAX_LEN / CHAR_BIT. SAFE_DATA builds clear the scratch
+ * registers once all batches are done.
+ */
+void
+kasumi_f8_n_buffer_avx(const kasumi_key_sched_t *pKeySchedule,
+                       const uint64_t IV[],
+                       const void * const pDataIn[], void *pDataOut[],
+                       const uint32_t dataLen[], const uint32_t dataCount)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t saved_xmms[10], 16);
+
+        SAVE_XMMS(saved_xmms);
+#endif
+        uint32_t remaining = dataCount;
+        uint32_t idx = 0;
+
+#ifdef SAFE_PARAM
+        /* The key schedule and every array itself must be present */
+        if (pKeySchedule == NULL)
+                return;
+        if (pDataIn == NULL || pDataOut == NULL)
+                return;
+        if (dataLen == NULL || IV == NULL)
+                return;
+
+        for (idx = 0; idx < dataCount; idx++) {
+                /* Per-buffer pointers must be non-NULL */
+                if (pDataIn[idx] == NULL || pDataOut[idx] == NULL)
+                        return;
+
+                /* Per-buffer length must be in the supported range */
+                if (dataLen[idx] == 0)
+                        return;
+                if (dataLen[idx] > (KASUMI_MAX_LEN / CHAR_BIT))
+                        return;
+        }
+#endif
+
+        /* The underlying n-buffer routine takes at most 16 buffers per call */
+        for (idx = 0; remaining > 0; ) {
+                const uint32_t batch = (remaining > 16) ? 16 : remaining;
+
+                kasumi_f8_n_buffer(pKeySchedule, &IV[idx], &pDataIn[idx],
+                                   &pDataOut[idx], &dataLen[idx], batch);
+                idx += batch;
+                remaining -= batch;
+        }
+#ifdef SAFE_DATA
+        /* Scrub registers that may still hold key or data material */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(saved_xmms);
+#endif
+}
+
+
+/*
+ * AVX-flavoured entry point for KASUMI F9 on one buffer whose length is given
+ * in bytes; the result is written through pDigest by kasumi_f9_1_buffer().
+ *
+ * Non-Linux builds bracket the call with SAVE_XMMS/RESTORE_XMMS. SAFE_PARAM
+ * builds return early (without reaching RESTORE_XMMS) on a NULL pointer or a
+ * length outside 1 .. KASUMI_MAX_LEN / CHAR_BIT. SAFE_DATA builds clear the
+ * scratch registers after the call.
+ */
+void
+kasumi_f9_1_buffer_avx(const kasumi_key_sched_t *pCtx, const void *pBufferIn,
+                       const uint32_t lengthInBytes, void *pDigest)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t saved_xmms[10], 16);
+
+        SAVE_XMMS(saved_xmms);
+#endif
+#ifdef SAFE_PARAM
+        /* Every pointer argument must be non-NULL */
+        if (pCtx == NULL)
+                return;
+        if (pBufferIn == NULL)
+                return;
+        if (pDigest == NULL)
+                return;
+
+        /* Length must be non-zero and within the supported maximum */
+        if (lengthInBytes == 0)
+                return;
+        if (lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT))
+                return;
+#endif
+        kasumi_f9_1_buffer(pCtx, pBufferIn, lengthInBytes, pDigest);
+#ifdef SAFE_DATA
+        /* Scrub registers that may still hold key or data material */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(saved_xmms);
+#endif
+}
+
+/*
+ * AVX-flavoured entry point for KASUMI F9 on one buffer whose length is given
+ * in bits, with caller-supplied IV and direction; delegates to
+ * kasumi_f9_1_buffer_user(), which writes the result through pDigest.
+ *
+ * Non-Linux builds bracket the call with SAVE_XMMS/RESTORE_XMMS. SAFE_PARAM
+ * builds return early (without reaching RESTORE_XMMS) on a NULL pointer or a
+ * bit length outside 1 .. KASUMI_MAX_LEN; direction is not validated here.
+ * SAFE_DATA builds clear the scratch registers after the call.
+ */
+void
+kasumi_f9_1_buffer_user_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV,
+                            const void *pBufferIn, const uint32_t lengthInBits,
+                            void *pDigest, const uint32_t direction)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t saved_xmms[10], 16);
+
+        SAVE_XMMS(saved_xmms);
+#endif
+#ifdef SAFE_PARAM
+        /* Every pointer argument must be non-NULL */
+        if (pCtx == NULL)
+                return;
+        if (pBufferIn == NULL)
+                return;
+        if (pDigest == NULL)
+                return;
+
+        /* Bit length must be non-zero and within the supported maximum */
+        if (lengthInBits == 0)
+                return;
+        if (lengthInBits > KASUMI_MAX_LEN)
+                return;
+#endif
+        kasumi_f9_1_buffer_user(pCtx, IV, pBufferIn, lengthInBits,
+                                pDigest, direction);
+#ifdef SAFE_DATA
+        /* Scrub registers that may still hold key or data material */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(saved_xmms);
+#endif
+}
+
+/*
+ * AVX-named alias for kasumi_init_f8_key_sched(): forwards both arguments
+ * unchanged and returns that function's status code as-is.
+ */
+int
+kasumi_init_f8_key_sched_avx(const void *const pKey,
+                             kasumi_key_sched_t *pCtx)
+{
+        const int status = kasumi_init_f8_key_sched(pKey, pCtx);
+
+        return status;
+}
+
+/*
+ * AVX-named alias for kasumi_init_f9_key_sched(): forwards both arguments
+ * unchanged and returns that function's status code as-is.
+ */
+int
+kasumi_init_f9_key_sched_avx(const void *const pKey,
+                             kasumi_key_sched_t *pCtx)
+{
+        const int status = kasumi_init_f9_key_sched(pKey, pCtx);
+
+        return status;
+}
+
+/*
+ * AVX-named alias for kasumi_key_sched_size(): returns whatever size that
+ * function reports.
+ */
+size_t
+kasumi_key_sched_size_avx(void)
+{
+        const size_t sched_size = kasumi_key_sched_size();
+
+        return sched_size;
+}
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm
new file mode 100644
index 000000000..3e3de0492
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; AES-192 instance of the AVX flush manager: the two names below are defined
+;; before including the shared template avx/mb_mgr_aes_flush_avx.asm.
+;; NOTE(review): presumably the template calls AES_CBC_ENC_X8 and exports its
+;; entry point under the FLUSH_JOB_AES_ENC name - confirm against the template.
+%define AES_CBC_ENC_X8 aes_cbc_enc_192_x8
+%define FLUSH_JOB_AES_ENC flush_job_aes192_enc_avx
+%include "avx/mb_mgr_aes_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm
new file mode 100644
index 000000000..57fae603c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_avx ; symbol name the submit template is instantiated under
+%define AES_CBC_ENC_X8 aes_cbc_enc_192_x8 ; AES-192 CBC encrypt kernel selected for the template
+%include "avx/mb_mgr_aes_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm
new file mode 100644
index 000000000..04c4824d7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FLUSH_JOB_AES_ENC flush_job_aes256_enc_avx ; symbol name the flush template is instantiated under
+%define AES_CBC_ENC_X8 aes_cbc_enc_256_x8 ; AES-256 CBC encrypt kernel the template declares extern
+%include "avx/mb_mgr_aes_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm
new file mode 100644
index 000000000..ee1de7165
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_avx ; symbol name the submit template is instantiated under
+%define AES_CBC_ENC_X8 aes_cbc_enc_256_x8 ; AES-256 CBC encrypt kernel selected for the template
+%include "avx/mb_mgr_aes_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm
new file mode 100644
index 000000000..9d132ec5f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm
@@ -0,0 +1,537 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+%include "include/memcpy.asm"
+
+%ifndef AES128_CBC_MAC
+;; Defaults used when the including file has not chosen its own names
+%define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_avx
+%define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_avx
+%define AES128_CBC_MAC aes128_cbc_mac_x8
+
+%endif
+;; CBC-MAC kernel called from the submit/flush macro below
+extern AES128_CBC_MAC
+
+section .data
+default rel
+
+align 16
+len_mask: ; NOTE(review): not referenced anywhere else in this file - confirm it is still needed
+ dq 0xFFFFFFFFFFFFFFF0
+align 16
+len_masks: ; 8 x 16-byte entries; entry I has 0xFFFF in 16-bit word I only (OR-ed into the lengths of empty lanes on flush)
+ dq 0x000000000000FFFF, 0x0000000000000000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+dupw: ; vpshufb control made of byte indices 0,1 repeated: copies word 0 into all eight words
+ dq 0x0100010001000100, 0x0100010001000100
+counter_mask: ; vpand mask for turning block 0 into counter block 0: keeps the low 3 bits of byte 0, clears bytes 14-15
+ dq 0xFFFFFFFFFFFFFF07, 0x0000FFFFFFFFFFFF
+one: dq 1 ; one..seven: lane indices used as cmovne memory operands in the flush path
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%define NROUNDS 9 ; AES-CCM-128: number of vaesenc rounds in ENCRYPT_SINGLE_BLOCK
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1 ; first argument of both entry points
+%define job arg2 ; second argument (SUBMIT only)
+%define len2 arg2 ; same register as job: min length handed to AES128_CBC_MAC as its 2nd argument
+
+%define job_rax rax ; job pointer returned to the caller (or NULL)
+%define tmp4 rax ; rax doubles as scratch ...
+%define auth_len_aad rax ; ... and as the AAD-inclusive length during SUBMIT setup
+
+%define min_idx rbp ; index of the lane with the minimum remaining length
+%define flags rbp ; same register as min_idx: flags byte of block 0 while it is being built
+
+%define lane r8
+
+%define iv_len r9
+%define auth_len r9 ; same register as iv_len
+
+%define aad_len r10
+%define init_block_addr r11
+
+%define unused_lanes rbx
+%define r rbx ; same register as unused_lanes
+
+%define tmp r12
+%define tmp2 r13
+%define tmp3 r14
+
+%define good_lane r15 ; FLUSH: a lane known to hold a job
+%define min_job r15 ; same register as good_lane: job owning lane min_idx
+
+%define init_block0 xmm0
+%define ccm_lens xmm1 ; eight 16-bit per-lane lengths
+%define min_len_idx xmm2 ; vphminposuw result: word 0 = min length, word 1 = its index
+%define xtmp0 xmm3
+%define xtmp1 xmm4
+%define xtmp2 xmm5
+%define xtmp3 xmm6
+
+; Stack frame: 8 GPR save slots + the caller's rsp = 9 qwords (an odd multiple of 8)
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15, plus rsi/rdi on non-Linux builds
+_rsp_save: resq 1 ; rsp as it was on entry, restored just before ret
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+%macro ENCRYPT_SINGLE_BLOCK 2
+;; %1 [in] GPR pointing at the expanded key schedule (one 16-byte round key per round)
+;; %2 [in/out] XMM register holding the block; encrypted in place
+;; Uses the assembly-time counter i
+ vpxor %2, [%1] ; round key 0
+%assign i 1
+%rep NROUNDS
+ vaesenc %2, [%1 + i*16] ; rounds 1..NROUNDS
+%assign i (i+1)
+%endrep
+ vaesenclast %2, [%1 + i*16] ; last round, key index NROUNDS+1
+%endmacro
+
+;;; ===========================================================================
+;;; AES CCM auth job submit & flush (one macro body expanded once per entry point)
+;;; ===========================================================================
+;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection. In: arg1 = state (arg2 = job for SUBMIT). Out: rax = finished job or NULL
+%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX 1
+%define %%SUBMIT_FLUSH %1
+
+ mov rax, rsp ; keep the entry rsp so it can be stored in the frame
+ sub rsp, STACK_size
+ and rsp, -16 ; 16-byte align the frame
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ;; Load the unused-lanes list (4 bits per lane index)
+ mov unused_lanes, [state + _aes_ccm_unused_lanes]
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+
+ mov lane, unused_lanes
+ and lane, 15 ; lowest nibble = lane taken for this job
+ shr unused_lanes, 4 ; remove that nibble from the list
+ mov [state + _aes_ccm_unused_lanes], unused_lanes
+
+ ;; Copy job info into lane
+ mov [state + _aes_ccm_job_in_lane + lane*8], job
+ ;; Copy keys into lane args
+ mov tmp, [job + _aes_enc_key_expanded]
+ mov [state + _aes_ccm_args_keys + lane*8], tmp
+ ;; init_done = 0
+ mov word [state + _aes_ccm_init_done + lane*2], 0
+ lea tmp, [lane * 8] ; tmp*2 below = lane*16
+
+ vpxor init_block0, init_block0
+ vmovdqa [state + _aes_ccm_args_IV + tmp*2], init_block0 ; zero the lane's 16-byte IV slot
+
+ ;; Prepare initial Block 0 for CBC-MAC-128
+
+ ;; Byte 0: flags with L' and M' (AAD later)
+ ;; Calculate L' = 15 - IV length - 1 = 14 - IV length
+ mov flags, 14
+ mov iv_len, [job + _iv_len_in_bytes]
+ sub flags, iv_len
+ ;; Calculate M' = (Digest length - 2) / 2
+ mov tmp, [job + _auth_tag_output_len_in_bytes]
+ sub tmp, 2
+
+ shl tmp, 2 ; M' << 3 (combine 1xshr, to div by 2, and 3xshl)
+ or flags, tmp
+
+ ;; Bytes 1 - 13: Nonce (7 - 13 bytes long)
+
+ ;; Bytes 1 - 7 are always copied (first 7 bytes)
+ mov tmp, [job + _iv]
+ vpinsrb init_block0, [tmp], 1 ; block byte 1 <- nonce byte 0
+ vpinsrw init_block0, [tmp + 1], 1 ; block bytes 2-3 <- nonce bytes 1-2
+ vpinsrd init_block0, [tmp + 3], 1 ; block bytes 4-7 <- nonce bytes 3-6
+
+ cmp iv_len, 7
+ je %%_finish_nonce_move
+
+ cmp iv_len, 8
+ je %%_iv_length_8
+ cmp iv_len, 9
+ je %%_iv_length_9
+ cmp iv_len, 10
+ je %%_iv_length_10
+ cmp iv_len, 11
+ je %%_iv_length_11
+ cmp iv_len, 12
+ je %%_iv_length_12
+
+ ;; Bytes 8 - 13 (any length not matched above falls through and is handled as 13)
+%%_iv_length_13:
+ vpinsrb init_block0, [tmp + 12], 13
+%%_iv_length_12:
+ vpinsrb init_block0, [tmp + 11], 12
+%%_iv_length_11:
+ vpinsrd init_block0, [tmp + 7], 2 ; block bytes 8-11 <- nonce bytes 7-10
+ jmp %%_finish_nonce_move
+%%_iv_length_10:
+ vpinsrb init_block0, [tmp + 9], 10
+%%_iv_length_9:
+ vpinsrb init_block0, [tmp + 8], 9
+%%_iv_length_8:
+ vpinsrb init_block0, [tmp + 7], 8
+
+%%_finish_nonce_move:
+
+ ;; Bytes 14 & 15 (message length), in Big Endian
+ mov ax, [job + _msg_len_to_hash_in_bytes] ; only the low 16 bits of the length are used
+ xchg al, ah ; byte swap to big endian
+ vpinsrw init_block0, ax, 7
+
+ mov aad_len, [job + _cbcmac_aad_len]
+ ;; Initial length to authenticate (Block 0)
+ mov auth_len, 16
+ ;; Length to authenticate (Block 0 + len(AAD) (2B) + AAD padded,
+ ;; so length is multiple of 16B)
+ lea auth_len_aad, [aad_len + (2 + 15) + 16]
+ and auth_len_aad, -16
+
+ or aad_len, aad_len ; ZF set when there is no AAD
+ cmovne auth_len, auth_len_aad ; with AAD: authenticate block 0 plus the padded AAD
+ ;; Update lengths to authenticate and find min length
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+ XVPINSRW ccm_lens, xtmp0, tmp2, lane, auth_len, scale_x16 ; external macro - presumably writes auth_len into word `lane`; confirm
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+ vphminposuw min_len_idx, ccm_lens
+
+ mov tmp, lane
+ shl tmp, 6 ; lane*64: each lane has a 64-byte init_blocks slot
+ lea init_block_addr, [state + _aes_ccm_init_blocks + tmp]
+ or aad_len, aad_len
+ je %%_aad_complete
+
+ or flags, (1 << 6) ; Set Adata bit in flags
+
+ ;; Copy AAD. NOTE(review): no bound check here against the 64-byte slot (16B block 0 + 2B length + AAD) - presumably validated by the caller; confirm
+ ;; Set all 0s in last block (padding)
+ lea tmp, [init_block_addr + auth_len]
+ sub tmp, 16 ; tmp = last 16-byte block of the area to authenticate
+ vpxor xtmp0, xtmp0
+ vmovdqa [tmp], xtmp0
+
+ ;; Start copying from second block
+ lea tmp, [init_block_addr+16]
+ mov rax, aad_len ; rax (auth_len_aad) was consumed by the cmovne above and is free again
+ xchg al, ah ; low 16 bits of the AAD length, big endian
+ mov [tmp], ax
+ add tmp, 2 ; AAD bytes follow the 2-byte length
+ mov tmp2, [job + _cbcmac_aad]
+ memcpy_avx_64_1 tmp, tmp2, aad_len, tmp3, tmp4, xtmp0, xtmp1, xtmp2, xtmp3
+
+%%_aad_complete:
+
+ ;; Finish Block 0 with Byte 0
+ vpinsrb init_block0, BYTE(flags), 0
+ vmovdqa [init_block_addr], init_block0
+
+ mov [state + _aes_ccm_args_in + lane * 8], init_block_addr ; lane input starts at block 0
+
+ cmp byte [state + _aes_ccm_unused_lanes], 0xf ; presumably 0xF is the end-of-list marker, i.e. no free lane is left; confirm
+ jne %%_return_null ; otherwise return NULL without processing anything
+
+%else ; end SUBMIT
+
+ ;; Check at least one job
+ bt unused_lanes, 35 ; presumably bit 35 is set only while all 8 lanes are unused; confirm against the list's initial value
+ jc %%_return_null
+
+ ;; Find a lane with a non-null job (the highest such lane wins, lane 0 is the default)
+ xor good_lane, good_lane
+ cmp QWORD [state + _aes_ccm_job_in_lane + 1*8], 0
+ cmovne good_lane, [rel one]
+ cmp QWORD [state + _aes_ccm_job_in_lane + 2*8], 0
+ cmovne good_lane, [rel two]
+ cmp QWORD [state + _aes_ccm_job_in_lane + 3*8], 0
+ cmovne good_lane, [rel three]
+ cmp qword [state + _aes_ccm_job_in_lane + 4*8], 0
+ cmovne good_lane, [rel four]
+ cmp qword [state + _aes_ccm_job_in_lane + 5*8], 0
+ cmovne good_lane, [rel five]
+ cmp qword [state + _aes_ccm_job_in_lane + 6*8], 0
+ cmovne good_lane, [rel six]
+ cmp qword [state + _aes_ccm_job_in_lane + 7*8], 0
+ cmovne good_lane, [rel seven]
+
+ ; Copy good_lane's init_done, input pointer, key pointer and IV to empty lanes
+ movzx tmp, word [state + _aes_ccm_init_done + good_lane*2]
+ mov tmp2, [state + _aes_ccm_args_in + good_lane*8]
+ mov tmp3, [state + _aes_ccm_args_keys + good_lane*8]
+ shl good_lane, 4 ; multiply by 16
+ vmovdqa xtmp0, [state + _aes_ccm_args_IV + good_lane]
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_ccm_job_in_lane + I*8], 0
+ jne APPEND(skip_,I)
+ vpor ccm_lens, [rel len_masks + 16*I] ; force the empty lane's length word to 0xFFFF
+ mov [state + _aes_ccm_init_done + I*2], WORD(tmp)
+ mov [state + _aes_ccm_args_in + I*8], tmp2
+ mov [state + _aes_ccm_args_keys + I*8], tmp3
+ vmovdqa [state + _aes_ccm_args_IV + I*16], xtmp0
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+ ;; Find min length
+ vphminposuw min_len_idx, ccm_lens
+
+%endif ; end FLUSH
+
+%%_ccm_round:
+ vpextrw len2, min_len_idx, 0 ; min value
+ vpextrw min_idx, min_len_idx, 1 ; min index (0...7)
+
+ mov min_job, [state + _aes_ccm_job_in_lane + min_idx*8] ; NOTE(review): r15 is read again after the call below - relies on the callee preserving it; confirm
+
+ or len2, len2
+ je %%_len_is_0 ; nothing left to process for the min lane, skip the kernel
+ ;; subtract min length from all lengths
+ vpshufb min_len_idx, min_len_idx, [rel dupw] ; broadcast min length
+ vpsubw ccm_lens, min_len_idx
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+ ; "state" and "args" are the same address, arg1
+ ; len2 is arg2
+ call AES128_CBC_MAC
+ ; state and min_idx are intact
+
+%%_len_is_0:
+ ;; init_done selects the next stage: 0 -> message full blocks, 1 -> partial block, anything else -> finalize
+ movzx tmp, WORD [state + _aes_ccm_init_done + min_idx*2]
+ cmp WORD(tmp), 0
+ je %%_prepare_full_blocks_to_auth
+ cmp WORD(tmp), 1
+ je %%_prepare_partial_block_to_auth
+
+%%_encrypt_digest:
+
+ ;; Set counter block 0 (reusing previous initial block 0)
+ mov tmp, min_idx
+ shl tmp, 3 ; tmp = min_idx*8 (scaled by 8 / 2 / 1 below for the 64B / 16B / 8B arrays)
+ vmovdqa init_block0, [state + _aes_ccm_init_blocks + tmp * 8]
+
+ vpand init_block0, [rel counter_mask] ; keep low 3 flag bits and the nonce, clear bytes 14-15
+
+ mov tmp2, [state + _aes_ccm_args_keys + tmp]
+ ENCRYPT_SINGLE_BLOCK tmp2, init_block0
+ vpxor init_block0, [state + _aes_ccm_args_IV + tmp * 2] ; XOR with the lane's digest kept in the IV slot
+
+ ;; Copy Mlen bytes into auth_tag_output (Mlen = 4,6,8,10,12,14,16)
+ mov min_job, [state + _aes_ccm_job_in_lane + tmp]
+ mov tmp3, [min_job + _auth_tag_output_len_in_bytes]
+ mov tmp2, [min_job + _auth_tag_output]
+
+ simd_store_avx tmp2, init_block0, tmp3, tmp, tmp4
+%%_update_lanes:
+ ; Update unused lanes: push min_idx back as the lowest nibble
+ mov unused_lanes, [state + _aes_ccm_unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, min_idx
+ mov [state + _aes_ccm_unused_lanes], unused_lanes
+
+ ; Set return job
+ mov job_rax, min_job
+
+ mov qword [state + _aes_ccm_job_in_lane + min_idx*8], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+
+%ifdef SAFE_DATA
+ vpxor xtmp0, xtmp0
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+ shl min_idx, 3 ; min_idx*8, scaled by 2 / 8 / 1 below
+ ;; Clear digest (in memory for CBC IV), counter block 0 and AAD of returned job
+ vmovdqa [state + _aes_ccm_args_IV + min_idx * 2], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 16], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 32], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 48], xtmp0
+ mov qword [state + _aes_ccm_args_keys + min_idx], 0
+%else
+ ;; Clear digest (in memory for CBC IV), counter block 0 and AAD
+ ;; of returned job and "NULL lanes"
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_ccm_job_in_lane + I*8], 0
+ jne APPEND(skip_clear_,I)
+ vmovdqa [state + _aes_ccm_args_IV + I*16], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64 + 16], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64 + 32], xtmp0
+ vmovdqa [state + _aes_ccm_init_blocks + I*64 + 48], xtmp0
+ mov qword [state + _aes_ccm_args_keys + I*8], 0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SUBMIT
+%endif ;; SAFE_DATA
+
+%%_return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%%_return_null:
+ xor job_rax, job_rax ; return NULL: no job completed by this call
+ jmp %%_return
+
+%%_prepare_full_blocks_to_auth:
+ ;; Stage after block 0 / AAD: point the lane at the message text
+ cmp dword [min_job + _cipher_direction], 2 ; DECRYPT
+ je %%_decrypt
+
+%%_encrypt:
+ mov tmp, [min_job + _src]
+ add tmp, [min_job + _hash_start_src_offset_in_bytes]
+ jmp %%_set_init_done_1
+
+%%_decrypt:
+ mov tmp, [min_job + _dst] ; presumably dst already holds the decrypted text at this point; confirm against the caller
+
+%%_set_init_done_1:
+ mov [state + _aes_ccm_args_in + min_idx*8], tmp
+ mov word [state + _aes_ccm_init_done + min_idx*2], 1
+
+ ; Check if there are full blocks to hash
+ mov tmp, [min_job + _msg_len_to_hash_in_bytes]
+ and tmp, -16 ; length rounded down to whole 16-byte blocks; ZF set if there are none
+ je %%_prepare_partial_block_to_auth
+
+ ;; Update lengths to authenticate and find min length
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+ XVPINSRW ccm_lens, xtmp0, tmp2, min_idx, tmp, scale_x16
+ vphminposuw min_len_idx, ccm_lens
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+ jmp %%_ccm_round
+
+%%_prepare_partial_block_to_auth:
+ ; Check if partial block needs to be hashed
+ mov auth_len, [min_job + _msg_len_to_hash_in_bytes]
+ and auth_len, 15 ; bytes in the trailing partial block
+ je %%_encrypt_digest ; none: go straight to producing the tag
+
+ mov word [state + _aes_ccm_init_done + min_idx * 2], 2
+ ;; Update lengths to authenticate and find min length
+ vmovdqa ccm_lens, [state + _aes_ccm_lens]
+ XVPINSRW ccm_lens, xtmp0, tmp2, min_idx, 16, scale_x16 ; one more 16-byte block for this lane
+ vphminposuw min_len_idx, ccm_lens
+ vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+ mov tmp2, min_idx
+ shl tmp2, 6
+ add tmp2, 16 ; pb[AES_BLOCK_SIZE]
+ lea init_block_addr, [state + _aes_ccm_init_blocks + tmp2] ; second 16 bytes of the lane's slot is reused as the partial-block buffer
+ mov tmp2, [state + _aes_ccm_args_in + min_idx * 8] ; current input pointer of the lane
+
+ simd_load_avx_15_1 xtmp0, tmp2, auth_len ; external macro - presumably loads auth_len bytes and zero-fills the rest; confirm
+
+%%_finish_partial_block_copy:
+ vmovdqa [init_block_addr], xtmp0
+ mov [state + _aes_ccm_args_in + min_idx * 8], init_block_addr ; lane now reads from the buffered partial block
+
+ jmp %%_ccm_round
+%endmacro
+
+
+align 64
+; JOB_AES_HMAC * submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state (rdi on Linux builds, rcx otherwise)
+; arg 2 : job (rsi on Linux builds, rdx otherwise); rax = completed job, or NULL when nothing was completed
+MKGLOBAL(SUBMIT_JOB_AES_CCM_AUTH,function,internal)
+SUBMIT_JOB_AES_CCM_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX SUBMIT
+
+; JOB_AES_HMAC * flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state)
+; arg 1 : state; rax = completed job, or NULL when the unused-lanes check finds nothing to flush
+MKGLOBAL(FLUSH_JOB_AES_CCM_AUTH,function,internal)
+FLUSH_JOB_AES_CCM_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX FLUSH
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm
new file mode 100644
index 000000000..e17023004
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm
@@ -0,0 +1,518 @@
+;;
+;; Copyright (c) 2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_avx
+%define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_avx
+%define AES128_CBC_MAC aes128_cbc_mac_x8 ; CBC-MAC kernel called from the submit/flush macro
+
+extern AES128_CBC_MAC
+
+section .data
+default rel
+
+align 16
+len_masks: ; 8 x 16-byte entries; entry I has 0xFFFF in 16-bit word I only (OR-ed into the lengths of empty lanes on flush)
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+dupw: ; vpshufb control made of byte indices 0,1 repeated: copies word 0 into all eight words
+ ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+one: dq 1 ; one..seven: lane indices used as cmovne memory operands in the flush path
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1 ; first argument of both entry points
+%define job arg2 ; second argument (SUBMIT only)
+%define len2 arg2 ; same register as job: min length handed to AES128_CBC_MAC as its 2nd argument
+
+%define job_rax rax ; job pointer returned to the caller (or NULL)
+
+; idx needs to be in rbp; len, idx and tmp all name the same register
+%define len rbp
+%define idx rbp
+%define tmp rbp
+
+%define lane r8
+
+%define iv r9
+%define m_last r10 ; address of the lane's 16-byte scratch block holding M_last
+%define n r11 ; number of 16-byte blocks in the message
+
+%define unused_lanes rbx
+%define r rbx ; same register as unused_lanes: message length in bytes modulo 16
+
+%define tmp3 r12
+%define tmp4 r13
+%define tmp2 r14
+
+%define good_lane r15 ; FLUSH: a lane known to hold a job
+%define rbits r15 ; same register as good_lane: message length in bits modulo 8
+
+; Stack frame: 8 GPR save slots + the caller's rsp = 9 qwords (an odd multiple of 8)
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15, plus rsi/rdi on non-Linux builds
+_rsp_save: resq 1 ; rsp as it was on entry, restored just before ret
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; ===========================================================================
+;;; AES CMAC job submit & flush (one macro body expanded once per entry point)
+;;; ===========================================================================
+;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection. In: arg1 = state (arg2 = job for SUBMIT). Out: rax = finished job or NULL
+%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_AVX 1
+%define %%SUBMIT_FLUSH %1
+
+ mov rax, rsp ; keep the entry rsp so it can be stored in the frame
+ sub rsp, STACK_size
+ and rsp, -16 ; 16-byte align the frame
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ;; Load the unused-lanes list (4 bits per lane index)
+ mov unused_lanes, [state + _aes_cmac_unused_lanes]
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+
+ mov lane, unused_lanes
+ and lane, 0xF ; lowest nibble = lane taken for this job
+ shr unused_lanes, 4 ; remove that nibble from the list
+ mov [state + _aes_cmac_unused_lanes], unused_lanes
+
+ ;; Copy job info into lane
+ mov [state + _aes_cmac_job_in_lane + lane*8], job
+ ;; Copy keys into lane args
+ mov tmp, [job + _key_expanded]
+ mov [state + _aes_cmac_args_keys + lane*8], tmp
+ mov tmp, lane
+ shl tmp, 4 ; lane*16
+
+ ;; Zero IV to store digest
+ vpxor xmm0, xmm0
+ vmovdqa [state + _aes_cmac_args_IV + tmp], xmm0
+
+ lea m_last, [state + _aes_cmac_scratch + tmp] ; lane's 16-byte scratch block for M_last
+
+ ;; calculate len
+ ;; convert bits to bytes (message length in bits for CMAC)
+ mov len, [job + _msg_len_to_hash_in_bits]
+ mov rbits, len
+ add len, 7 ; inc len if there are remainder bits
+ shr len, 3
+ and rbits, 7 ; rbits = number of bits used in the last byte (0 = whole bytes)
+
+ ;; Check number of blocks and for partial block
+ mov r, len ; set remainder
+ and r, 0xf
+
+ lea n, [len + 0xf] ; set num blocks
+ shr n, 4 ; n = ceil(len / 16); also sets ZF for the jz below
+
+ jz %%_lt_one_block ; check one or more blocks?
+
+ ;; One or more blocks, potentially partial
+ mov word [state + _aes_cmac_init_done + lane*2], 0 ; 0 = M_last has not been queued yet
+
+ mov tmp2, [job + _src]
+ add tmp2, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _aes_cmac_args_in + lane*8], tmp2
+
+ ;; len = (n-1)*16
+ lea tmp2, [n - 1]
+ shl tmp2, 4
+ vmovdqa xmm0, [state + _aes_cmac_lens]
+ XVPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16 ; tmp (rbp, i.e. len) is handed over as scratch; len is not read again
+ vmovdqa [state + _aes_cmac_lens], xmm0
+
+ ;; check remainder bits
+ or rbits, rbits
+ jnz %%_not_complete_block_3gpp
+
+ ;; check if complete block
+ or r, r
+ jz %%_complete_block
+
+%%_not_complete_block:
+ ;; M_last = padding(M_n) XOR K2
+ lea tmp, [rel padding_0x80_tab16 + 16]
+ sub tmp, r ; external table - presumably this window has 0x80 at byte r and zeros elsewhere; confirm against const.inc
+ vmovdqu xmm0, [tmp]
+ vmovdqa [m_last], xmm0
+
+ mov tmp, [job + _src]
+ add tmp, [job + _hash_start_src_offset_in_bytes]
+ lea tmp3, [n - 1]
+ shl tmp3, 4
+ add tmp, tmp3 ; tmp = start of the last (partial) block
+
+ memcpy_avx_16 m_last, tmp, r, tmp4, iv ; copy the r message bytes over the padding pattern
+
+ ;; XOR the padded last block with subkey K2 (skey2)
+ mov tmp3, [job + _skey2]
+ vmovdqa xmm1, [m_last]
+ vmovdqu xmm0, [tmp3]
+ vpxor xmm0, xmm1
+ vmovdqa [m_last], xmm0
+
+%%_step_5:
+ ;; Find min length
+ vmovdqa xmm0, [state + _aes_cmac_lens]
+ vphminposuw xmm1, xmm0
+
+ cmp byte [state + _aes_cmac_unused_lanes], 0xf ; presumably 0xF is the end-of-list marker, i.e. no free lane is left; confirm
+ jne %%_return_null ; otherwise return NULL without processing anything
+
+%else ; end SUBMIT
+
+ ;; Check at least one job
+ bt unused_lanes, 35 ; presumably bit 35 is set only while all 8 lanes are unused; confirm against the list's initial value
+ jc %%_return_null
+
+ ;; Find a lane with a non-null job (the highest such lane wins, lane 0 is the default)
+ xor good_lane, good_lane
+ cmp qword [state + _aes_cmac_job_in_lane + 1*8], 0
+ cmovne good_lane, [rel one]
+ cmp qword [state + _aes_cmac_job_in_lane + 2*8], 0
+ cmovne good_lane, [rel two]
+ cmp qword [state + _aes_cmac_job_in_lane + 3*8], 0
+ cmovne good_lane, [rel three]
+ cmp qword [state + _aes_cmac_job_in_lane + 4*8], 0
+ cmovne good_lane, [rel four]
+ cmp qword [state + _aes_cmac_job_in_lane + 5*8], 0
+ cmovne good_lane, [rel five]
+ cmp qword [state + _aes_cmac_job_in_lane + 6*8], 0
+ cmovne good_lane, [rel six]
+ cmp qword [state + _aes_cmac_job_in_lane + 7*8], 0
+ cmovne good_lane, [rel seven]
+
+ ; Copy good_lane's input pointer, key pointer and IV to empty lanes
+ mov tmp2, [state + _aes_cmac_args_in + good_lane*8]
+ mov tmp3, [state + _aes_cmac_args_keys + good_lane*8]
+ shl good_lane, 4 ; multiply by 16
+ vmovdqa xmm2, [state + _aes_cmac_args_IV + good_lane]
+ vmovdqa xmm0, [state + _aes_cmac_lens]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_cmac_job_in_lane + I*8], 0
+ jne APPEND(skip_,I)
+ mov [state + _aes_cmac_args_in + I*8], tmp2
+ mov [state + _aes_cmac_args_keys + I*8], tmp3
+ vmovdqa [state + _aes_cmac_args_IV + I*16], xmm2
+ vpor xmm0, [rel len_masks + 16*I] ; force the empty lane's length word to 0xFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+ ;; Find min length
+ vphminposuw xmm1, xmm0
+
+%endif ; end FLUSH
+
+%%_cmac_round:
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je %%_len_is_0 ; nothing left to process for the min lane, skip the kernel
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm1 ; subtract min length from all lengths
+ vmovdqa [state + _aes_cmac_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len2 is arg2
+ call AES128_CBC_MAC
+ ; state and idx are intact
+
+ vmovdqa xmm0, [state + _aes_cmac_lens] ; preload lens
+%%_len_is_0:
+ ; Check if job complete (init_done != 0 means M_last was already queued)
+ test word [state + _aes_cmac_init_done + idx*2], 0xffff
+ jnz %%_copy_complete_digest
+
+ ; Finish step 6: queue the prepared M_last block as the lane's final 16 bytes
+ mov word [state + _aes_cmac_init_done + idx*2], 1
+
+ XVPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
+ vmovdqa [state + _aes_cmac_lens], xmm0
+
+ vphminposuw xmm1, xmm0 ; find min length
+
+ mov tmp3, idx
+ shl tmp3, 4 ; idx*16
+ lea m_last, [state + _aes_cmac_scratch + tmp3]
+ mov [state + _aes_cmac_args_in + idx*8], m_last ; lane now reads from its scratch block
+
+ jmp %%_cmac_round
+
+%%_copy_complete_digest:
+ ; Job complete, copy digest to AT output
+ mov job_rax, [state + _aes_cmac_job_in_lane + idx*8]
+
+ mov tmp4, idx
+ shl tmp4, 4
+ lea tmp3, [state + _aes_cmac_args_IV + tmp4] ; tmp3 = address of the lane's digest (kept in the IV slot)
+ mov tmp4, [job_rax + _auth_tag_output_len_in_bytes]
+ mov tmp2, [job_rax + _auth_tag_output]
+
+ cmp tmp4, 16
+ jne %%_ne_16_copy
+
+ ;; 16 byte AT copy
+ vmovdqa xmm0, [tmp3]
+ vmovdqu [tmp2], xmm0
+ jmp %%_update_lanes
+
+%%_ne_16_copy:
+ memcpy_avx_16 tmp2, tmp3, tmp4, lane, iv ; tag length other than 16: copy tmp4 bytes
+
+%%_update_lanes:
+ ; Update unused lanes: push idx back as the lowest nibble
+ mov unused_lanes, [state + _aes_cmac_unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _aes_cmac_unused_lanes], unused_lanes
+
+ ; Set return job (reloaded from the lane table)
+ mov job_rax, [state + _aes_cmac_job_in_lane + idx*8]
+
+ mov qword [state + _aes_cmac_job_in_lane + idx*8], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+ ;; Clear digest (in memory for IV) and scratch memory of returned job
+ vmovdqa [tmp3], xmm0 ; NOTE(review): assumes tmp3 still holds the digest address after memcpy_avx_16; confirm the macro keeps its source pointer
+
+ shl idx, 4
+ vmovdqa [state + _aes_cmac_scratch + idx], xmm0
+
+%else
+ ;; Clear digest and scratch memory of returned job and "NULL lanes"
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_cmac_job_in_lane + I*8], 0
+ jne APPEND(skip_clear_,I)
+ vmovdqa [state + _aes_cmac_args_IV + I*16], xmm0
+ vmovdqa [state + _aes_cmac_scratch + I*16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif ;; SUBMIT
+
+%endif ;; SAFE_DATA
+
+%%_return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%%_return_null:
+ xor job_rax, job_rax ; return NULL: no job completed by this call
+ jmp %%_return
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+%%_complete_block:
+
+ ;; Block size aligned: tmp2 = address of the last full block
+ mov tmp2, [job + _src]
+ add tmp2, [job + _hash_start_src_offset_in_bytes]
+ lea tmp3, [n - 1]
+ shl tmp3, 4
+ add tmp2, tmp3
+
+ ;; M_last = M_n XOR K1
+ mov tmp3, [job + _skey1]
+ vmovdqu xmm0, [tmp3]
+ vmovdqu xmm1, [tmp2]
+ vpxor xmm0, xmm1
+ vmovdqa [m_last], xmm0
+
+ jmp %%_step_5
+
+%%_lt_one_block:
+ ;; Single partial block (reached when n == 0): M_last is the only block to process
+ mov word [state + _aes_cmac_init_done + lane*2], 1
+ mov [state + _aes_cmac_args_in + lane*8], m_last
+
+ vmovdqa xmm0, [state + _aes_cmac_lens]
+ XVPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
+ vmovdqa [state + _aes_cmac_lens], xmm0
+
+ mov n, 1 ; so that (n - 1)*16 = 0 in the shared padding code
+ jmp %%_not_complete_block
+
+%%_not_complete_block_3gpp:
+ ;; bit pad last block (message length is not a whole number of bytes)
+ ;; xor with skey2
+ ;; copy to m_last
+
+ ;; load pointer to src
+ mov tmp, [job + _src]
+ add tmp, [job + _hash_start_src_offset_in_bytes]
+ lea tmp3, [n - 1]
+ shl tmp3, 4
+ add tmp, tmp3 ; tmp = start of the last block
+
+ ;; check if partial block
+ or r, r
+ jz %%_load_full_block_3gpp
+
+ simd_load_avx_15_1 xmm0, tmp, r ; external macro - presumably loads r bytes and zero-fills the rest; confirm
+ dec r ; r = index of the last message byte
+
+%%_update_mlast_3gpp:
+ ;; set last byte padding mask
+ ;; shift into correct xmm idx
+
+ ;; save and restore rcx on windows (rcx is arg1 there)
+%ifndef LINUX
+ mov tmp, rcx
+%endif
+ mov rcx, rbits ; variable shift count must be in cl
+ mov tmp3, 0xff
+ shr tmp3, cl ; tmp3 = 0xff >> rbits: the bits of the last byte that are not message bits
+ movq xmm2, tmp3
+ XVPSLLB xmm2, r, xmm1, tmp2 ; external macro - presumably moves the mask byte to byte position r; confirm
+
+ ;; pad final byte
+ vpandn xmm2, xmm0 ; xmm2 = block with the non-message bits of the last byte cleared
+%ifndef LINUX
+ mov rcx, tmp
+%endif
+ ;; set OR mask to pad final bit
+ mov tmp2, tmp3
+ shr tmp2, 1
+ xor tmp2, tmp3 ; XOR to get OR mask
+ movq xmm3, tmp2
+ ;; xmm1 contains shift table from previous shift
+ vpshufb xmm3, xmm1
+
+ ;; load skey2 address
+ mov tmp3, [job + _skey2]
+ vmovdqu xmm1, [tmp3]
+
+ ;; set final padding bit
+ vpor xmm2, xmm3
+
+ ;; XOR last partial block with skey2
+ ;; update mlast
+ vpxor xmm2, xmm1
+ vmovdqa [m_last], xmm2
+
+ jmp %%_step_5
+
+%%_load_full_block_3gpp:
+ vmovdqu xmm0, [tmp]
+ mov r, 0xf ; last message byte is byte 15 of a full block
+ jmp %%_update_mlast_3gpp
+%endif
+%endmacro
+
+
+align 64
+; JOB_AES_HMAC * submit_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state (rdi on Linux builds, rcx otherwise)
+; arg 2 : job (rsi on Linux builds, rdx otherwise); rax = completed job, or NULL when nothing was completed
+MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal)
+SUBMIT_JOB_AES_CMAC_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_AVX SUBMIT
+
+; JOB_AES_HMAC * flush_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state)
+; arg 1 : state; rax = completed job, or NULL when the unused-lanes check finds nothing to flush
+MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal)
+FLUSH_JOB_AES_CMAC_AUTH:
+ GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_AVX FLUSH
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm
new file mode 100644
index 000000000..dbd2a4547
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm
@@ -0,0 +1,239 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%ifndef AES_CBC_ENC_X8
+%define AES_CBC_ENC_X8 aes_cbc_enc_128_x8
+%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_avx
+%endif
+
+; void AES_CBC_ENC_X8(AES_ARGS *args, UINT64 len_in_bytes); (the two names above default to the AES-128 ones only when AES_CBC_ENC_X8 is not already defined)
+extern AES_CBC_ENC_X8
+
+section .data
+default rel
+align 16
+len_masks: ; 8 entries of 16 bytes; entry I has 0xFFFF in 16-bit word I and zero elsewhere
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+dupw: ; vpshufb control: every 16-bit word selects source bytes 0 and 1
+ ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+one: dq 1 ; one..seven: qword lane numbers used as the memory operands of cmovne
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+%define unused_lanes rbx
+%define tmp1 rbx
+
+%define good_lane rdx
+%define iv rdx
+
+%define tmp2 rax
+
+; idx needs to be in rbp (the code after "call AES_CBC_ENC_X8" expects idx to be intact)
+%define tmp rbp
+%define idx rbp
+
+%define tmp3 r8
+%endif
+
+; STACK_SPACE (STACK_size here: 8 + 1 qwords = 72 bytes) needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15, plus rsi and rdi when LINUX is not defined
+_rsp_save: resq 1 ; rsp as it was on entry (reloaded just before ret)
+endstruc
+
+; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job (never read as an input here; the arg2 register is overwritten and reused as len2)
+MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal)
+FLUSH_JOB_AES_ENC: ; returns in rax the job taken out of lane idx, or NULL via return_null
+
+ mov rax, rsp ; remember the incoming rsp; it is stored in _rsp_save below
+ sub rsp, STACK_size ; make room for the STACK frame
+ and rsp, -16 ; round the frame address down to a 16-byte boundary
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; check for empty
+ mov unused_lanes, [state + _aes_unused_lanes]
+ bt unused_lanes, 32+3 ; CF = bit 35 of the unused-lane list
+ jc return_null ; bit set: treated as "nothing to flush", return NULL
+
+ ; find a lane with a non-null job
+ xor good_lane, good_lane ; start with lane 0 as the candidate
+ cmp qword [state + _aes_job_in_lane + 1*8], 0
+ cmovne good_lane, [rel one] ; cmov has no immediate form, hence the qword constants
+ cmp qword [state + _aes_job_in_lane + 2*8], 0
+ cmovne good_lane, [rel two]
+ cmp qword [state + _aes_job_in_lane + 3*8], 0
+ cmovne good_lane, [rel three]
+ cmp qword [state + _aes_job_in_lane + 4*8], 0
+ cmovne good_lane, [rel four]
+ cmp qword [state + _aes_job_in_lane + 5*8], 0
+ cmovne good_lane, [rel five]
+ cmp qword [state + _aes_job_in_lane + 6*8], 0
+ cmovne good_lane, [rel six]
+ cmp qword [state + _aes_job_in_lane + 7*8], 0
+ cmovne good_lane, [rel seven] ; net effect: highest-numbered lane in 1..7 holding a job, else 0
+
+ ; copy good_lane to empty lanes
+ mov tmp1, [state + _aes_args_in + good_lane*8] ; tmp1 shares rbx with unused_lanes (reloaded at len_is_0)
+ mov tmp2, [state + _aes_args_out + good_lane*8]
+ mov tmp3, [state + _aes_args_keys + good_lane*8]
+ shl good_lane, 4 ; multiply by 16
+ vmovdqa xmm2, [state + _aes_args_IV + good_lane] ; xmm2 = good lane's 16-byte IV
+ vmovdqa xmm0, [state + _aes_lens] ; xmm0 = the eight 16-bit lane lengths
+
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_job_in_lane + I*8], 0
+ jne APPEND(skip_,I) ; lane I holds a job: leave its arguments untouched
+ mov [state + _aes_args_in + I*8], tmp1
+ mov [state + _aes_args_out + I*8], tmp2
+ mov [state + _aes_args_keys + I*8], tmp3
+ vmovdqa [state + _aes_args_IV + I*16], xmm2
+ vpor xmm0, xmm0, [rel len_masks + 16*I] ; set empty lane I's length word to 0xFFFF (in xmm0 only)
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0 ; minimum length is already 0: skip the cipher call
+
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1 ; subtract the minimum from every lane's length
+ vmovdqa [state + _aes_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_CBC_ENC_X8
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ mov job_rax, [state + _aes_job_in_lane + idx*8] ; return value: the job that was in lane idx
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov qword [state + _aes_job_in_lane + idx*8], 0 ; lane idx no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_AES
+ shl unused_lanes, 4 ; open a nibble at the bottom of the unused-lane list ...
+ or unused_lanes, idx ; ... and record lane idx there
+ mov [state + _aes_unused_lanes], unused_lanes
+%ifdef SAFE_DATA
+ ;; Clear IVs of returned job and "NULL lanes"
+ vpxor xmm0, xmm0
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_job_in_lane + I*8], 0
+ jne APPEND(skip_clear_,I) ; lane I still holds a job: keep its IV
+ vmovdqa [state + _aes_args_IV + I*16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax ; rax = NULL
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm
new file mode 100644
index 000000000..c95fa1f6c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm
@@ -0,0 +1,194 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+
+%ifndef AES_CBC_ENC_X8
+%define AES_CBC_ENC_X8 aes_cbc_enc_128_x8
+%define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_avx
+%endif
+
+; void AES_CBC_ENC_X8(AES_ARGS *args, UINT64 len_in_bytes); (the two names above default to the AES-128 ones only when AES_CBC_ENC_X8 is not already defined)
+extern AES_CBC_ENC_X8
+
+section .data
+default rel
+
+align 16
+dupw: ; vpshufb control: every 16-bit word selects source bytes 0 and 1
+ ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+; idx needs to be in rbp (expected intact after "call AES_CBC_ENC_X8"); len and tmp share that register
+%define len rbp
+%define idx rbp
+%define tmp rbp
+
+%define lane r8
+
+%define iv r9
+
+%define unused_lanes rbx
+%endif
+
+; STACK_SPACE (STACK_size here: 8 + 1 qwords = 72 bytes) needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15, plus rsi and rdi when LINUX is not defined
+_rsp_save: resq 1 ; rsp as it was on entry (reloaded just before ret)
+endstruc
+
+; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job (stored into the lane taken from the unused-lane list)
+MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal)
+SUBMIT_JOB_AES_ENC: ; returns in rax a completed job, or NULL via return_null
+
+ mov rax, rsp ; remember the incoming rsp; it is stored in _rsp_save below
+ sub rsp, STACK_size ; make room for the STACK frame
+ and rsp, -16 ; round the frame address down to a 16-byte boundary
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF ; lane = lowest nibble of the unused-lane list
+ shr unused_lanes, 4 ; remove that nibble from the list
+ mov len, [job + _msg_len_to_cipher_in_bytes]
+ and len, -16 ; DOCSIS may pass size unaligned to block size
+ mov iv, [job + _iv] ; iv = pointer read from the job, dereferenced below
+ mov [state + _aes_unused_lanes], unused_lanes
+
+ mov [state + _aes_job_in_lane + lane*8], job
+
+ vmovdqa xmm0, [state + _aes_lens]
+ XVPINSRW xmm0, xmm1, tmp, lane, len, scale_x16 ; macro not defined in this file; presumably writes len into lane's 16-bit slot of xmm0 - confirm (note tmp and len are both rbp)
+ vmovdqa [state + _aes_lens], xmm0
+
+ mov tmp, [job + _src]
+ add tmp, [job + _cipher_start_src_offset_in_bytes] ; tmp = src + cipher start offset
+ vmovdqu xmm0, [iv] ; unaligned 16-byte load through the job's IV pointer
+ mov [state + _aes_args_in + lane*8], tmp
+ mov tmp, [job + _aes_enc_key_expanded]
+ mov [state + _aes_args_keys + lane*8], tmp
+ mov tmp, [job + _dst]
+ mov [state + _aes_args_out + lane*8], tmp
+ shl lane, 4 ; multiply by 16
+ vmovdqa [state + _aes_args_IV + lane], xmm0
+
+ cmp unused_lanes, 0xf ; only a list equal to 0xF goes on to processing (presumably: no free lane left)
+ jne return_null ; otherwise return NULL without processing anything
+
+ ; Find min length
+ vmovdqa xmm0, [state + _aes_lens]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0 ; minimum length is already 0: skip the cipher call
+
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1 ; subtract the minimum from every lane's length
+ vmovdqa [state + _aes_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_CBC_ENC_X8
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ mov job_rax, [state + _aes_job_in_lane + idx*8] ; return value: the job that was in lane idx
+ mov unused_lanes, [state + _aes_unused_lanes]
+ mov qword [state + _aes_job_in_lane + idx*8], 0 ; lane idx no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_AES
+ shl unused_lanes, 4 ; open a nibble at the bottom of the unused-lane list ...
+ or unused_lanes, idx ; ... and record lane idx there
+ mov [state + _aes_unused_lanes], unused_lanes
+%ifdef SAFE_DATA
+ ;; Clear IV and key pointer of the returned job's lane
+ vpxor xmm0, xmm0
+ shl idx, 3 ; multiply by 8
+ vmovdqa [state + _aes_args_IV + idx*2], xmm0 ; idx*2 = lane*16, the lane's IV slot
+ mov qword [state + _aes_args_keys + idx], 0 ; idx = lane*8, the lane's key pointer slot
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax ; rax = NULL
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm
new file mode 100644
index 000000000..a810842a9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm
@@ -0,0 +1,264 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%ifndef AES_XCBC_X8
+%define AES_XCBC_X8 aes_xcbc_mac_128_x8
+%define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx
+%endif
+
+; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); (the two names above are defaults, set only when AES_XCBC_X8 is not already defined)
+extern AES_XCBC_X8
+
+section .data
+default rel
+
+align 16
+len_masks: ; 8 entries of 16 bytes; entry I has 0xFFFF in 16-bit word I and zero elsewhere
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+dupw: ; vpshufb control: every 16-bit word selects source bytes 0 and 1
+ ;ddq 0x01000100010001000100010001000100
+ dq 0x0100010001000100, 0x0100010001000100
+one: dq 1 ; one..seven: qword lane numbers used as the memory operands of cmovne
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+%define unused_lanes rbx
+%define tmp1 rbx
+
+%define icv rdx
+
+%define tmp2 rax
+
+; idx needs to be in rbp (the code after "call AES_XCBC_X8" expects idx to be intact)
+%define tmp r10
+%define idx rbp
+
+%define tmp3 r8
+%define lane_data r9
+%endif
+
+; STACK_SPACE (STACK_size here: 8 + 1 qwords = 72 bytes) needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15, plus rsi and rdi when LINUX is not defined
+_rsp_save: resq 1 ; rsp as it was on entry (reloaded just before ret)
+endstruc
+
+; JOB* FLUSH_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job (never read as an input here; the arg2 register is overwritten and reused as len2)
+MKGLOBAL(FLUSH_JOB_AES_XCBC,function,internal)
+FLUSH_JOB_AES_XCBC: ; returns in rax the job taken out of lane idx, or NULL via return_null
+
+ mov rax, rsp ; remember the incoming rsp; it is stored in _rsp_save below
+ sub rsp, STACK_size ; make room for the STACK frame
+ and rsp, -16 ; round the frame address down to a 16-byte boundary
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; check for empty
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ bt unused_lanes, 32+3 ; CF = bit 35 of the unused-lane list
+ jc return_null ; bit set: treated as "nothing to flush", return NULL
+
+ ; find a lane with a non-null job
+ xor idx, idx ; start with lane 0 as the candidate
+ cmp qword [state + _aes_xcbc_ldata + 1 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel one] ; cmov has no immediate form, hence the qword constants
+ cmp qword [state + _aes_xcbc_ldata + 2 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel two]
+ cmp qword [state + _aes_xcbc_ldata + 3 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel three]
+ cmp qword [state + _aes_xcbc_ldata + 4 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel four]
+ cmp qword [state + _aes_xcbc_ldata + 5 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel five]
+ cmp qword [state + _aes_xcbc_ldata + 6 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel six]
+ cmp qword [state + _aes_xcbc_ldata + 7 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ cmovne idx, [rel seven] ; net effect: highest-numbered lane in 1..7 holding a job, else 0
+
+copy_lane_data:
+ ; copy idx to empty lanes
+ mov tmp1, [state + _aes_xcbc_args_in + idx*8] ; tmp1 shares rbx with unused_lanes (reloaded at end_loop)
+ mov tmp3, [state + _aes_xcbc_args_keys + idx*8]
+ shl idx, 4 ; multiply by 16
+ vmovdqa xmm2, [state + _aes_xcbc_args_ICV + idx] ; xmm2 = lane idx's 16-byte ICV
+ vmovdqa xmm0, [state + _aes_xcbc_lens] ; xmm0 = the eight 16-bit lane lengths
+
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ jne APPEND(skip_,I) ; lane I holds a job: leave its arguments untouched
+ mov [state + _aes_xcbc_args_in + I*8], tmp1
+ mov [state + _aes_xcbc_args_keys + I*8], tmp3
+ vmovdqa [state + _aes_xcbc_args_ICV + I*16], xmm2
+ vpor xmm0, xmm0, [rel len_masks + 16*I] ; set empty lane I's length word to 0xFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _aes_xcbc_lens], xmm0 ; write the lengths back, including the 0xFFFF words
+
+ ; Find min length
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0 ; minimum length is already 0: skip the MAC call
+
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1 ; subtract the minimum from every lane's length
+ vmovdqa [state + _aes_xcbc_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_XCBC_X8
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _XCBC_LANE_DATA_size
+ lea lane_data, [state + _aes_xcbc_ldata + lane_data] ; lane_data = address of lane idx's entry in _aes_xcbc_ldata
+ cmp dword [lane_data + _xcbc_final_done], 0
+ jne end_loop ; final_done already set: the job is finished
+
+ mov dword [lane_data + _xcbc_final_done], 1
+ mov word [state + _aes_xcbc_lens + 2*idx], 16 ; give lane idx one more 16-byte block ...
+ lea tmp, [lane_data + _xcbc_final_block]
+ mov [state + _aes_xcbc_args_in + 8*idx], tmp ; ... read from the lane's final_block buffer
+ jmp copy_lane_data ; go round again, with idx as the lane copied into the empty lanes
+
+end_loop:
+ mov job_rax, [lane_data + _xcbc_job_in_lane] ; return value: the job that was in lane idx
+ mov icv, [job_rax + _auth_tag_output] ; icv = the job's tag output pointer
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ mov qword [lane_data + _xcbc_job_in_lane], 0 ; lane idx no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4 ; open a nibble at the bottom of the unused-lane list ...
+ or unused_lanes, idx ; ... and record lane idx there
+ shl idx, 4 ; multiply by 16
+ mov [state + _aes_xcbc_unused_lanes], unused_lanes
+
+ ; copy 12 bytes
+ vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx]
+ vmovq [icv], xmm0 ; bytes 0..7
+ vpextrd [icv + 8], xmm0, 2 ; bytes 8..11
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+ ;; Clear ICV's and final blocks in returned job and NULL lanes
+%assign I 0
+%rep 8
+ cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+ jne APPEND(skip_clear_,I) ; lane I still holds a job: keep its data
+ vmovdqa [state + _aes_xcbc_args_ICV + I*16], xmm0
+ lea lane_data, [state + _aes_xcbc_ldata + (I * _XCBC_LANE_DATA_size)]
+ vmovdqa [lane_data + _xcbc_final_block], xmm0
+ vmovdqa [lane_data + _xcbc_final_block + 16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+return_null:
+ xor job_rax, job_rax ; rax = NULL
+ jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm
new file mode 100644
index 000000000..38f6a6470
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm
@@ -0,0 +1,272 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+%ifndef AES_XCBC_X8
+%define AES_XCBC_X8 aes_xcbc_mac_128_x8
+%define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx
+%endif
+
+; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); (the two names above are defaults, set only when AES_XCBC_X8 is not already defined)
+extern AES_XCBC_X8
+
+
+section .data
+default rel
+
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100 (vpshufb control: every 16-bit word selects source bytes 0 and 1)
+ dq 0x0100010001000100, 0x0100010001000100
+x80: ;ddq 0x00000000000000000000000000000080 (padding block: byte 0 is 0x80, the other 15 bytes are zero)
+ dq 0x0000000000000080, 0x0000000000000000
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+; idx needs to be in rbp (expected intact after "call AES_XCBC_X8"); tmp2 shares that register
+%define len r11
+%define idx rbp
+%define tmp2 rbp
+%define tmp r14
+
+%define lane r8
+%define icv r9
+%define p2 r9
+
+%define last_len r10
+
+%define lane_data r12
+%define p r13
+
+%define unused_lanes rbx
+%endif
+
+; STACK_SPACE (STACK_size here: 8 + 1 qwords = 72 bytes) needs to be an odd multiple of 8
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15, plus rsi and rdi when LINUX is not defined
+_rsp_save: resq 1 ; rsp as it was on entry (reloaded just before ret)
+endstruc
+
+; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job (stored into the lane taken from the unused-lane list)
+MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal)
+SUBMIT_JOB_AES_XCBC: ; returns in rax a completed job, or NULL via return_null
+
+ mov rax, rsp ; remember the incoming rsp; it is stored in _rsp_save below
+ sub rsp, STACK_size ; make room for the STACK frame
+ and rsp, -16 ; round the frame address down to a 16-byte boundary
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF ; lane = lowest nibble of the unused-lane list
+ shr unused_lanes, 4 ; remove that nibble from the list
+ imul lane_data, lane, _XCBC_LANE_DATA_size
+ lea lane_data, [state + _aes_xcbc_ldata + lane_data] ; lane_data = address of this lane's entry in _aes_xcbc_ldata
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov [state + _aes_xcbc_unused_lanes], unused_lanes
+ mov [lane_data + _xcbc_job_in_lane], job
+ mov dword [lane_data + _xcbc_final_done], 0 ; final block not processed yet
+ mov tmp, [job + _k1_expanded]
+ mov [state + _aes_xcbc_args_keys + lane*8], tmp
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes] ; p = src + hash start offset
+
+ mov last_len, len
+
+ cmp len, 16
+ jle small_buffer ; NOTE(review): jle is a signed compare on a byte length - confirm lengths never have bit 63 set
+
+ mov [state + _aes_xcbc_args_in + lane*8], p
+ add p, len ; set point to end of data
+
+ and last_len, 15 ; Check lsbs of msg len
+ jnz slow_copy ; if not 16B mult, do slow copy
+
+fast_copy:
+ vmovdqu xmm0, [p - 16] ; load last block M[n]
+ mov tmp, [job + _k2] ; load K2 address
+ vmovdqu xmm1, [tmp] ; load K2
+ vpxor xmm0, xmm0, xmm1 ; M[n] XOR K2
+ vmovdqa [lane_data + _xcbc_final_block], xmm0
+ sub len, 16 ; take last block off length
+end_fast_copy:
+ vpxor xmm0, xmm0, xmm0
+ shl lane, 4 ; multiply by 16
+ vmovdqa [state + _aes_xcbc_args_ICV + lane], xmm0 ; lane's ICV starts as all zeros
+
+ vmovdqa xmm0, [state + _aes_xcbc_lens]
+ XVPINSRW xmm0, xmm1, tmp, lane, len, no_scale ; macro not defined in this file; presumably writes len into this lane's 16-bit slot (lane is already x16 here) - confirm
+ vmovdqa [state + _aes_xcbc_lens], xmm0
+
+ cmp unused_lanes, 0xf ; only a list equal to 0xF goes on to processing (presumably: no free lane left)
+ jne return_null ; otherwise return NULL without processing anything
+
+start_loop:
+ ; Find min length
+ vphminposuw xmm1, xmm0 ; xmm0 holds the lane lengths on every path that reaches here
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0 ; minimum length is already 0: skip the MAC call
+
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1 ; subtract the minimum from every lane's length
+ vmovdqa [state + _aes_xcbc_lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call AES_XCBC_X8
+ ; state and idx are intact
+
+len_is_0:
+ ; lane "idx" has reached length 0: either queue its final block or finish the job
+ imul lane_data, idx, _XCBC_LANE_DATA_size
+ lea lane_data, [state + _aes_xcbc_ldata + lane_data] ; lane_data = address of lane idx's entry in _aes_xcbc_ldata
+ cmp dword [lane_data + _xcbc_final_done], 0
+ jne end_loop ; final_done already set: the job is finished
+
+ mov dword [lane_data + _xcbc_final_done], 1
+
+ vmovdqa xmm0, [state + _aes_xcbc_lens]
+ XVPINSRW xmm0, xmm1, tmp, idx, 16, scale_x16 ; presumably sets lane idx's length word to 16 (one more block) - confirm against the macro
+ vmovdqa [state + _aes_xcbc_lens], xmm0
+
+ lea tmp, [lane_data + _xcbc_final_block]
+ mov [state + _aes_xcbc_args_in + 8*idx], tmp ; next input for lane idx is its final_block buffer
+ jmp start_loop
+
+end_loop:
+ ; process completed job "idx"
+ mov job_rax, [lane_data + _xcbc_job_in_lane] ; return value: the job that was in lane idx
+ mov icv, [job_rax + _auth_tag_output] ; icv = the job's tag output pointer
+ mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+ mov qword [lane_data + _xcbc_job_in_lane], 0 ; lane idx no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4 ; open a nibble at the bottom of the unused-lane list ...
+ or unused_lanes, idx ; ... and record lane idx there
+ shl idx, 4 ; multiply by 16
+ mov [state + _aes_xcbc_unused_lanes], unused_lanes
+
+ ; copy 12 bytes
+ vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx]
+ vmovq [icv], xmm0 ; bytes 0..7
+ vpextrd [icv + 8], xmm0, 2 ; bytes 8..11
+
+%ifdef SAFE_DATA
+ ;; Clear ICV
+ vpxor xmm0, xmm0
+ vmovdqa [state + _aes_xcbc_args_ICV + idx], xmm0
+
+ ;; Clear final block (32 bytes)
+ vmovdqa [lane_data + _xcbc_final_block], xmm0
+ vmovdqa [lane_data + _xcbc_final_block + 16], xmm0
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+small_buffer:
+ ; For buffers <= 16 Bytes
+ ; The input data is set to final block
+ lea tmp, [lane_data + _xcbc_final_block] ; final block
+ mov [state + _aes_xcbc_args_in + lane*8], tmp
+ add p, len ; set point to end of data
+ cmp len, 16
+ je fast_copy ; exactly one full block: take the K2 path
+
+slow_copy:
+ and len, ~15 ; take final block off len
+ sub p, last_len ; adjust data pointer
+ lea p2, [lane_data + _xcbc_final_block + 16] ; upper part of final
+ sub p2, last_len ; adjust data pointer backwards
+ memcpy_avx_16_1 p2, p, last_len, tmp, tmp2 ; macro not defined in this file; presumably copies last_len bytes from p to p2 using tmp/tmp2 as scratch - confirm
+ vmovdqa xmm0, [rel x80] ; fill reg with padding
+ vmovdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding
+ vmovdqu xmm0, [p2] ; load final block to process
+ mov tmp, [job + _k3] ; load K3 address
+ vmovdqu xmm1, [tmp] ; load K3
+ vpxor xmm0, xmm0, xmm1 ; M[n] XOR K3
+ vmovdqu [lane_data + _xcbc_final_block], xmm0 ; write final block
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax ; rax = NULL
+ jmp return
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c b/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c
new file mode 100644
index 000000000..29cf2a308
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c
@@ -0,0 +1,733 @@
+/*******************************************************************************
+ Copyright (c) 2012-2018, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
+
+#include "intel-ipsec-mb.h"
+#include "include/kasumi_internal.h"
+#include "include/zuc_internal.h"
+#include "include/snow3g.h"
+
+#include "save_xmms.h"
+#include "asm.h"
+#include "des.h"
+#include "cpu_feature.h"
+#include "noaesni.h"
+
+JOB_AES_HMAC *submit_job_aes128_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes128_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes192_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes192_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes256_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes256_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cntr_avx(JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *submit_job_aes_cntr_bit_avx(JOB_AES_HMAC *job);
+
+#define SAVE_XMMS save_xmms_avx
+#define RESTORE_XMMS restore_xmms_avx
+
+#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_avx
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx
+#define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_avx
+#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_avx
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx
+#define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_avx
+#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_avx
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx
+#define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_avx
+#define SUBMIT_JOB_AES_ECB_128_ENC submit_job_aes_ecb_128_enc_avx
+#define SUBMIT_JOB_AES_ECB_128_DEC submit_job_aes_ecb_128_dec_avx
+#define SUBMIT_JOB_AES_ECB_192_ENC submit_job_aes_ecb_192_enc_avx
+#define SUBMIT_JOB_AES_ECB_192_DEC submit_job_aes_ecb_192_dec_avx
+#define SUBMIT_JOB_AES_ECB_256_ENC submit_job_aes_ecb_256_enc_avx
+#define SUBMIT_JOB_AES_ECB_256_DEC submit_job_aes_ecb_256_dec_avx
+
+#define SUBMIT_JOB_AES_CNTR submit_job_aes_cntr_avx
+#define SUBMIT_JOB_AES_CNTR_BIT submit_job_aes_cntr_bit_avx
+
+#define AES_CBC_DEC_128 aes_cbc_dec_128_avx
+#define AES_CBC_DEC_192 aes_cbc_dec_192_avx
+#define AES_CBC_DEC_256 aes_cbc_dec_256_avx
+
+#define AES_CNTR_128 aes_cntr_128_avx
+#define AES_CNTR_192 aes_cntr_192_avx
+#define AES_CNTR_256 aes_cntr_256_avx
+
+#define AES_CNTR_CCM_128 aes_cntr_ccm_128_avx
+
+#define AES_ECB_ENC_128 aes_ecb_enc_128_avx
+#define AES_ECB_ENC_192 aes_ecb_enc_192_avx
+#define AES_ECB_ENC_256 aes_ecb_enc_256_avx
+#define AES_ECB_DEC_128 aes_ecb_dec_128_avx
+#define AES_ECB_DEC_192 aes_ecb_dec_192_avx
+#define AES_ECB_DEC_256 aes_ecb_dec_256_avx
+
+#define SUBMIT_JOB_PON_ENC submit_job_pon_enc_avx
+#define SUBMIT_JOB_PON_DEC submit_job_pon_dec_avx
+#define SUBMIT_JOB_PON_ENC_NO_CTR submit_job_pon_enc_no_ctr_avx
+#define SUBMIT_JOB_PON_DEC_NO_CTR submit_job_pon_dec_no_ctr_avx
+
+#ifndef NO_GCM
+#define AES_GCM_DEC_128 aes_gcm_dec_128_avx_gen2
+#define AES_GCM_ENC_128 aes_gcm_enc_128_avx_gen2
+#define AES_GCM_DEC_192 aes_gcm_dec_192_avx_gen2
+#define AES_GCM_ENC_192 aes_gcm_enc_192_avx_gen2
+#define AES_GCM_DEC_256 aes_gcm_dec_256_avx_gen2
+#define AES_GCM_ENC_256 aes_gcm_enc_256_avx_gen2
+
+#define SUBMIT_JOB_AES_GCM_DEC submit_job_aes_gcm_dec_avx
+#define FLUSH_JOB_AES_GCM_DEC flush_job_aes_gcm_dec_avx
+#define SUBMIT_JOB_AES_GCM_ENC submit_job_aes_gcm_enc_avx
+#define FLUSH_JOB_AES_GCM_ENC flush_job_aes_gcm_enc_avx
+#endif
+
+#define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx
+#define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx
+
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx
+#define QUEUE_SIZE queue_size_avx
+
+#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_AVX
+#define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_AVX
+#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_AVX
+#define FLUSH_JOB_AES_DEC FLUSH_JOB_AES_DEC_AVX
+
+
+
+JOB_AES_HMAC *submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_224_avx(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_224_avx(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_256_avx(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_256_avx(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_384_avx(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_384_avx(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_512_avx(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_512_avx(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state);
+
+#define SUBMIT_JOB_HMAC submit_job_hmac_avx
+#define FLUSH_JOB_HMAC flush_job_hmac_avx
+#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_avx
+#define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_avx
+#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_avx
+#define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_avx
+#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_avx
+#define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_avx
+#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_avx
+#define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_avx
+#define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_avx
+#define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_avx
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB submit_job_avx
+#define FLUSH_JOB flush_job_avx
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx
+#define GET_NEXT_JOB get_next_job_avx
+#define GET_COMPLETED_JOB get_completed_job_avx
+
+/* ====================================================================== */
+
+
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AVX
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AVX
+
+/* ====================================================================== */
+
+#define AES_CFB_128_ONE aes_cfb_128_one_avx
+
+void aes128_cbc_mac_x8(AES_ARGS *args, uint64_t len);
+
+#define AES128_CBC_MAC aes128_cbc_mac_x8
+
+#define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_avx
+#define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_avx
+
+#define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_avx
+#define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_avx
+
+/* ====================================================================== */
+
+/*
+ * GCM submit / flush API for AVX arch
+ */
+#ifndef NO_GCM
+/*
+ * Run the AES-GCM decrypt routine for one job, synchronously.
+ *
+ * The routine is picked from the job's key length: 16 -> 128-bit,
+ * 24 -> 192-bit, every other value is handled as a 256-bit key.
+ * "state" is not used. The job status is overwritten with STS_COMPLETED
+ * and the same job pointer is returned.
+ */
+static JOB_AES_HMAC *
+submit_job_aes_gcm_dec_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+
+        (void) state;
+
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_GCM_DEC_128(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        case 24:
+                AES_GCM_DEC_192(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        default: /* assume 32 bytes */
+                AES_GCM_DEC_256(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        }
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * Flush handler for GCM decrypt: always returns NULL and ignores both
+ * arguments. (The GCM decrypt submit routine in this file marks the job
+ * completed before it returns.)
+ */
+static JOB_AES_HMAC *
+flush_job_aes_gcm_dec_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        (void) job;
+        (void) state;
+
+        return NULL;
+}
+
+/*
+ * Run the AES-GCM encrypt routine for one job, synchronously.
+ *
+ * The routine is picked from the job's key length: 16 -> 128-bit,
+ * 24 -> 192-bit, every other value is handled as a 256-bit key.
+ * "state" is not used. The job status is overwritten with STS_COMPLETED
+ * and the same job pointer is returned.
+ */
+static JOB_AES_HMAC *
+submit_job_aes_gcm_enc_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+
+        (void) state;
+
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_GCM_ENC_128(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        case 24:
+                AES_GCM_ENC_192(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        default: /* assume 32 bytes */
+                AES_GCM_ENC_256(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        }
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * Flush handler for GCM encrypt: always returns NULL and ignores both
+ * arguments. (The GCM encrypt submit routine in this file marks the job
+ * completed before it returns.)
+ */
+static JOB_AES_HMAC *
+flush_job_aes_gcm_enc_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        (void) job;
+        (void) state;
+
+        return NULL;
+}
+#endif /* NO_GCM */
+
+/* ====================================================================== */
+
+/*
+ * Process one AES counter-mode job synchronously.
+ *
+ * Dispatches on the key length (16 -> 128-bit, 24 -> 192-bit, anything
+ * else is handled as 256-bit), then ORs STS_COMPLETED_AES into the job
+ * status and returns the same job pointer.
+ */
+IMB_DLL_LOCAL JOB_AES_HMAC *
+submit_job_aes_cntr_avx(JOB_AES_HMAC *job)
+{
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_CNTR_128(job->src + job->cipher_start_src_offset_in_bytes,
+                             job->iv,
+                             job->aes_enc_key_expanded,
+                             job->dst,
+                             job->msg_len_to_cipher_in_bytes,
+                             job->iv_len_in_bytes);
+                break;
+        case 24:
+                AES_CNTR_192(job->src + job->cipher_start_src_offset_in_bytes,
+                             job->iv,
+                             job->aes_enc_key_expanded,
+                             job->dst,
+                             job->msg_len_to_cipher_in_bytes,
+                             job->iv_len_in_bytes);
+                break;
+        default: /* assume 32 bytes */
+                AES_CNTR_256(job->src + job->cipher_start_src_offset_in_bytes,
+                             job->iv,
+                             job->aes_enc_key_expanded,
+                             job->dst,
+                             job->msg_len_to_cipher_in_bytes,
+                             job->iv_len_in_bytes);
+                break;
+        }
+
+        job->status |= STS_COMPLETED_AES;
+        return job;
+}
+
+/*
+ * Process one bit-length AES counter-mode job synchronously.
+ *
+ * Same dispatch as the byte-length variant, but the message length passed
+ * down is job->msg_len_to_cipher_in_bits. Key length 16 -> 128-bit,
+ * 24 -> 192-bit, anything else is handled as 256-bit. STS_COMPLETED_AES is
+ * ORed into the job status and the same job pointer is returned.
+ */
+IMB_DLL_LOCAL JOB_AES_HMAC *
+submit_job_aes_cntr_bit_avx(JOB_AES_HMAC *job)
+{
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                aes_cntr_bit_128_avx(job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->iv,
+                                     job->aes_enc_key_expanded,
+                                     job->dst,
+                                     job->msg_len_to_cipher_in_bits,
+                                     job->iv_len_in_bytes);
+                break;
+        case 24:
+                aes_cntr_bit_192_avx(job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->iv,
+                                     job->aes_enc_key_expanded,
+                                     job->dst,
+                                     job->msg_len_to_cipher_in_bits,
+                                     job->iv_len_in_bytes);
+                break;
+        default: /* assume 32 bytes */
+                aes_cntr_bit_256_avx(job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->iv,
+                                     job->aes_enc_key_expanded,
+                                     job->dst,
+                                     job->msg_len_to_cipher_in_bits,
+                                     job->iv_len_in_bytes);
+                break;
+        }
+
+        job->status |= STS_COMPLETED_AES;
+        return job;
+}
+
+/*
+ * Initialise a multi-buffer manager for the AVX code path.
+ *
+ * Stores the adjusted CPU feature flags in state->features, clears the
+ * per-algorithm out-of-order (OOO) lane bookkeeping held in "state",
+ * pre-fills the constant bytes of the HMAC extra/outer blocks and of the
+ * XCBC final block, and installs the AVX implementations in the
+ * function-pointer table. If the AESNI feature bit is not set, the whole
+ * initialisation is delegated to init_mb_mgr_sse_no_aesni() instead.
+ */
+void
+init_mb_mgr_avx(MB_MGR *state)
+{
+ unsigned int j;
+ uint8_t *p;
+ size_t size;
+
+ state->features = cpu_feature_adjust(state->flags,
+ cpu_feature_detect());
+
+ /* No AES-NI reported: hand over to the SSE no-AESNI init and stop here */
+ if (!(state->features & IMB_FEATURE_AESNI)) {
+ init_mb_mgr_sse_no_aesni(state);
+ return;
+ }
+
+ /* Init AES out-of-order fields */
+ /* lens is first filled with 0xFF bytes and then its first 8 entries are
+ * zeroed, so any entries past index 7 (if the array is longer) stay
+ * all-ones.
+ * NOTE(review): unused_lanes = 0xF76543210 looks like a nibble-packed
+ * list of free lane indices 0..7 ending in 0xF - confirm against the
+ * AES submit/flush code.
+ */
+ memset(state->aes128_ooo.lens, 0xFF,
+ sizeof(state->aes128_ooo.lens));
+ memset(&state->aes128_ooo.lens[0], 0,
+ sizeof(state->aes128_ooo.lens[0]) * 8);
+ memset(state->aes128_ooo.job_in_lane, 0,
+ sizeof(state->aes128_ooo.job_in_lane));
+ state->aes128_ooo.unused_lanes = 0xF76543210;
+ state->aes128_ooo.num_lanes_inuse = 0;
+
+ memset(state->aes192_ooo.lens, 0xFF,
+ sizeof(state->aes192_ooo.lens));
+ memset(&state->aes192_ooo.lens[0], 0,
+ sizeof(state->aes192_ooo.lens[0]) * 8);
+ memset(state->aes192_ooo.job_in_lane, 0,
+ sizeof(state->aes192_ooo.job_in_lane));
+ state->aes192_ooo.unused_lanes = 0xF76543210;
+ state->aes192_ooo.num_lanes_inuse = 0;
+
+ memset(&state->aes256_ooo.lens, 0xFF,
+ sizeof(state->aes256_ooo.lens));
+ memset(&state->aes256_ooo.lens[0], 0,
+ sizeof(state->aes256_ooo.lens[0]) * 8);
+ memset(state->aes256_ooo.job_in_lane, 0,
+ sizeof(state->aes256_ooo.job_in_lane));
+ state->aes256_ooo.unused_lanes = 0xF76543210;
+ state->aes256_ooo.num_lanes_inuse = 0;
+
+ /* DOCSIS SEC BPI (AES CBC + AES CFB for partial block)
+ * uses same settings as AES128 CBC.
+ */
+ memset(state->docsis_sec_ooo.lens, 0xFF,
+ sizeof(state->docsis_sec_ooo.lens));
+ memset(&state->docsis_sec_ooo.lens[0], 0,
+ sizeof(state->docsis_sec_ooo.lens[0]) * 8);
+ memset(state->docsis_sec_ooo.job_in_lane, 0,
+ sizeof(state->docsis_sec_ooo.job_in_lane));
+ state->docsis_sec_ooo.unused_lanes = 0xF76543210;
+ state->docsis_sec_ooo.num_lanes_inuse = 0;
+
+
+ /* Init HMAC/SHA1 out-of-order fields */
+ /* NOTE(review): entries 4-7 are set to 0xFFFF, presumably so that a
+ * minimum-length search over all 8 entries never selects them, and
+ * unused_lanes = 0xFF03020100 looks like a byte-packed free-lane list
+ * (0,1,2,3) terminated by 0xFF - confirm against the HMAC submit/flush
+ * code.
+ */
+ state->hmac_sha_1_ooo.lens[0] = 0;
+ state->hmac_sha_1_ooo.lens[1] = 0;
+ state->hmac_sha_1_ooo.lens[2] = 0;
+ state->hmac_sha_1_ooo.lens[3] = 0;
+ state->hmac_sha_1_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_1_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_1_ooo.unused_lanes = 0xFF03020100;
+ for (j = 0; j < AVX_NUM_SHA1_LANES; j++) {
+ state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL;
+ /* 0x80 at offset 64, followed by 64+7 zero bytes */
+ state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80;
+ memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65,
+ 0x00,
+ 64+7);
+ p = state->hmac_sha_1_ooo.ldata[j].outer_block;
+ /* zero everything between the 0x80 marker and the 2 length bytes */
+ memset(p + 5*4 + 1,
+ 0x00,
+ 64 - 5*4 - 1 - 2);
+ p[5*4] = 0x80; /* marker right after 5 digest words */
+ p[64-2] = 0x02; /* bit length 0x02A0 = 8 * (64 + 20), MSB first */
+ p[64-1] = 0xA0;
+ }
+ /* Init HMAC/SHA224 out-of-order fields */
+ state->hmac_sha_224_ooo.lens[0] = 0;
+ state->hmac_sha_224_ooo.lens[1] = 0;
+ state->hmac_sha_224_ooo.lens[2] = 0;
+ state->hmac_sha_224_ooo.lens[3] = 0;
+ state->hmac_sha_224_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_224_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_224_ooo.unused_lanes = 0xFF03020100;
+ for (j = 0; j < AVX_NUM_SHA256_LANES; j++) {
+ state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL;
+
+ /* whole extra_block zeroed, then 0x80 placed at offset 64 */
+ p = state->hmac_sha_224_ooo.ldata[j].extra_block;
+ size = sizeof(state->hmac_sha_224_ooo.ldata[j].extra_block);
+ memset (p, 0x00, size);
+ p[64] = 0x80;
+
+ p = state->hmac_sha_224_ooo.ldata[j].outer_block;
+ size = sizeof(state->hmac_sha_224_ooo.ldata[j].outer_block);
+ memset(p, 0x00, size);
+ p[7 * 4] = 0x80; /* digest 7 words long */
+ p[64 - 2] = 0x02; /* bit length 0x02E0 = 8 * (64 + 28), MSB first */
+ p[64 - 1] = 0xE0;
+ }
+
+ /* Init HMAC/SHA256 out-of-order fields */
+ state->hmac_sha_256_ooo.lens[0] = 0;
+ state->hmac_sha_256_ooo.lens[1] = 0;
+ state->hmac_sha_256_ooo.lens[2] = 0;
+ state->hmac_sha_256_ooo.lens[3] = 0;
+ state->hmac_sha_256_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_256_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_256_ooo.unused_lanes = 0xFF03020100;
+ for (j = 0; j < AVX_NUM_SHA256_LANES; j++) {
+ state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL;
+ state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80;
+ memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65,
+ 0x00,
+ 64+7);
+ p = state->hmac_sha_256_ooo.ldata[j].outer_block;
+ memset(p + 8*4 + 1,
+ 0x00,
+ 64 - 8*4 - 1 - 2);
+ p[8 * 4] = 0x80; /* 8 digest words */
+ p[64 - 2] = 0x03; /* bit length 0x0300 = 8 * (64 + 32), MSB first */
+ p[64 - 1] = 0x00;
+ }
+
+
+ /* Init HMAC/SHA384 out-of-order fields */
+ /* only entries 0-1 start at 0 here; 2-7 are set to 0xFFFF */
+ state->hmac_sha_384_ooo.lens[0] = 0;
+ state->hmac_sha_384_ooo.lens[1] = 0;
+ state->hmac_sha_384_ooo.lens[2] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[3] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_384_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_384_ooo.unused_lanes = 0xFF0100;
+ for (j = 0; j < AVX_NUM_SHA512_LANES; j++) {
+ MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo;
+
+ ctx->ldata[j].job_in_lane = NULL;
+ ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80;
+ memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1),
+ 0x00, SHA_384_BLOCK_SIZE + 7);
+
+ p = ctx->ldata[j].outer_block;
+ memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+ /* special end point because this length is constant */
+ SHA_384_BLOCK_SIZE -
+ SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2);
+ /* mark the end */
+ p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80;
+ /* hmac outer block length always of fixed size,
+ * it is OKey length, a whole message block length, 1024 bits,
+ * with padding plus the length of the inner digest,
+ * which is 384 bits, 1408 bits == 0x0580.
+ * The input message block needs to be converted to big endian
+ * within the sha implementation before use.
+ */
+ p[SHA_384_BLOCK_SIZE - 2] = 0x05;
+ p[SHA_384_BLOCK_SIZE - 1] = 0x80;
+ }
+
+ /* Init HMAC/SHA512 out-of-order fields */
+ state->hmac_sha_512_ooo.lens[0] = 0;
+ state->hmac_sha_512_ooo.lens[1] = 0;
+ state->hmac_sha_512_ooo.lens[2] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[3] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[4] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[5] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[6] = 0xFFFF;
+ state->hmac_sha_512_ooo.lens[7] = 0xFFFF;
+ state->hmac_sha_512_ooo.unused_lanes = 0xFF0100;
+ for (j = 0; j < AVX_NUM_SHA512_LANES; j++) {
+ MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo;
+
+ ctx->ldata[j].job_in_lane = NULL;
+ ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80;
+ memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1),
+ 0x00, SHA_512_BLOCK_SIZE + 7);
+ p = ctx->ldata[j].outer_block;
+ memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+ /* special end point because this length is constant */
+ SHA_512_BLOCK_SIZE -
+ SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2);
+ /* mark the end */
+ p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80;
+ /*
+ * hmac outer block length always of fixed size,
+ * it is OKey length, a whole message block length, 1024 bits,
+ * with padding plus the length of the inner digest,
+ * which is 512 bits, 1536 bits == 0x600.
+ * The input message block needs to be converted to big endian
+ * within the sha implementation before use.
+ */
+ p[SHA_512_BLOCK_SIZE - 2] = 0x06;
+ p[SHA_512_BLOCK_SIZE - 1] = 0x00;
+ }
+
+
+ /* Init HMAC/MD5 out-of-order fields */
+ /* entries 0-7 start at 0; entries 8-15 are set to 0xFFFF */
+ state->hmac_md5_ooo.lens[0] = 0;
+ state->hmac_md5_ooo.lens[1] = 0;
+ state->hmac_md5_ooo.lens[2] = 0;
+ state->hmac_md5_ooo.lens[3] = 0;
+ state->hmac_md5_ooo.lens[4] = 0;
+ state->hmac_md5_ooo.lens[5] = 0;
+ state->hmac_md5_ooo.lens[6] = 0;
+ state->hmac_md5_ooo.lens[7] = 0;
+ state->hmac_md5_ooo.lens[8] = 0xFFFF;
+ state->hmac_md5_ooo.lens[9] = 0xFFFF;
+ state->hmac_md5_ooo.lens[10] = 0xFFFF;
+ state->hmac_md5_ooo.lens[11] = 0xFFFF;
+ state->hmac_md5_ooo.lens[12] = 0xFFFF;
+ state->hmac_md5_ooo.lens[13] = 0xFFFF;
+ state->hmac_md5_ooo.lens[14] = 0xFFFF;
+ state->hmac_md5_ooo.lens[15] = 0xFFFF;
+ state->hmac_md5_ooo.unused_lanes = 0xF76543210;
+ for (j = 0; j < AVX_NUM_MD5_LANES; j++) {
+ state->hmac_md5_ooo.ldata[j].job_in_lane = NULL;
+
+ p = state->hmac_md5_ooo.ldata[j].extra_block;
+ size = sizeof(state->hmac_md5_ooo.ldata[j].extra_block);
+ memset (p, 0x00, size);
+ p[64] = 0x80;
+
+ p = state->hmac_md5_ooo.ldata[j].outer_block;
+ size = sizeof(state->hmac_md5_ooo.ldata[j].outer_block);
+ memset(p, 0x00, size);
+ p[4 * 4] = 0x80; /* marker right after 4 digest words */
+ /* bit length 0x0280 = 8 * (64 + 16), least significant byte first:
+ * 0x80 at offset 56, 0x02 at offset 57
+ */
+ p[64 - 7] = 0x02;
+ p[64 - 8] = 0x80;
+ }
+
+ /* Init AES/XCBC OOO fields */
+ state->aes_xcbc_ooo.lens[0] = 0;
+ state->aes_xcbc_ooo.lens[1] = 0;
+ state->aes_xcbc_ooo.lens[2] = 0;
+ state->aes_xcbc_ooo.lens[3] = 0;
+ state->aes_xcbc_ooo.lens[4] = 0;
+ state->aes_xcbc_ooo.lens[5] = 0;
+ state->aes_xcbc_ooo.lens[6] = 0;
+ state->aes_xcbc_ooo.lens[7] = 0;
+ state->aes_xcbc_ooo.unused_lanes = 0xF76543210;
+ for (j = 0; j < 8; j++) {
+ state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL;
+ /* bytes 16..31 of final_block: 0x80 followed by 15 zero bytes */
+ state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80;
+ memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15);
+ }
+
+ /* Init AES-CCM auth out-of-order fields */
+ for (j = 0; j < 8; j++) {
+ state->aes_ccm_ooo.init_done[j] = 0;
+ state->aes_ccm_ooo.lens[j] = 0;
+ state->aes_ccm_ooo.job_in_lane[j] = NULL;
+ }
+ state->aes_ccm_ooo.unused_lanes = 0xF76543210;
+
+ /* Init AES-CMAC auth out-of-order fields */
+ for (j = 0; j < 8; j++) {
+ state->aes_cmac_ooo.init_done[j] = 0;
+ state->aes_cmac_ooo.lens[j] = 0;
+ state->aes_cmac_ooo.job_in_lane[j] = NULL;
+ }
+ state->aes_cmac_ooo.unused_lanes = 0xF76543210;
+
+ /* Init "in order" components */
+ state->next_job = 0;
+ state->earliest_job = -1;
+
+ /* set AVX handlers */
+ state->get_next_job = get_next_job_avx;
+ state->submit_job = submit_job_avx;
+ state->submit_job_nocheck = submit_job_nocheck_avx;
+ state->get_completed_job = get_completed_job_avx;
+ state->flush_job = flush_job_avx;
+ state->queue_size = queue_size_avx;
+ state->keyexp_128 = aes_keyexp_128_avx;
+ state->keyexp_192 = aes_keyexp_192_avx;
+ state->keyexp_256 = aes_keyexp_256_avx;
+ state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_avx;
+ state->xcbc_keyexp = aes_xcbc_expand_key_avx;
+ state->des_key_sched = des_key_schedule;
+ state->sha1_one_block = sha1_one_block_avx;
+ state->sha1 = sha1_avx;
+ state->sha224_one_block = sha224_one_block_avx;
+ state->sha224 = sha224_avx;
+ state->sha256_one_block = sha256_one_block_avx;
+ state->sha256 = sha256_avx;
+ state->sha384_one_block = sha384_one_block_avx;
+ state->sha384 = sha384_avx;
+ state->sha512_one_block = sha512_one_block_avx;
+ state->sha512 = sha512_avx;
+ state->md5_one_block = md5_one_block_avx;
+ state->aes128_cfb_one = aes_cfb_128_one_avx;
+
+ state->eea3_1_buffer = zuc_eea3_1_buffer_avx;
+ state->eea3_4_buffer = zuc_eea3_4_buffer_avx;
+ state->eea3_n_buffer = zuc_eea3_n_buffer_avx;
+ state->eia3_1_buffer = zuc_eia3_1_buffer_avx;
+
+ state->f8_1_buffer = kasumi_f8_1_buffer_avx;
+ state->f8_1_buffer_bit = kasumi_f8_1_buffer_bit_avx;
+ state->f8_2_buffer = kasumi_f8_2_buffer_avx;
+ state->f8_3_buffer = kasumi_f8_3_buffer_avx;
+ state->f8_4_buffer = kasumi_f8_4_buffer_avx;
+ state->f8_n_buffer = kasumi_f8_n_buffer_avx;
+ state->f9_1_buffer = kasumi_f9_1_buffer_avx;
+ state->f9_1_buffer_user = kasumi_f9_1_buffer_user_avx;
+ state->kasumi_init_f8_key_sched = kasumi_init_f8_key_sched_avx;
+ state->kasumi_init_f9_key_sched = kasumi_init_f9_key_sched_avx;
+ state->kasumi_key_sched_size = kasumi_key_sched_size_avx;
+
+ state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_avx;
+ state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_avx;
+ state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_avx;
+ state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_avx;
+ state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_avx;
+ state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_avx;
+ state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_avx;
+ state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_avx;
+ state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_avx;
+ state->snow3g_init_key_sched = snow3g_init_key_sched_avx;
+ state->snow3g_key_sched_size = snow3g_key_sched_size_avx;
+
+ /* GCM entry points are only installed when GCM support is compiled in */
+#ifndef NO_GCM
+ state->gcm128_enc = aes_gcm_enc_128_avx_gen2;
+ state->gcm192_enc = aes_gcm_enc_192_avx_gen2;
+ state->gcm256_enc = aes_gcm_enc_256_avx_gen2;
+ state->gcm128_dec = aes_gcm_dec_128_avx_gen2;
+ state->gcm192_dec = aes_gcm_dec_192_avx_gen2;
+ state->gcm256_dec = aes_gcm_dec_256_avx_gen2;
+ state->gcm128_init = aes_gcm_init_128_avx_gen2;
+ state->gcm192_init = aes_gcm_init_192_avx_gen2;
+ state->gcm256_init = aes_gcm_init_256_avx_gen2;
+ state->gcm128_enc_update = aes_gcm_enc_128_update_avx_gen2;
+ state->gcm192_enc_update = aes_gcm_enc_192_update_avx_gen2;
+ state->gcm256_enc_update = aes_gcm_enc_256_update_avx_gen2;
+ state->gcm128_dec_update = aes_gcm_dec_128_update_avx_gen2;
+ state->gcm192_dec_update = aes_gcm_dec_192_update_avx_gen2;
+ state->gcm256_dec_update = aes_gcm_dec_256_update_avx_gen2;
+ state->gcm128_enc_finalize = aes_gcm_enc_128_finalize_avx_gen2;
+ state->gcm192_enc_finalize = aes_gcm_enc_192_finalize_avx_gen2;
+ state->gcm256_enc_finalize = aes_gcm_enc_256_finalize_avx_gen2;
+ state->gcm128_dec_finalize = aes_gcm_dec_128_finalize_avx_gen2;
+ state->gcm192_dec_finalize = aes_gcm_dec_192_finalize_avx_gen2;
+ state->gcm256_dec_finalize = aes_gcm_dec_256_finalize_avx_gen2;
+ state->gcm128_precomp = aes_gcm_precomp_128_avx_gen2;
+ state->gcm192_precomp = aes_gcm_precomp_192_avx_gen2;
+ state->gcm256_precomp = aes_gcm_precomp_256_avx_gen2;
+ state->gcm128_pre = aes_gcm_pre_128_avx_gen2;
+ state->gcm192_pre = aes_gcm_pre_192_avx_gen2;
+ state->gcm256_pre = aes_gcm_pre_256_avx_gen2;
+#endif
+}
+
+#include "mb_mgr_code.h"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm
new file mode 100644
index 000000000..750a630aa
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm
@@ -0,0 +1,298 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha1_mult_avx
+
+section .data
+default rel
+
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+x80: ;ddq 0x00000000000000000000000000000080
+ dq 0x0000000000000080, 0x0000000000000000
+x00: ;ddq 0x00000000000000000000000000000000
+ dq 0x0000000000000000, 0x0000000000000000
+
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+
+section .text
+
+%if 1
+; arg1/arg2: rdi/rsi when LINUX is defined, rcx/rdx otherwise.
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; NOTE: the symbolic names below map onto only a few physical registers.
+; Names that resolve to the same register cannot hold live values at the
+; same time:
+;   arg1 : state, tmp3
+;   arg2 : job, len2, extra_blocks, p
+;   rbx  : unused_lanes, lane_data, tmp2
+;   rax  : job_rax, tmp1, size_offset, tmp, start_offset
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+; NOTE(review): presumably because idx is still used after the call to
+; sha1_mult_avx and must therefore live in a callee-saved register - confirm.
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+; same register as state: writing tmp3 overwrites the state pointer
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%endif
+
+; This routine clobbers rbx, rbp
+struc STACK
+_gpr_save: resq 2
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state)
+; arg 1 : rdi (Linux) / rcx (otherwise) : state
+MKGLOBAL(flush_job_hmac_avx,function,internal)
+flush_job_hmac_avx:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -16
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel one]
+ cmp qword [state + _ldata + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel two]
+ cmp qword [state + _ldata + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel three]
+
+copy_lane_data:
+ ; copy valid lane (idx) to empty lanes
+ vmovdqa xmm0, [state + _lens]
+ mov tmp, [state + _args_data_ptr + PTR_SZ*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr + PTR_SZ*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens], xmm0
+
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ vpshuflw xmm1, xmm1, 0
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mult_avx
+ ; state is intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+ ;; idx determines which column
+ ;; read off from consecutive rows
+ vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*4], DWORD(tmp)
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp2)
+
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret
+
+ ;; copy remaining 8 bytes to return 20 byte digest
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+ mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp4)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+
+ ;; Clear digest (20B), outer_block (20B) and extra_block (64B)
+ ;; of returned job and NULL jobs
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 0*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 1*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 2*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 3*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+ lea lane_data, [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size)]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 20 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov dword [lane_data + _outer_block + 16], 0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm
new file mode 100644
index 000000000..a53ad0843
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm
@@ -0,0 +1,321 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern md5_x4x2_avx
+
+section .data
+default rel
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100 - vpshufb control: copies source bytes 0-1 (word 0) into every word
+ dq 0x0100010001000100, 0x0100010001000100
+x80: ;ddq 0x00000000000000000000000000000080 (not referenced elsewhere in this file)
+ dq 0x0000000000000080, 0x0000000000000000
+x00: ;ddq 0x00000000000000000000000000000000 (not referenced elsewhere in this file)
+ dq 0x0000000000000000, 0x0000000000000000
+len_masks: ; eight 16-byte masks; mask I has only word I set to 0xFFFF and is indexed as len_masks + 16*I
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+one: dq 1 ; one..seven: qword constants used as memory operands of the cmovne lane-index selection
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp
+%define idx rbp
+
+; unused_lanes must be in rax-rdx; note that rbx is shared by unused_lanes, lane_data and tmp2
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+; rax is shared by job_rax, tmp1, size_offset, tmp and start_offset
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+; tmp3 is the same register as state (arg1)
+%define tmp3 arg1
+; extra_blocks and p share arg2 with job and len2
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define tmp5 r9
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs; _gpr_save holds the 8 registers the prologue stores, _rsp_save the caller's rsp
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+; APPEND(a,b) pastes two tokens together; it builds the per-iteration labels inside the %rep blocks
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state) - finishes one queued HMAC-MD5 job and returns it in rax (0 when bit 32+3 of unused_lanes is set)
+; arg 1 : state (arg1: rdi when LINUX is defined, rcx otherwise)
+MKGLOBAL(flush_job_hmac_md5_avx,function,internal)
+flush_job_hmac_md5_avx:
+
+ mov rax, rsp ; remember the incoming stack pointer
+ sub rsp, STACK_size
+ and rsp, -16 ; 16-byte align the local frame
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_md5]
+ bt unused_lanes, 32+3 ; CF = bit 35 of the free-lane list
+ jc return_null ; bit set: nothing to flush (presumably all 8 lanes are free - confirm against the list's initial value)
+
+ ; find a lane with a non-null job: the highest of lanes 1..7 that holds one, otherwise lane 0
+ xor idx, idx
+ cmp qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel one]
+ cmp qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel two]
+ cmp qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel three]
+ cmp qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel four]
+ cmp qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel five]
+ cmp qword [state + _ldata_md5 + 6 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel six]
+ cmp qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+ cmovne idx, [rel seven]
+
+copy_lane_data:
+ ; copy good lane (idx) to empty lanes
+ vmovdqa xmm0, [state + _lens_md5]
+ mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx] ; data pointer of the good lane
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 ; lanes that hold a job are skipped
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr_md5 + PTR_SZ*I], tmp ; empty lane I reads the good lane's data
+ vpor xmm0, xmm0, [rel len_masks + 16*I] ; force length word I to 0xFFFF so it is never the minimum
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_md5], xmm0
+
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0 ; the shortest lane has nothing left to hash
+
+ vpshufb xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1 ; subtract the min value from every length word
+ vmovdqa [state + _lens_md5], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data] ; lane_data = &state->ldata_md5[idx]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra blocks still have to be hashed
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; outer hash already done: the job is complete
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero the 8 bytes at extra_block[size_offset]
+ mov word [state + _lens_md5 + 2*idx], 1 ; one block to hash for this lane
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp ; next input for this lane is its outer_block
+
+ vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] ; gather the 4 digest words of lane idx, one per row
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+ vmovdqa [lane_data + _outer_block], xmm0 ; stored without byte swapping
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp] ; 16 bytes behind job->_auth_key_xor_opad become the lane's new digest
+ vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+ jmp copy_lane_data ; go round again to hash the outer block
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) ; lane length = number of extra blocks
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp ; lane now reads from extra_block + start_offset
+ mov dword [lane_data + _extra_blocks], 0 ; so this path is taken only once per job
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0 ; the lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_md5] ; overwrites lane_data (same register, rbx)
+ shl unused_lanes, 4
+ or unused_lanes, idx ; push idx onto the free-lane list (4 bits per entry)
+ mov [state + _unused_lanes_md5], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes (digest words 0..2, no byte swapping)
+ mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+; bswap DWORD(tmp2)
+; bswap DWORD(tmp4)
+; bswap DWORD(tmp3)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp5)
+
+ cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret ; a 12-byte tag was requested: nothing more to copy
+
+ ; copy the fourth dword to complete the 16-byte digest
+ mov DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
+ mov [p + 3*4], DWORD(tmp5)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0 ; xmm0 = 0, source for the wipes below
+
+ ;; Clear digest (16B), outer_block (16B) and extra_block (64B)
+ ;; of returned job and NULL jobs
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0 ; lanes that still hold a job are left untouched
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest (16 bytes)
+%assign J 0
+%rep 4
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*I + J*MD5_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+
+ lea lane_data, [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size)]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 16 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0] ; restore everything the prologue saved
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm
new file mode 100644
index 000000000..5e4627dca
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm
@@ -0,0 +1,355 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/memcpy.asm"
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+
+extern md5_x4x2_avx
+
+section .data
+default rel
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100 - vpshufb control: copies source bytes 0-1 (word 0) into every word
+ dq 0x0100010001000100, 0x0100010001000100
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp (last_len shares the same register)
+%define last_len rbp
+%define idx rbp
+; p and start_offset share r11
+%define p r11
+%define start_offset r11
+; unused_lanes and tmp4 share rbx
+%define unused_lanes rbx
+%define tmp4 rbx
+; job_rax and len share rax
+%define job_rax rax
+%define len rax
+; size_offset and tmp2 share reg3
+%define size_offset reg3
+%define tmp2 reg3
+; lane and tmp3 share reg4
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+; tmp and p2 share r9
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs; _gpr_save holds the 8 registers the prologue stores, _rsp_save the caller's rsp
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job) - queues job in a free lane; returns a completed job in rax, or 0 while free lanes remain
+; arg 1 : state (arg1: rdi when LINUX is defined, rcx otherwise)
+; arg 2 : job (arg2: rsi when LINUX is defined, rdx otherwise)
+MKGLOBAL(submit_job_hmac_md5_avx,function,internal)
+submit_job_hmac_md5_avx:
+
+ mov rax, rsp ; remember the incoming stack pointer
+ sub rsp, STACK_size
+ and rsp, -16 ; 16-byte align the local frame
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_md5]
+ mov lane, unused_lanes
+ and lane, 0xF ; lane = lowest 4-bit entry of the free-lane list
+ shr unused_lanes, 4 ; pop that entry
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data] ; lane_data = &state->ldata_md5[lane]
+ mov [state + _unused_lanes_md5], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+
+ ;; insert len into proper lane (p is only a scratch register for the macro here)
+ vmovdqa xmm0, [state + _lens_md5]
+ XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+ vmovdqa [state + _lens_md5], xmm0
+
+ mov last_len, len
+ and last_len, 63 ; last_len = len mod 64
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6 ; extra_blocks = ceil((last_len + 9) / 64)
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes] ; p = first byte to hash
+ mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p
+
+ cmp len, 64
+ jb copy_lt64 ; fewer than 64 bytes: copy only len bytes
+
+fast_copy:
+ add p, len ; p = one past the last message byte
+ vmovdqu xmm0, [p - 64 + 0*16] ; copy the final 64 message bytes into extra_block
+ vmovdqu xmm1, [p - 64 + 1*16]
+ vmovdqu xmm2, [p - 64 + 2*16]
+ vmovdqu xmm3, [p - 64 + 3*16]
+ vmovdqa [lane_data + _extra_block + 0*16], xmm0
+ vmovdqa [lane_data + _extra_block + 1*16], xmm1
+ vmovdqa [lane_data + _extra_block + 2*16], xmm2
+ vmovdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8 ; size_offset = 64*extra_blocks - last_len + 56
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len ; start_offset = 64 - last_len (overwrites p, same register)
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len] ; tmp = 8 * (64 + len), i.e. a bit count (presumably one 64-byte key block plus the message)
+; bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp ; stored without byte swapping
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp] ; 16 bytes behind job->_auth_key_xor_ipad become the lane's starting digest
+ vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+
+ test len, ~63 ; any bit above bit 5 set means len >= 64
+ jnz ge64_bytes
+
+lt64_bytes:
+ vmovdqa xmm0, [state + _lens_md5] ; no whole block in the message: the lane starts directly on the extra blocks
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+ vmovdqa [state + _lens_md5], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xf ; only the value 0xF left in the list means no free lane remains
+ jne return_null ; lanes still free: queue the job and return 0
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_md5]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0 ; the shortest lane has nothing left to hash
+
+ vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1 ; subtract the min value from every length word
+ vmovdqa [state + _lens_md5], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data] ; lane_data = &state->ldata_md5[idx]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra blocks still have to be hashed
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; outer hash already done: the job is complete
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero the 8 bytes at extra_block[size_offset]
+
+ vmovdqa xmm0, [state + _lens_md5] ; set this lane's length word to 1 block
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+ vmovdqa [state + _lens_md5], xmm0
+
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp ; next input for this lane is its outer_block
+
+ vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] ; gather the 4 digest words of lane idx, one per row
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+; vpshufb xmm0, [byteswap wrt rip]
+ vmovdqa [lane_data + _outer_block], xmm0 ; stored without byte swapping
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp] ; 16 bytes behind job->_auth_key_xor_opad become the lane's new digest
+ vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+ jmp start_loop ; go round again to hash the outer block
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+
+ vmovdqa xmm0, [state + _lens_md5] ; lane length = number of extra blocks
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+ vmovdqa [state + _lens_md5], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp ; lane now reads from extra_block + start_offset
+ mov dword [lane_data + _extra_blocks], 0 ; so this path is taken only once per job
+ jmp start_loop
+
+ align 16
+
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ ;; tmp4 (rbx, handed to the macro below) is the same register as unused_lanes; reload it before exiting
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len ; p2 = extra_block + 64 - len, so the copy ends at offset 64
+ memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes_md5]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes_md5]
+ mov qword [lane_data + _job_in_lane], 0 ; the lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx ; push idx onto the free-lane list (4 bits per entry)
+ mov [state + _unused_lanes_md5], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes (digest words 0..2, no byte swapping)
+ mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+
+ cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret ; a 12-byte tag was requested: nothing more to copy
+
+ ; copy the fourth dword to complete the 16-byte digest
+ mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
+ mov [p + 3*4], DWORD(tmp3)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (16B), outer_block (16B) and extra_block (64B) of returned job
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 0
+
+ vpxor xmm0, xmm0 ; xmm0 = 0, source for the wipes below
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 16 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0] ; restore everything the prologue saved
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm
new file mode 100644
index 000000000..416dfb869
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC flush_job_hmac_sha_224_avx
+%define SHA224
+
+%include "avx/mb_mgr_hmac_sha_256_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm
new file mode 100644
index 000000000..ad0721cd7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC submit_job_hmac_sha_224_avx
+%define SHA224
+
+%include "avx/mb_mgr_hmac_sha_256_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm
new file mode 100644
index 000000000..0d8b8e50e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm
@@ -0,0 +1,356 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha_256_mult_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 - vpshufb mask that reverses the bytes inside each 32-bit word
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+len_masks: ; entry I holds 0xFFFF in 16-bit word I; OR-ed into the lens vector for lanes that have no job
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+one: dq 1 ; one/two/three are memory operands for the cmovne lane selection (cmov has no immediate form)
+two: dq 2
+three: dq 3
+
+section .text
+
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_256_avx
+%endif
+; FUNC may be pre-defined by an including file; the name above is only the default.
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+; Register aliases: several names share one register (job/len2/extra_blocks/p are all arg2), so they must not be live at the same time.
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15 (presumably so that it survives the call to sha_256_mult_avx)
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%define tmp5 r9
+
+%define tmp6 r10
+
+%endif
+
+; This routine clobbers rbx, rbp; called routine also clobbers r12 (these are the three registers kept in _gpr_save)
+struc STACK
+_gpr_save: resq 3
+_rsp_save: resq 1
+endstruc
+; APPEND pastes two tokens together; it builds the per-lane labels skip_<I> and skip_clear_<I> inside the %rep loops.
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state)
+; arg 1 : rdi when LINUX is defined, rcx otherwise : state
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+ ; returns in rax the job of a lane whose HMAC has finished, or 0 via return_null
+ mov rax, rsp ; keep the caller's rsp; it is stored in _rsp_save below
+ sub rsp, STACK_size
+ and rsp, -16 ; 16-byte align the local frame
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ bt unused_lanes, 32+7 ; bit 39 set -> return NULL (presumably every lane is free; confirm against the unused_lanes encoding)
+ jc return_null
+
+ ; find a lane with a non-null job (idx defaults to 0; the highest-numbered occupied lane among 1..3 wins)
+ xor idx, idx
+ cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel one]
+ cmp qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel two]
+ cmp qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel three]
+
+copy_lane_data:
+ ; copy idx's data pointer to empty lanes and set their 16-bit lengths to 0xFFFF
+ vmovdqa xmm0, [state + _lens_sha256]
+ mov tmp, [state + _args_data_ptr_sha256 + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr_sha256 + 8*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_sha256], xmm0
+
+ vphminposuw xmm1, xmm0 ; word 0 = minimum length, word 1 = index of the lane holding it
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0 ; minimum is already 0: skip the hash call
+
+ vpshuflw xmm1, xmm1, 0 ; broadcast the minimum across the four low words
+ vpsubw xmm0, xmm0, xmm1 ; every lane's length -= minimum
+ vmovdqa [state + _lens_sha256], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha_256_mult_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"; lane_data = state + _ldata_sha256 + idx * lane-data size
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra blocks still pending for this lane
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; outer hash already done: the job is complete
+ ; otherwise fall through and set up the outer hash
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero the 8 bytes at extra_block + size_offset
+ mov word [state + _lens_sha256 + 2*idx], 1 ; this lane now has exactly one block to process
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp ; the lane's data pointer now targets outer_block
+ ; gather this lane's digest words (row stride SHA256_DIGEST_ROW_SIZE, 4 bytes per lane) and byte-swap them
+ vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ vpshufb xmm1, xmm1, [rel byteswap]
+ ; the byte-swapped digest becomes the first 32 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+ vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ mov dword [lane_data + _outer_block + 7*4], 0x80 ; presumably the padding marker right after the 28-byte digest
+%endif
+ ; reload the lane's digest column from the 32 bytes that job->_auth_key_xor_opad points to
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) ; lane length = number of extra blocks
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp ; data pointer = extra_block + start_offset
+ mov dword [lane_data + _extra_blocks], 0 ; the extra blocks are now scheduled
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0 ; the lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha256] ; rbx is reused here: it held lane_data until now
+ shl unused_lanes, 8
+ or unused_lanes, idx ; idx becomes the new low byte of unused_lanes
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+ jne copy_full_digest
+%endif
+
+ ;; copy 14 bytes for SHA224 / 16 bytes for SHA256
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp6)
+ bswap DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp6)
+%ifdef SHA224
+ mov [p + 3*4], WORD(tmp5) ; only 2 bytes of the fourth word: 12 + 2 = 14
+%else
+ mov [p + 3*4], DWORD(tmp5)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 28 bytes for SHA224 / 32 bytes for SHA256
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp6)
+ bswap DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp6)
+ mov [p + 3*4], DWORD(tmp5)
+ ; second half: words 4..6, plus word 7 unless SHA224 is defined
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp6)
+%ifndef SHA224
+ bswap DWORD(tmp5)
+%endif
+ mov [p + 4*4], DWORD(tmp2)
+ mov [p + 5*4], DWORD(tmp4)
+ mov [p + 6*4], DWORD(tmp6)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+
+ ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B)
+ ;; of returned job and NULL jobs (the returned lane's _job_in_lane was zeroed in end_loop, so the test below covers it)
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest (28 bytes for SHA-224, 32 bytes for SHA-256)
+%assign J 0
+%rep 7
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+ lea lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+%ifdef SHA224
+ mov qword [lane_data + _outer_block + 16], 0
+ mov dword [lane_data + _outer_block + 24], 0
+%else
+ vmovdqa [lane_data + _outer_block + 16], xmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm
new file mode 100644
index 000000000..738d88b94
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm
@@ -0,0 +1,428 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha_256_mult_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 - vpshufb mask that reverses the bytes inside each 32-bit word
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_256_avx
+%endif
+; FUNC may be pre-defined by an including file; the name above is only the default.
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+; Register aliases: several names share one register (e.g. unused_lanes/tmp4 are both rbx), so they must not be live at the same time.
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15 (presumably so that it survives the call to sha_256_mult_avx)
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rbx, rbp, rsi, rdi; called routine also clobbers r12 (rsi and rdi are saved only when LINUX is not defined)
+struc STACK
+_gpr_save: resq 5
+_rsp_save: resq 1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rdi when LINUX is defined, rcx otherwise : state
+; arg 2 : rsi when LINUX is defined, rdx otherwise : job
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+ ; places job in the lane named by the low byte of unused_lanes; returns in rax a completed job, or 0 via return_null
+ mov rax, rsp ; keep the caller's rsp; it is stored in _rsp_save below
+ sub rsp, STACK_size
+ and rsp, -16 ; 16-byte align the local frame
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*3], rsi
+ mov [rsp + _gpr_save + 8*4], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ movzx lane, BYTE(unused_lanes) ; lane = low byte of unused_lanes
+ shr unused_lanes, 8 ; remove that byte from the list
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov [state + _unused_lanes_sha256], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+ ; record the whole-block count in lens for this lane (XVPINSRW is defined in an include; presumably a variable-index word insert)
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ mov last_len, len
+ and last_len, 63 ; last_len = len mod 64
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6 ; extra_blocks = (last_len + 9 + 63) / 64
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha256 + 8*lane], p ; lane data pointer = src + hash start offset
+
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ add p, len ; p = end of the message; copy its last 64 bytes into extra_block
+ vmovdqu xmm0, [p - 64 + 0*16]
+ vmovdqu xmm1, [p - 64 + 1*16]
+ vmovdqu xmm2, [p - 64 + 2*16]
+ vmovdqu xmm3, [p - 64 + 3*16]
+ vmovdqa [lane_data + _extra_block + 0*16], xmm0
+ vmovdqa [lane_data + _extra_block + 1*16], xmm1
+ vmovdqa [lane_data + _extra_block + 2*16], xmm2
+ vmovdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+ ; size_offset = extra_blocks*64 - last_len + 56 : where the 8-byte length is stored below
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len ; start_offset = 64 - last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len] ; (64 + len) * 8 bits; the extra 64 bytes are presumably the key-xor-ipad block
+ bswap tmp ; store the length big-endian
+ mov [lane_data + _extra_block + size_offset], tmp
+ ; seed the lane's digest column from the 32 bytes that job->_auth_key_xor_ipad points to
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+ test len, ~63 ; any bit above bit 5 set <=> len >= 64
+ jnz ge64_bytes
+
+lt64_bytes:
+ vmovdqa xmm0, [state + _lens_sha256] ; no whole block: the lane's length becomes extra_blocks
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*lane], tmp ; data pointer = extra_block + start_offset
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xff ; continue only when unused_lanes == 0xff (presumably: no free lane is left)
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_sha256]
+ vphminposuw xmm1, xmm0 ; word 0 = minimum length, word 1 = index of the lane holding it
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0 ; minimum is already 0: skip the hash call
+
+ vpshuflw xmm1, xmm1, 0 ; broadcast the minimum across the four low words
+ vpsubw xmm0, xmm0, xmm1 ; every lane's length -= minimum
+ vmovdqa [state + _lens_sha256], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha_256_mult_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"; lane_data = state + _ldata_sha256 + idx * lane-data size
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra blocks still pending for this lane
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; outer hash already done: the job is complete
+ ; otherwise fall through and set up the outer hash
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero the stored length field
+ ; the lane's length is set to 1 block
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp ; the lane's data pointer now targets outer_block
+ ; gather this lane's digest words (row stride SHA256_DIGEST_ROW_SIZE, 4 bytes per lane) and byte-swap them
+ vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ vpshufb xmm1, xmm1, [rel byteswap]
+ vmovdqa [lane_data + _outer_block], xmm0
+ vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ mov dword [lane_data + _outer_block + 7*4], 0x80 ; presumably the padding marker right after the 28-byte digest
+%endif
+ ; reload the lane's digest column from the 32 bytes that job->_auth_key_xor_opad points to
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ ; the lane's length is set to its number of extra blocks
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp ; data pointer = extra_block + start_offset
+ mov dword [lane_data + _extra_blocks], 0 ; the extra blocks are now scheduled
+ jmp start_loop
+
+ align 16
+
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ ;; tmp4 (rbx) aliases unused_lanes and is passed to the copy macro, so unused_lanes is reloaded afterwards
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ mov qword [lane_data + _job_in_lane], 0 ; the lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx ; idx becomes the new low byte of unused_lanes
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+ jne copy_full_digest
+%endif
+ ; copy 14 bytes for SHA224 / 16 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+%ifdef SHA224
+ mov [p + 3*4], WORD(tmp4) ; only 2 bytes of the fourth word: 12 + 2 = 14
+%else
+ mov [p + 3*4], DWORD(tmp4)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 28 bytes for SHA224 / 32 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+ mov [p + 3*4], DWORD(tmp4)
+ ; second half: words 4..6, plus word 7 unless SHA224 is defined
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+%ifndef SHA224
+ bswap DWORD(tmp4)
+%endif
+ mov [p + 4*4], DWORD(tmp)
+ mov [p + 5*4], DWORD(tmp2)
+ mov [p + 6*4], DWORD(tmp3)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B) of returned job
+%assign J 0
+%rep 7
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+ vpxor xmm0, xmm0
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size ; recompute lane_data for the returned lane
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+%ifdef SHA224
+ mov qword [lane_data + _outer_block + 16], 0
+ mov dword [lane_data + _outer_block + 24], 0
+%else
+ vmovdqa [lane_data + _outer_block + 16], xmm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+ ; restore the registers saved on entry, then the caller's stack pointer
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*3]
+ mov rdi, [rsp + _gpr_save + 8*4]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm
new file mode 100644
index 000000000..f3491ab27
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC flush_job_hmac_sha_384_avx
+%define SHA_X_DIGEST_SIZE 384
+; Builds the SHA-512 flush source under the SHA-384 name; SHA_X_DIGEST_SIZE is presumably tested inside that file (TODO confirm).
+%include "avx/mb_mgr_hmac_sha_512_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm
new file mode 100644
index 000000000..a2fb0f1c6
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC submit_job_hmac_sha_384_avx
+%define SHA_X_DIGEST_SIZE 384
+; Builds the SHA-512 submit source under the SHA-384 name; SHA_X_DIGEST_SIZE is presumably tested inside that file (TODO confirm).
+%include "avx/mb_mgr_hmac_sha_512_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm
new file mode 100644
index 000000000..2de170948
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm
@@ -0,0 +1,339 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha512_x2_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 - vpshufb mask that reverses the bytes inside each qword
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+len_masks: ; one 16-byte entry per lane, OR-ed into the lens vector for a lane that holds no job
+ ;ddq 0x0000000000000000000000000000FFFF - entry 0: sets word 0 to 0xFFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000 - entry 1: sets word 1 to 0xFFFF
+ dq 0x00000000FFFF0000, 0x0000000000000000
+one: dq 1 ; memory operand for the cmovne that selects lane index 1
+
+section .text
+
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_512_avx ; default entry-point name, used only when FUNC was not defined earlier
+%define SHA_X_DIGEST_SIZE 512 ; default digest size in bits; 384 selects the shorter-digest %if branches
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2 ; arg2 is also len2, extra_blocks and p - writing one overwrites the others
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15 (it is read again after the call to sha512_x2_avx)
+%define idx rbp
+
+%define unused_lanes rbx ; rbx is shared by unused_lanes, lane_data and tmp2
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax ; rax is shared by job_rax, tmp1, size_offset, tmp and start_offset
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1 ; same register as state
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%define tmp5 r9
+
+%define tmp6 r10
+
+%endif
+
+; This routine clobbers rbx, rbp; both are saved in this frame on entry and reloaded at "return"
+struc STACK
+_gpr_save: resq 2 ; slot 0 = rbx, slot 1 = rbp
+_rsp_save: resq 1 ; rsp as it was on entry, before the frame was carved out and aligned
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state) - rax = pointer to a completed job, or 0 via return_null
+; arg 1 : rcx (Windows) / rdi (LINUX) : state
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp ; keep the entry rsp so it can be restored after alignment
+ sub rsp, STACK_size
+ and rsp, -16 ; frame is 16-byte aligned from here on
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ bt unused_lanes, 16+7 ; NOTE(review): bit 23 presumably set only while both lanes are free - confirm
+ jc return_null ; nothing to flush
+
+ ; find a lane with a non-null job
+ xor idx, idx ; start with lane 0
+ cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ cmovne idx, [rel one] ; lane 1 holds a job -> idx = 1
+copy_lane_data:
+ ; copy good lane (idx) to empty lanes: they get its data pointer and a length of 0xFFFF
+ vmovdqa xmm0, [state + _lens_sha512]
+ mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] ; data pointer of the good lane
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ jne APPEND(skip_,I) ; lane I holds a job - leave it untouched
+ mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I] ; force the length word of lane I to 0xFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_sha512], xmm0
+
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...1)
+ cmp len2, 0
+ je len_is_0 ; the shortest lane has nothing left to hash
+
+ vpshuflw xmm1, xmm1, 0xA0 ; copy the min value into words 0 and 1
+ vpsubw xmm0, xmm0, xmm1 ; subtract it from both lane lengths
+ vmovdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = address of lane idx's entry in _ldata_sha512
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra blocks are still pending for this lane
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop ; outer hash already done -> job is complete; otherwise fall into proc_outer
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1 ; mark the outer pass as started for this lane
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0 ; zero the 8 bytes stored at size_offset
+ mov word [state + _lens_sha512 + 2*idx], 1 ; lane idx now has exactly one block to hash
+ lea tmp, [lane_data + _outer_block_sha512] ; overwrites size_offset (same register)
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; next input for lane idx is its outer block
+
+ ; move digest into data location (byte-swapped; 4 x 16 bytes for SHA-512, 3 x 16 bytes for SHA-384)
+ %assign I 0
+ %rep (SHA_X_DIGEST_SIZE / (8*16))
+ vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE]
+ vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ vpshufb xmm0, [rel byteswap]
+ vmovdqa [lane_data + _outer_block_sha512 + I * 16], xmm0
+ %assign I (I+1)
+ %endrep
+
+ ; move the opad key into digest (64 bytes read through job->_auth_key_xor_opad, one qword per digest row)
+ mov tmp, [job + _auth_key_xor_opad]
+
+ %assign I 0
+ %rep 4
+ vmovdqu xmm0, [tmp + I * 16]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+ %assign I (I+1)
+ %endrep
+
+ jmp copy_lane_data ; refresh empty lanes and hash the outer block
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+ mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) ; lane idx length = number of pending extra blocks
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; next input for lane idx is extra_block + start_offset
+ mov dword [lane_data + _extra_blocks_sha512], 0 ; extra blocks are now scheduled, do not schedule them again
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax ; return value = 0 (no job)
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512] ; rax = finished job (this is the return value)
+ mov qword [lane_data + _job_in_lane_sha512], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha512] ; rbx stops being lane_data here
+ shl unused_lanes, 8
+ or unused_lanes, idx ; put lane idx into the low byte of the unused-lanes list
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest ; any tag length other than 32 takes the full-digest path
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest ; any tag length other than 24 takes the full-digest path
+%endif
+
+ ;; copy 32 bytes for SHA512 / 24 bytes for SHA384 (each digest qword is byte-swapped before the store)
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 0*8], QWORD(tmp2)
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp5)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes for SHA384 (digest rows 0..3 first, byte-swapped)
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+ bswap QWORD(tmp5)
+ mov [p + 0*8], QWORD(tmp2)
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+ mov [p + 3*8], QWORD(tmp5)
+
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE] ; rows 4..5 are copied for both sizes
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE] ; rows 6..7 only when not SHA-384
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp6)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 4*8], QWORD(tmp2)
+ mov [p + 5*8], QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp6)
+ mov [p + 7*8], QWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0 ; xmm0 = 0, source for the 16-byte wipes below
+
+ ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of every lane that holds no job
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size) + _job_in_lane_sha512], 0
+ jne APPEND(skip_clear_,I) ; lane I still holds a job - keep its data
+
+ ;; Clear digest (48 bytes for SHA-384, 64 bytes for SHA-512 bytes)
+%assign J 0
+%rep 6
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 6*SHA512_DIGEST_ROW_SIZE], 0
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+ lea lane_data, [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size)] ; rbx is free: unused_lanes was already stored
+ ;; Clear first 128 bytes of extra_block (sha512 field name, same buffer the hashing paths use)
+%assign offset 0
+%rep 8
+ vmovdqa [lane_data + _extra_block_sha512 + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block (the buffer proc_outer filled)
+ vmovdqa [lane_data + _outer_block_sha512], xmm0
+ vmovdqa [lane_data + _outer_block_sha512 + 16], xmm0
+ vmovdqa [lane_data + _outer_block_sha512 + 32], xmm0
+%if (SHA_X_DIGEST_SIZE != 384)
+ vmovdqa [lane_data + _outer_block_sha512 + 48], xmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0] ; reload the registers saved on entry
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm
new file mode 100644
index 000000000..b37884d0f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm
@@ -0,0 +1,416 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha512_x2_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 - vpshufb mask that reverses the bytes inside each qword
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+section .text
+
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_512_avx ; default entry-point name, used only when FUNC was not defined earlier
+%define SHA_X_DIGEST_SIZE 512 ; default digest size in bits; 384 selects the shorter-digest %if branches
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2 ; arg2 is also len2 - job is reloaded from the lane before it is used after hashing
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15 (it is read again after the call to sha512_x2_avx)
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx ; rbx doubles as tmp4
+%define tmp4 rbx
+
+%define job_rax rax ; rax holds len during setup and the returned job at the end
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rbx, rbp, rsi, rdi (rsi/rdi are saved and restored only when LINUX is not defined)
+struc STACK
+_gpr_save: resq 4 ; slots: rbx, rbp, rsi, rdi
+_rsp_save: resq 1 ; rsp as it was on entry, before the frame was carved out and aligned
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job) - rax = a completed job, or 0 via return_null
+; arg 1 : rcx (Windows) / rdi (LINUX) : state
+; arg 2 : rdx (Windows) / rsi (LINUX) : job
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp ; keep the entry rsp so it can be restored after alignment
+ sub rsp, STACK_size
+ and rsp, -16 ; frame is 16-byte aligned from here on
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*2], rsi
+ mov [rsp + _gpr_save + 8*3], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ movzx lane, BYTE(unused_lanes) ; lane for this job = low byte of the unused-lanes list
+ shr unused_lanes, 8 ; drop that byte from the list
+ imul lane_data, lane, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = address of this lane's entry in _ldata_sha512
+ mov [state + _unused_lanes_sha512], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 7 ; divide by 128, len in terms of blocks
+
+ mov [lane_data + _job_in_lane_sha512], job
+ mov dword [lane_data + _outer_done_sha512], 0
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16 ; NOTE(review): macro from an include; presumably sets word [lane] of xmm0 to tmp
+ vmovdqa [state + _lens_sha512], xmm0
+
+ mov last_len, len
+ and last_len, 127 ; last_len = len mod 128
+ lea extra_blocks, [last_len + 17 + 127]
+ shr extra_blocks, 7 ; extra_blocks = ceil((last_len + 17) / 128)
+ mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p ; lane input starts at src + hash start offset
+
+ cmp len, 128
+ jb copy_lt128 ; under one block: byte-exact copy instead of the 128-byte tail copy
+
+fast_copy:
+ add p, len ; p = one past the last message byte
+%assign I 0
+%rep 2
+ vmovdqu xmm0, [p - 128 + I*4*16 + 0*16] ; copy the last 128 message bytes, 64 per iteration
+ vmovdqu xmm1, [p - 128 + I*4*16 + 1*16]
+ vmovdqu xmm2, [p - 128 + I*4*16 + 2*16]
+ vmovdqu xmm3, [p - 128 + I*4*16 + 3*16]
+ vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 0*16], xmm0 ; into the first 128 bytes of extra_block
+ vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 1*16], xmm1
+ vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 2*16], xmm2
+ vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 3*16], xmm3
+%assign I (I+1)
+%endrep
+
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 7
+ sub size_offset, last_len
+ add size_offset, 128-8 ; size_offset = 128*extra_blocks - last_len + 120 = start_offset + 128*extra_blocks - 8
+ mov [lane_data + _size_offset_sha512], DWORD(size_offset)
+ mov start_offset, 128
+ sub start_offset, last_len ; start_offset = 128 - last_len
+ mov [lane_data + _start_offset_sha512], DWORD(start_offset)
+
+ lea tmp, [8*128 + 8*len] ; bit count of (128 + len) bytes; presumably the 128 covers the ipad block
+ bswap tmp
+ mov [lane_data + _extra_block_sha512 + size_offset], tmp ; byte-swapped bit count stored at size_offset
+
+ mov tmp, [job + _auth_key_xor_ipad]
+
+%assign I 0
+%rep 4
+ vmovdqu xmm0, [tmp + I * 2 * 8] ; 64 bytes read through job->_auth_key_xor_ipad, one qword per digest row
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+ test len, ~127
+ jnz ge128_bytes ; len >= 128: the length and data pointer set at entry stay in effect
+
+lt128_bytes:
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 ; lane length = number of extra blocks (presumed macro semantics)
+ vmovdqa [state + _lens_sha512], xmm0
+
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; lane input now starts inside extra_block
+ mov dword [lane_data + _extra_blocks_sha512], 0 ; extra blocks are already scheduled
+
+ge128_bytes:
+ cmp unused_lanes, 0xff ; NOTE(review): 0xff presumably is what remains once every lane is taken - confirm
+ jne return_null ; otherwise return 0 without hashing
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_sha512]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...1)
+ cmp len2, 0
+ je len_is_0 ; the shortest lane has nothing left to hash
+
+ vpshuflw xmm1, xmm1, 0xA0 ; copy the min value into words 0 and 1
+ vpsubw xmm0, xmm0, xmm1 ; subtract it from both lane lengths
+ vmovdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = address of lane idx's entry in _ldata_sha512
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra blocks are still pending for this lane
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop ; outer hash already done -> job is complete; otherwise fall into proc_outer
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1 ; mark the outer pass as started for this lane
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0 ; zero the 8 bytes stored at size_offset
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16 ; lane idx gets length 1 (presumed macro semantics)
+ vmovdqa [state + _lens_sha512], xmm0
+
+ lea tmp, [lane_data + _outer_block_sha512]
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; next input for lane idx is its outer block
+
+%assign I 0
+%rep (SHA_X_DIGEST_SIZE / (8 * 16))
+ vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE] ; gather two digest qwords of lane idx
+ vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ vpshufb xmm0, [rel byteswap] ; reverse the bytes of each qword
+ vmovdqa [lane_data + _outer_block_sha512 + I * 16], xmm0 ; store as outer-block bytes 16*I .. 16*I+15
+%assign I (I+1)
+%endrep
+
+ mov tmp, [job + _auth_key_xor_opad]
+%assign I 0
+%rep 4
+ vmovdqu xmm0, [tmp + I * 16] ; 64 bytes read through job->_auth_key_xor_opad, one qword per digest row
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+ jmp start_loop ; hash the outer block
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 ; lane idx length = pending extra blocks (presumed macro semantics)
+ vmovdqa [state + _lens_sha512], xmm0
+
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message
+ mov dword [lane_data + _extra_blocks_sha512], 0 ; extra blocks are now scheduled, do not schedule them again
+ jmp start_loop
+
+ align 16
+copy_lt128:
+ ;; less than one message block of data
+ ;; destination extra block but backwards by len from where 0x80 pre-populated
+ lea p2, [lane_data + _extra_block_sha512 + 128] ; same buffer fast_copy fills and start_offset indexes
+ sub p2, len ; p2 = extra_block + 128 - len, so the data ends at offset 128
+ memcpy_avx_128_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes_sha512] ; tmp4 shares rbx with unused_lanes, so reload it
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax ; return value = 0 (no completed job)
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512] ; rax = finished job (this is the return value)
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ mov qword [lane_data + _job_in_lane_sha512], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx ; put lane idx into the low byte of the unused-lanes list
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest ; any tag length other than 32 takes the full-digest path
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest ; any tag length other than 24 takes the full-digest path
+%endif
+ ;; copy 32 bytes for SHA512 / 24 bytes for SHA384 (each digest qword is byte-swapped before the store)
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 0*8], QWORD(tmp)
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp4)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes for SHA384 (digest rows 0..3 first, byte-swapped)
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+ mov [p + 0*8], QWORD(tmp)
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+ mov [p + 3*8], QWORD(tmp4)
+
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE] ; rows 4..5 are copied for both sizes
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE] ; rows 6..7 only when not SHA-384
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 4*8], QWORD(tmp)
+ mov [p + 5*8], QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp3)
+ mov [p + 7*8], QWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job (lane idx)
+%assign J 0
+%rep 6
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE], 0 ; rows 6..7 use the SHA-512 row stride like rows 0..5
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+ vpxor xmm0, xmm0 ; xmm0 = 0, source for the 16-byte wipes below
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = address of lane idx's entry in _ldata_sha512
+ ;; Clear first 128 bytes of extra_block (sha512 field name, same buffer the hashing paths use)
+%assign offset 0
+%rep 8
+ vmovdqa [lane_data + _extra_block_sha512 + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block (the buffer proc_outer filled)
+ vmovdqa [lane_data + _outer_block_sha512], xmm0
+ vmovdqa [lane_data + _outer_block_sha512 + 16], xmm0
+ vmovdqa [lane_data + _outer_block_sha512 + 32], xmm0
+%if (SHA_X_DIGEST_SIZE != 384)
+ vmovdqa [lane_data + _outer_block_sha512 + 48], xmm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+ mov rbx, [rsp + _gpr_save + 8*0] ; reload the registers saved on entry
+ mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*2]
+ mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm
new file mode 100644
index 000000000..418f0bc43
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm
@@ -0,0 +1,358 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha1_mult_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 - vpshufb mask that reverses the bytes inside each dword
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2 ; arg2 is also len2 - job is reloaded from the lane before it is used after hashing
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15 (it is read again after the call to sha1_mult_avx)
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx ; rbx doubles as tmp4
+%define tmp4 rbx
+
+%define job_rax rax ; rax holds len during setup and the returned job at the end
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rdi, rsi, rbx, rbp (rsi/rdi are saved and restored only when LINUX is not defined)
+struc STACK
+_gpr_save: resq 4 ; slots: rbx, rbp, rsi, rdi
+_rsp_save: resq 1 ; rsp as it was on entry, before the frame was carved out and aligned
+endstruc
+
+; JOB* submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) - rax = a completed job, or 0 via return_null
+; arg 1 : rcx (Windows) / rdi (LINUX) : state
+; arg 2 : rdx (Windows) / rsi (LINUX) : job
+MKGLOBAL(submit_job_hmac_avx,function,internal)
+submit_job_hmac_avx:
+
+ mov rax, rsp ; keep the entry rsp so it can be restored after alignment
+ sub rsp, STACK_size
+ and rsp, -16 ; frame is 16-byte aligned from here on
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*2], rsi
+ mov [rsp + _gpr_save + 8*3], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes) ; lane for this job = low byte of the unused-lanes list
+ shr unused_lanes, 8 ; drop that byte from the list
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data] ; lane_data = address of this lane's entry in _ldata
+ mov [state + _unused_lanes], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16 ; NOTE(review): macro from an include; presumably sets word [lane] of xmm0 to tmp
+ vmovdqa [state + _lens], xmm0
+
+ mov last_len, len
+ and last_len, 63 ; last_len = len mod 64
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6 ; extra_blocks = ceil((last_len + 9) / 64)
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr + PTR_SZ*lane], p ; lane input starts at src + hash start offset
+ cmp len, 64
+ jb copy_lt64 ; under one block: byte-exact copy instead of the 64-byte tail copy
+
+fast_copy:
+ add p, len ; p = one past the last message byte
+ vmovdqu xmm0, [p - 64 + 0*16] ; load the last 64 message bytes
+ vmovdqu xmm1, [p - 64 + 1*16]
+ vmovdqu xmm2, [p - 64 + 2*16]
+ vmovdqu xmm3, [p - 64 + 3*16]
+ vmovdqa [lane_data + _extra_block + 0*16], xmm0 ; store them as the first 64 bytes of extra_block
+ vmovdqa [lane_data + _extra_block + 1*16], xmm1
+ vmovdqa [lane_data + _extra_block + 2*16], xmm2
+ vmovdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8 ; size_offset = 64*extra_blocks - last_len + 56 = start_offset + 64*extra_blocks - 8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len ; start_offset = 64 - last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len] ; bit count of (64 + len) bytes; presumably the 64 covers the ipad block
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp ; byte-swapped bit count stored at size_offset
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp] ; first 16 of the 20 bytes read through job->_auth_key_xor_ipad
+ mov DWORD(tmp), [tmp + 4*4] ; fifth dword; overwrites the pointer held in tmp
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+ test len, ~63
+ jnz ge64_bytes ; len >= 64: the length and data pointer set at entry stay in effect
+
+lt64_bytes:
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 ; lane length = number of extra blocks (presumed macro semantics)
+ vmovdqa [state + _lens], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*lane], tmp ; lane input now starts inside extra_block
+ mov dword [lane_data + _extra_blocks], 0 ; extra blocks are already scheduled
+
+ge64_bytes:
+ cmp unused_lanes, 0xff ; NOTE(review): 0xff presumably is what remains once every lane is taken - confirm
+ jne return_null ; otherwise return 0 without hashing
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0 ; the shortest lane has nothing left to hash
+
+ vpshuflw xmm1, xmm1, 0 ; copy the min value into words 0..3
+ vpsubw xmm0, xmm0, xmm1 ; subtract it from the lane lengths
+ vmovdqa [state + _lens], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mult_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data] ; lane_data = address of lane idx's entry in _ldata
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra blocks are still pending for this lane
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; outer hash already done -> job is complete; otherwise fall into proc_outer
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1 ; mark the outer pass as started for this lane
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero the 8 bytes stored at size_offset
+
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16 ; lane idx gets length 1 (presumed macro semantics)
+ vmovdqa [state + _lens], xmm0
+
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp ; next input for lane idx is its outer block
+
+ vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] ; gather digest dwords 0..3 of lane idx
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap] ; reverse the bytes of each dword
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] ; fifth digest dword goes through a GPR
+ bswap DWORD(tmp)
+ vmovdqa [lane_data + _outer_block], xmm0 ; outer block bytes 0..15
+ mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) ; outer block bytes 16..19
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp] ; first 16 of the 20 bytes read through job->_auth_key_xor_opad
+ mov DWORD(tmp), [tmp + 4*4] ; fifth dword; overwrites the pointer held in tmp
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+ jmp start_loop ; hash the outer block
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 ; lane idx length = pending extra blocks (presumed macro semantics)
+ vmovdqa [state + _lens], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp ; next input for lane idx is extra_block + start_offset
+ mov dword [lane_data + _extra_blocks], 0 ; extra blocks are now scheduled, do not schedule them again
+ jmp start_loop
+
+ align 16
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len ; p2 = extra_block + 64 - len, so the data ends at offset 64
+ memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+ mov unused_lanes, [state + _unused_lanes] ; tmp4 shares rbx with unused_lanes, so reload it
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax ; return value = 0 (no completed job)
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane] ; rax = finished job (this is the return value)
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx ; put lane idx into the low byte of the unused-lanes list
+ mov [state + _unused_lanes], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes (digest dwords 0..2, each byte-swapped before the store)
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+ mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3)
+
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret ; a 12-byte tag is complete; any other length gets all 20 bytes
+
+ ;; copy remaining 8 bytes to return 20 byte digest
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (20B), outer_block (20B) and extra_block (64B) of returned job (lane idx)
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+ vpxor xmm0, xmm0 ; xmm0 = 0, source for the 16-byte wipes below
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data] ; lane_data = address of lane idx's entry in _ldata
+ ;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+ ;; Clear first 20 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0 ; bytes 0..15
+ mov dword [lane_data + _outer_block + 16], 0 ; bytes 16..19
+%endif
+
+return:
+
+ mov rbx, [rsp + _gpr_save + 8*0] ; reload the registers saved on entry
+ mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*2]
+ mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm b/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm
new file mode 100644
index 000000000..1aa2c2600
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm
@@ -0,0 +1,716 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute MD5 on eight buffers in parallel (two groups of four lanes) using AVX
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+extern MD5_TABLE
+
+section .data
+default rel
+align 64
+ONES: ; 128-bit all-ones constant; MAGIC_I XORs a register with it to form a bitwise NOT
+ dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+
+section .text
+
+%ifdef LINUX
+;; Linux registers: arguments arrive in rdi/rsi; rcx/rdx hold the two stack block pointers
+%define arg1 rdi
+%define arg2 rsi
+%define mem1 rcx
+%define mem2 rdx
+%else
+%define arg1 rcx ; non-Linux build: arguments arrive in rcx/rdx
+%define arg2 rdx
+%define mem1 rdi ; rdi/rsi hold the two stack block pointers instead
+%define mem2 rsi
+%endif
+
+;; rbp is not clobbered
+
+%define state arg1 ; pointer to the MD5_ARGS structure (digest rows and data pointers)
+%define num_blks arg2 ; number of 64-byte blocks still to process; counted down to zero
+
+%define inp0 r8 ; inp0..inp7: input data pointers for lanes 0..7
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+%define TBL rax ; base address of MD5_TABLE; one 16-byte constant per MD5 step
+%define IDX rbx ; byte offset into the input buffers, advanced by 64 per block
+
+%define A xmm0 ; A..D: MD5 state words for lanes 0..3 (one dword per lane)
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+%define A2 xmm6 ; A2..D2: MD5 state words for lanes 4..7
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+%define FUN E ; magic-function result (shared by both lane groups in MD5_STEP1)
+%define TMP F ; rotate / data scratch
+%define FUN2 xmm10 ; second-group scratch used by MD5_STEP; same register as T0
+%define TMP2 xmm11 ; second-group scratch used by MD5_STEP; same register as T1
+
+%define T0 xmm10 ; T0..T5: scratch for loading and transposing input data
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ... \
+; 300 data2[0] for lanes 7...4 \
+; 2F0 data2[15] for lanes 3...0 > mem block 2
+; ... /
+; 210 data2[1] for lanes 3...0 /
+; 200 data2[0] for lanes 3...0 /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ... \
+; 100 data1[0] for lanes 7...4 \
+; F0 data1[15] for lanes 3...0 > mem block 1
+; ... /
+; 10 data1[1] for lanes 3...0 /
+; 0 data1[0] for lanes 3...0 /
+
+; stack size must be an odd multiple of 8 bytes so that rsp is 16-byte aligned after "sub rsp, STACK_SIZE"
+struc STACK
+_DATA: reso 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: reso 8 ; stores AA-DD, AA2-DD2
+ resb 8 ; for alignment
+endstruc
+%define STACK_SIZE STACK_size
+
+%define AA rsp + _DIGEST + 16*0 ; AA..DD: digests of lanes 0..3 saved at the top of each block
+%define BB rsp + _DIGEST + 16*1
+%define CC rsp + _DIGEST + 16*2
+%define DD rsp + _DIGEST + 16*3
+%define AA2 rsp + _DIGEST + 16*4 ; AA2..DD2: same for lanes 4..7
+%define BB2 rsp + _DIGEST + 16*5
+%define CC2 rsp + _DIGEST + 16*6
+%define DD2 rsp + _DIGEST + 16*7
+
+;;
+;; MD5 left rotations (number of bits); rotRS = round R (1..4), position S (1..4) in each group of four steps
+;;
+rot11 equ 7 ; round 1: used with MAGIC_F
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5 ; round 2: used with MAGIC_G
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4 ; round 3: used with MAGIC_H
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6 ; round 4: used with MAGIC_I
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" the 4x4 dword matrix held in {r0..r3} using temps {t0, t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3} (r2 and t1 are left holding intermediate values)
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1 ; [in/out] row a on entry, transposed row 2 on exit
+%define %%r1 %2 ; [in/out] row b on entry, transposed row 1 on exit
+%define %%r2 %3 ; [in/clobbered] row c on entry
+%define %%r3 %4 ; [in/out] row d on entry, transposed row 3 on exit
+%define %%t0 %5 ; [out] transposed row 0
+%define %%t1 %6 ; [clobbered] temporary
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F dst,x,y,z ;; dst = z ^ (x & (y ^ z)), i.e. a bitwise "x ? y : z"; dst must differ from x and z
+%macro MAGIC_F 4
+%define %%dst %1 ; [out] result register
+%define %%sel %2 ; [in] X, the selector
+%define %%yes %3 ; [in] Y, taken where a selector bit is 1
+%define %%no %4 ; [in] Z, taken where a selector bit is 0
+ vpxor %%dst, %%no, %%yes
+ vpand %%dst, %%dst, %%sel
+ vpxor %%dst, %%dst, %%no
+%endmacro
+
+; macro MAGIC_G dst,x,y,z ;; dst = MAGIC_F(z,x,y), a bitwise "z ? x : y"; dst must differ from y and z
+%macro MAGIC_G 4
+%define %%dst %1 ; [out] result register
+%define %%xin %2 ; [in] X
+%define %%yin %3 ; [in] Y
+%define %%zin %4 ; [in] Z, acts as the selector
+ MAGIC_F %%dst, %%zin, %%xin, %%yin
+%endmacro
+
+; macro MAGIC_H dst,x,y,z ;; dst = x ^ y ^ z (three-way parity); dst must differ from x
+%macro MAGIC_H 4
+%define %%dst %1 ; [out] result register
+%define %%xin %2 ; [in] X
+%define %%yin %3 ; [in] Y
+%define %%zin %4 ; [in] Z
+ vpxor %%dst, %%zin, %%yin
+ vpxor %%dst, %%dst, %%xin
+%endmacro
+
+; macro MAGIC_I dst,x,y,z ;; dst = y ^ (x | ~z); dst must differ from x and y
+%macro MAGIC_I 4
+%define %%dst %1 ; [out] result register
+%define %%xin %2 ; [in] X
+%define %%yin %3 ; [in] Y
+%define %%zin %4 ; [in] Z, complemented before use
+ vpxor %%dst, %%zin, [rel ONES] ; dst = ~z (XOR with all-ones)
+ vpor %%dst, %%dst, %%xin
+ vpxor %%dst, %%dst, %%yin
+%endmacro
+
+; PROLD val, nbits, scratch ;; rotate each dword of val left by nbits; scratch is clobbered and must differ from val
+%macro PROLD 3
+%define %%val %1 ; [in/out] XMM register rotated in place
+%define %%nbits %2 ; [in] immediate rotate count
+%define %%scratch %3 ; [clobbered] receives the bits shifted out at the top
+ vpsrld %%scratch, %%val, (32 - %%nbits)
+ vpslld %%val, %%val, %%nbits
+ vpor %%val, %%val, %%scratch
+%endmacro
+
+;;
+;; single MD5 step on both lane groups, sharing one FUN/TMP scratch pair and adding data straight from memory
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1 ; [in] name of the magic macro (MAGIC_F/G/H/I)
+%define %%A %2 ; [in/out] state word updated by this step, lanes 0..3
+%define %%B %3 ; [in] state words read by this step, lanes 0..3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6 ; [in/out] state word updated by this step, lanes 4..7
+%define %%B2 %7 ; [in] state words read by this step, lanes 4..7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10 ; [clobbered] magic-function result, reused for both lane groups
+%define %%TMP %11 ; [clobbered] rotate scratch, reused for both lane groups
+%define %%data %12 ; [in] address expression (without brackets) of the message word for lanes 0..3
+%define %%MD5const %13 ; [in] operand holding the step constant (callers pass [TBL + n*16])
+%define %%nrot %14 ; [in] left-rotate count
+
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, [%%data]
+ vpaddd %%A2, %%A2, [%%data + 16*16] ; lanes 4..7 copy of the word sits 16 owords further on
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ vpaddd %%A, %%A, %%FUN
+ %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+ vpaddd %%A2, %%A2, %%FUN
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+;;
+;; single MD5 step on both lane groups, with separate scratch registers per group and explicit data loads
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1 ; [in] name of the magic macro (MAGIC_F/G/H/I)
+%define %%A %2 ; [in/out] state word updated by this step, lanes 0..3
+%define %%B %3 ; [in] state words read by this step, lanes 0..3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6 ; [in/out] state word updated by this step, lanes 4..7
+%define %%B2 %7 ; [in] state words read by this step, lanes 4..7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10 ; [clobbered] magic-function result, lanes 0..3
+%define %%TMP %11 ; [clobbered] data word, then rotate scratch, lanes 0..3
+%define %%FUN2 %12 ; [clobbered] magic-function result, lanes 4..7
+%define %%TMP2 %13 ; [clobbered] data word, then rotate scratch, lanes 4..7
+%define %%data %14 ; [in] address expression (without brackets) of the message word for lanes 0..3
+%define %%MD5const %15 ; [in] operand holding the step constant (callers pass [TBL + n*16])
+%define %%nrot %16 ; [in] left-rotate count
+
+ vmovdqa %%TMP,[%%data]
+ vmovdqa %%TMP2,[%%data + 16*16] ; lanes 4..7 copy of the word sits 16 owords further on
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, %%TMP
+ vpaddd %%A2, %%A2, %%TMP2
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+ vpaddd %%A, %%A, %%FUN
+ vpaddd %%A2, %%A2, %%FUN2
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP2
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+; void md5_x4x2_avx(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+; Hashes num_blks 64-byte blocks on each of 8 lanes; the digests and data pointers in *args are updated in place
+align 32
+MKGLOBAL(md5_x4x2_avx,function,internal)
+md5_x4x2_avx:
+
+ sub rsp, STACK_SIZE ; frame: two transposed data blocks plus the saved digests (see STACK)
+
+ ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2
+ ;; Initialize digests
+ vmovdqa A,[state + 0*16 + 0*MD5_DIGEST_ROW_SIZE] ; lanes 0..3
+ vmovdqa B,[state + 0*16 + 1*MD5_DIGEST_ROW_SIZE]
+ vmovdqa C,[state + 0*16 + 2*MD5_DIGEST_ROW_SIZE]
+ vmovdqa D,[state + 0*16 + 3*MD5_DIGEST_ROW_SIZE]
+
+ vmovdqa A2,[state + 1*16 + 0*MD5_DIGEST_ROW_SIZE] ; lanes 4..7
+ vmovdqa B2,[state + 1*16 + 1*MD5_DIGEST_ROW_SIZE]
+ vmovdqa C2,[state + 1*16 + 2*MD5_DIGEST_ROW_SIZE]
+ vmovdqa D2,[state + 1*16 + 3*MD5_DIGEST_ROW_SIZE]
+
+ lea TBL, [rel MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[state+_data_ptr_md5 +0*PTR_SZ]
+ mov inp1,[state+_data_ptr_md5 +1*PTR_SZ]
+ mov inp2,[state+_data_ptr_md5 +2*PTR_SZ]
+ mov inp3,[state+_data_ptr_md5 +3*PTR_SZ]
+ mov inp4,[state+_data_ptr_md5 +4*PTR_SZ]
+ mov inp5,[state+_data_ptr_md5 +5*PTR_SZ]
+ mov inp6,[state+_data_ptr_md5 +6*PTR_SZ]
+ mov inp7,[state+_data_ptr_md5 +7*PTR_SZ]
+ xor IDX, IDX ; start at byte offset 0 of every input buffer
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp ; mem1 = block hashed in the current iteration
+ lea mem2, [rsp + 16*16*2] ; mem2 = block being filled for the next iteration
+
+;; Load first block of data, transpose it 16 bytes per lane at a time, and save it to the stack (mem1)
+%assign I 0
+%rep 4
+ vmovdqu T2,[inp0+IDX+I*16] ; lanes 0..3, message words 4*I .. 4*I+3
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16],T0
+ vmovdqa [mem1+(I*4+1)*16],T1
+ vmovdqa [mem1+(I*4+2)*16],T2
+ vmovdqa [mem1+(I*4+3)*16],T3
+
+ vmovdqu T2,[inp4+IDX+I*16] ; lanes 4..7, stored 16 owords after the lanes 0..3 copy
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem1+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem1+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop: ; per-block loop: hash the block in mem1 while loading and transposing the next one into mem2
+ ; save old digests (lanes 0..3)
+ vmovdqa [AA], A
+ vmovdqa [BB], B
+ vmovdqa [CC], C
+ vmovdqa [DD], D
+ ; save old digests (lanes 4..7)
+ vmovdqa [AA2], A2
+ vmovdqa [BB2], B2
+ vmovdqa [CC2], C2
+ vmovdqa [DD2], D2
+
+ add IDX, 4*16 ; IDX now points at the block after the one being hashed
+ sub num_blks, 1
+ je lastblock ; nothing left to prefetch: finish with the non-interleaved path
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ vmovdqu T2,[inp0+IDX+I*16] ; next block, lanes 0..3, words 0..3 (MD5_STEP1 leaves T0..T5 free)
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
+
+
+ vmovdqu T2,[inp4+IDX+I*16] ; next block, lanes 4..7, words 0..3
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
+
+ vmovdqu T2,[inp0+IDX+I*16] ; next block, lanes 0..3, words 4..7
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
+
+ vmovdqu T2,[inp4+IDX+I*16] ; next block, lanes 4..7, words 4..7
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
+
+ vmovdqu T2,[inp0+IDX+I*16] ; next block, lanes 0..3, words 8..11
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
+
+ vmovdqu T2,[inp4+IDX+I*16] ; next block, lanes 4..7, words 8..11
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
+
+ vmovdqu T2,[inp0+IDX+I*16] ; next block, lanes 0..3, words 12..15
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
+
+ vmovdqu T2,[inp4+IDX+I*16] ; next block, lanes 4..7, words 12..15
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ vpaddd A,A,[AA] ; add the digests saved at the top of the block (lanes 0..3)
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2] ; same for lanes 4..7
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; swap mem1 and mem2 so the block just loaded becomes the one hashed next
+ xchg mem1, mem2
+
+ jmp lloop
+
+lastblock: ; final block: nothing is prefetched here, so xmm10/xmm11 are used as FUN2/TMP2
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
+
+ vpaddd A,A,[AA] ; add the digests saved at the top of the block (lanes 0..3)
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2] ; same for lanes 4..7
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; write out digests to the same rows of *state they were loaded from
+ vmovdqu [state + 0*16 + 0*MD5_DIGEST_ROW_SIZE ], A
+ vmovdqu [state + 0*16 + 1*MD5_DIGEST_ROW_SIZE ], B
+ vmovdqu [state + 0*16 + 2*MD5_DIGEST_ROW_SIZE ], C
+ vmovdqu [state + 0*16 + 3*MD5_DIGEST_ROW_SIZE ], D
+ vmovdqu [state + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2
+ vmovdqu [state + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2
+ vmovdqu [state + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2
+ vmovdqu [state + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2
+
+ ;; update input pointers: IDX holds 64 bytes for every block processed
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [state +_data_ptr_md5 + 0*PTR_SZ], inp0
+ mov [state +_data_ptr_md5 + 1*PTR_SZ], inp1
+ mov [state +_data_ptr_md5 + 2*PTR_SZ], inp2
+ mov [state +_data_ptr_md5 + 3*PTR_SZ], inp3
+ mov [state +_data_ptr_md5 + 4*PTR_SZ], inp4
+ mov [state +_data_ptr_md5 + 5*PTR_SZ], inp5
+ mov [state +_data_ptr_md5 + 6*PTR_SZ], inp6
+ mov [state +_data_ptr_md5 + 7*PTR_SZ], inp7
+
+ ;; Clear stack frame (72*16 bytes): both data blocks and the saved digests
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+%assign i 0
+%rep (2*2*16+8)
+ vmovdqa [rsp + i*16], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE ; release the frame reserved on entry
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/pon_avx.asm b/src/spdk/intel-ipsec-mb/avx/pon_avx.asm
new file mode 100644
index 000000000..8510dc4a3
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/pon_avx.asm
@@ -0,0 +1,1170 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%use smartalign
+
+%include "job_aes_hmac.asm"
+%include "include/os.asm"
+%include "include/memcpy.asm"
+
+;;; This is an implementation of the stitched algorithms: AES128-CTR + CRC32 + BIP
+;;; This combination is required by the PON/xPON/gPON standards.
+;;; Note: BIP is a running XOR of double words
+;;; Order of operations:
+;;; - encrypt: HEC update (XGEM header), CRC32 (Ethernet FCS), AES-CTR and BIP
+;;; - decrypt: BIP, AES-CTR and CRC32 (Ethernet FCS)
+
+extern byteswap_const
+extern ddq_add_1
+
+section .data
+default rel
+
+;;; Precomputed constants for CRC32 (Ethernet FCS)
+;;; Details of the CRC algorithm and 4 byte buffer of
+;;; {0x01, 0x02, 0x03, 0x04}:
+;;; Result Poly Init RefIn RefOut XorOut
+;;; 0xB63CFBCD 0x04C11DB7 0xFFFFFFFF true true 0xFFFFFFFF
+align 16
+rk1: ; NOTE(review): presumably the CRC32 folding constants fed to CRC_CLMUL; users are outside this view
+ dq 0x00000000ccaa009e, 0x00000001751997d0
+
+align 16
+rk5: ; low qword is the same value as in rk1
+ dq 0x00000000ccaa009e, 0x0000000163cd6124
+
+align 16
+rk7: ; NOTE(review): presumably the final-reduction constants; confirm at the use site
+ dq 0x00000001f7011640, 0x00000001db710640
+
+align 16
+pshufb_shf_table:
+ ;; use these values for shift registers with the pshufb instruction
+ dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+ dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+align 16
+init_crc_value: ; low dword all ones, matching the Init value 0xFFFFFFFF listed above
+ dq 0x00000000FFFFFFFF, 0x0000000000000000
+
+align 16
+mask: ; low 64 bits set, high 64 bits clear
+ dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+
+align 16
+mask2: ; every bit set except the low 32
+ dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+align 16
+mask3: ; 0x80 in every byte
+ dq 0x8080808080808080, 0x8080808080808080
+
+align 16
+mask_out_top_bytes: ; 16 bytes of ones followed by 16 bytes of zeros
+ dq 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+ dq 0x0000000000000000, 0x0000000000000000
+
+align 16
+ddq_add_1_1: ; the value 1 in each 64-bit half
+ dq 0x1, 0x1
+
+;; Precomputed constants for HEC calculation (XGEM header)
+;; POLY 0x53900000:
+;; k1 = 0xf9800000
+;; k2 = 0xa0900000
+;; k3 = 0x7cc00000
+;; q = 0x46b927ec
+;; p_res = 0x53900000
+
+align 16
+k3_q: ; low qword = k3, high qword = q (values listed above)
+ dq 0x7cc00000, 0x46b927ec
+
+align 16
+p_res: ; low qword = p_res (same value as POLY above), high qword = 0
+ dq 0x53900000, 0
+
+align 16
+mask_out_top_64bits: ; low 64 bits set, high 64 bits clear (same bit pattern as "mask")
+ dq 0xffffffff_ffffffff, 0
+
+section .text
+
+%define NUM_AES_ROUNDS 10 ; round count of AES-128, the cipher named in the file description
+
+%define xcounter xmm0 ; NOTE(review): role names below are taken from the identifiers; users are outside this view
+%define xbip xmm1
+%define xcrc xmm2
+%define xcrckey xmm3
+%define xtmp1 xmm4 ; xtmp1..xtmp11: temporary XMM registers
+%define xtmp2 xmm5
+%define xtmp3 xmm6
+%define xtmp4 xmm7
+%define xtmp5 xmm8
+%define xtmp6 xmm9
+%define xtmp7 xmm10
+%define xtmp8 xmm11
+%define xtmp9 xmm12
+%define xtmp10 xmm13
+%define xtmp11 xmm14
+
+%ifdef LINUX
+%define arg1 rdi ; Linux build: arguments in rdi, rsi, rdx, rcx
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define tmp_1 r8
+%define tmp_2 r9
+%define tmp_3 r10
+%define tmp_4 r11
+%define tmp_5 r12 ; r12..r14 are callee-saved; NOTE(review): confirm the users save and restore them
+%define tmp_6 r13
+%define tmp_7 r14
+%else
+%define arg1 rcx ; non-Linux build: arguments in rcx, rdx, r8, r9
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define tmp_1 r10
+%define tmp_2 r11
+%define tmp_3 rax
+%define tmp_4 r12 ; r12..r15 are callee-saved; NOTE(review): confirm the users save and restore them
+%define tmp_5 r13
+%define tmp_6 r14
+%define tmp_7 r15
+%endif
+
+%define job arg1
+
+%define p_in arg2
+%define p_keys arg3
+%define p_out arg4
+
+%define num_bytes tmp_1 ; bytes to cipher
+%define tmp tmp_2
+%define ctr_check tmp_3 ; counter block overflow check
+%define bytes_to_crc tmp_4 ; number of bytes to crc ( < num_bytes)
+
+%define ethernet_fcs tmp_6 ; not used together with tmp3 (both names map to tmp_6)
+%define tmp2 tmp_5
+%define tmp3 tmp_6
+
+%define write_back_crc tmp_7 ; shares tmp_7 with decrypt_not_done
+%define decrypt_not_done tmp_7
+
+;;; ============================================================================
+;;; Encrypts one block in place: key whitening, NR-1 full rounds, then the final round
+%macro AES_ENC_ROUNDS 3
+%define %%KEYS %1 ; [in] pointer to expanded keys (round key i at offset i * 16)
+%define %%NR %2 ; [in] number of rounds (10 for 128-bit, 12 for 192-bit, 14 for 256-bit keys)
+%define %%STATE %3 ; [in/out] XMM with the block; holds the encrypted block on exit
+
+%assign round 1
+ vpxor %%STATE, %%STATE, [%%KEYS + (0 * 16)]
+
+%rep (%%NR - 1)
+ vaesenc %%STATE, %%STATE, [%%KEYS + (round * 16)]
+%assign round (round + 1)
+%endrep
+
+ ;; round == NR here, so the final round consumes the last expanded key
+ vaesenclast %%STATE, %%STATE, [%%KEYS + (round * 16)]
+
+%endmacro
+
+;;; ============================================================================
+;;; Encrypts four blocks in place, loading each round key once and applying it to all four
+%macro AES_ENC_ROUNDS_4 7
+%define %%KEYS %1 ; [in] pointer to expanded keys (round key i at offset i * 16)
+%define %%NR %2 ; [in] number of rounds (10 for 128-bit, 12 for 192-bit, 14 for 256-bit keys)
+%define %%B0 %3 ; [in/out] XMM with the first block
+%define %%B1 %4 ; [in/out] XMM with the second block
+%define %%B2 %5 ; [in/out] XMM with the third block
+%define %%B3 %6 ; [in/out] XMM with the fourth block
+%define %%RK %7 ; [clobbered] XMM holding the current round key
+
+%assign round 1
+ vmovdqa %%RK, [%%KEYS + (0 * 16)]
+ vpxor %%B0, %%B0, %%RK
+ vpxor %%B1, %%B1, %%RK
+ vpxor %%B2, %%B2, %%RK
+ vpxor %%B3, %%B3, %%RK
+
+%rep (%%NR - 1)
+ vmovdqa %%RK, [%%KEYS + (round * 16)]
+ vaesenc %%B0, %%B0, %%RK
+ vaesenc %%B1, %%B1, %%RK
+ vaesenc %%B2, %%B2, %%RK
+ vaesenc %%B3, %%B3, %%RK
+%assign round (round + 1)
+%endrep
+
+ ;; round == NR here, so the final round consumes the last expanded key
+ vmovdqa %%RK, [%%KEYS + (round * 16)]
+ vaesenclast %%B0, %%B0, %%RK
+ vaesenclast %%B1, %%B1, %%RK
+ vaesenclast %%B2, %%B2, %%RK
+ vaesenclast %%B3, %%B3, %%RK
+%endmacro
+
+;;; ============================================================================
+;;; CRC multiply before XOR against data block (the caller XORs the next data block into XCRC_IN_OUT afterwards)
+%macro CRC_CLMUL 3
+%define %%XCRC_IN_OUT %1 ; [in/out] XMM with CRC state; replaced by the XOR of the two carry-less products
+%define %%XCRC_MUL %2 ; [in] XMM with the pair of 64-bit CRC constants
+%define %%XTMP %3 ; [clobbered] temporary XMM
+
+ vpclmulqdq %%XTMP, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01 ; XTMP = CRC[127:64] * MUL[63:0]
+ vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10 ; CRC = CRC[63:0] * MUL[127:64]
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XTMP ; combine both products
+%endmacro
+
+;;; ============================================================================
+;;; PON stitched algorithm round on a single AES block (16 bytes):
+;;; AES-CTR (optional, depending on %%CIPH)
+;;; - prepares counter block
+;;; - encrypts counter block
+;;; - loads text
+;;; - xor's text against encrypted blocks
+;;; - stores cipher text
+;;; BIP
+;;; - BIP update on 4 x 32-bits
+;;; CRC32
+;;; - CRC32 calculation
+;;; Note: via selection of no_crc, no_bip, no_load, no_store different macro
+;;; behaviour can be achieved to match needs of the overall algorithm.
+%macro DO_PON 15
+%define %%KP %1 ; [in] GP, pointer to expanded keys
+%define %%N_ROUNDS %2 ; [in] number of AES rounds (10, 12 or 14)
+%define %%CTR %3 ; [in/out] XMM with counter block
+%define %%INP %4 ; [in/out] GP with input text pointer or "no_load"
+%define %%OUTP %5 ; [in/out] GP with output text pointer or "no_store"
+%define %%XBIP_IN_OUT %6 ; [in/out] XMM with BIP value or "no_bip"
+%define %%XCRC_IN_OUT %7 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
+%define %%XCRC_MUL %8 ; [in] XMM with CRC constant (can be anything if "no_crc" below)
+%define %%TXMM0 %9 ; [clobbered|out] XMM temporary or data out (no_store)
+%define %%TXMM1 %10 ; [clobbered|in] XMM temporary or data in (no_load)
+%define %%TXMM2 %11 ; [clobbered] XMM temporary
+%define %%CRC_TYPE %12 ; [in] "first_crc" or "next_crc" or "no_crc"
+%define %%DIR %13 ; [in] "ENC" or "DEC"
+%define %%CIPH %14 ; [in] "CTR" or "NO_CTR"
+%define %%CTR_CHECK %15 ; [in/out] GP with 64bit counter (to identify overflow)
+
+%ifidn %%CIPH, CTR
+ ;; prepare counter blocks for encryption
+ vpshufb %%TXMM0, %%CTR, [rel byteswap_const] ; TXMM0 = byte-swapped copy of the current counter (this is what gets encrypted)
+ ;; perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1 ; CF set when the tracked 64-bit counter wraps
+ jc %%_ctr_overflow
+ vpaddq %%CTR, %%CTR, [rel ddq_add_1] ; no wrap: plain increment (constant defined outside this view)
+ jmp %%_ctr_overflow_done
+%%_ctr_overflow:
+ vpaddq %%CTR, %%CTR, [rel ddq_add_1_1] ; wrap: presumably increments both qwords to carry into the high half - constant not visible here
+%%_ctr_overflow_done:
+%endif
+
+ ;; CRC calculation
+%ifidn %%CRC_TYPE, next_crc
+ ;; CRC_MUL macro could be used here but its xor affects
+ ;; performance (blocks cipher xor's) so doing CLMUL
+ ;; only here and xor is done after the cipher.
+ vpclmulqdq %%TXMM2, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01 ; TXMM2 stays live until the xor after the cipher
+ vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
+%endif
+
+%ifnidn %%INP, no_load
+ vmovdqu %%TXMM1, [%%INP] ; TXMM1 = 16 bytes of input text
+%endif
+
+%ifidn %%CIPH, CTR
+ ;; AES rounds
+ AES_ENC_ROUNDS %%KP, %%N_ROUNDS, %%TXMM0
+
+ ;; xor plaintext/ciphertext against encrypted counter blocks
+ vpxor %%TXMM0, %%TXMM0, %%TXMM1 ; TXMM0 = output text, TXMM1 still holds the input text
+%else ;; CIPH = NO_CTR
+ ;; register copy is needed as no_load/no_store options need it
+ vmovdqa %%TXMM0, %%TXMM1
+%endif ;; CIPH = CTR
+
+%ifnidn %%CRC_TYPE, no_crc
+%ifidn %%CRC_TYPE, next_crc
+ ;; Finish split CRC_MUL() operation
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM2
+%endif
+%ifidn %%CIPH, CTR
+ ;; CRC calculation for ENCRYPTION/DECRYPTION
+ ;; - always XOR against plaintext block
+%ifidn %%DIR, ENC
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM1 ; ENC: plaintext is the input
+%else
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM0 ; DEC: plaintext is the output
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ ;; CRC calculation for NO CIPHER option
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM1
+%endif ;; CIPH = CTR
+%endif ;; CRC_TYPE != NO_CRC
+
+ ;; store the result in the output buffer
+%ifnidn %%OUTP, no_store
+%ifidn %%CIPH, CTR
+ vmovdqu [%%OUTP], %%TXMM0
+%else ;; CIPH = NO_CTR
+ vmovdqu [%%OUTP], %%TXMM1 ; no cipher: output is a copy of the input
+%endif ;; CIPH = CTR
+%endif
+
+ ;; update BIP value - always use cipher text for BIP
+%ifnidn %%XBIP_IN_OUT, no_bip
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM0 ; ENC: ciphertext is the output
+%else
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM1 ; DEC: ciphertext is the input
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM1
+%endif ;; CIPH = CTR
+%endif ;; !NO_BIP
+
+ ;; increment in/out pointers
+%ifnidn %%INP, no_load
+ add %%INP, 16
+%endif
+%ifnidn %%OUTP, no_store
+ add %%OUTP, 16
+%endif
+%endmacro ; DO_PON
+
+;;; ============================================================================
+;;; PON stitched algorithm round on four AES blocks (64 bytes):
+;;; AES-CTR (optional, depending on %%CIPH)
+;;; - prepares 4 counter blocks
+;;; - encrypts counter blocks
+;;; - loads 64 bytes of text
+;;; - xor's text against encrypted blocks
+;;; - stores cipher text
+;;; BIP
+;;; - BIP update on 4 x 32-bits
+;;; CRC32
+;;; - CRC32 calculation
+;;; Note: unlike DO_PON, this macro always loads, stores and updates both BIP
+;;; and CRC; only first_crc/next_crc, ENC/DEC and CTR/NO_CTR are selectable.
+%macro DO_PON_4 23
+%define %%KP %1 ; [in] GP, pointer to expanded keys
+%define %%N_ROUNDS %2 ; [in] number of AES rounds (10, 12 or 14)
+%define %%CTR %3 ; [in/out] XMM with counter block
+%define %%INP %4 ; [in/out] GP with input text pointer (advanced by 64)
+%define %%OUTP %5 ; [in/out] GP with output text pointer (advanced by 64)
+%define %%XBIP_IN_OUT %6 ; [in/out] XMM with BIP value
+%define %%XCRC_IN_OUT %7 ; [in/out] XMM with CRC
+%define %%XCRC_MUL %8 ; [in] XMM with CRC constant
+%define %%T0 %9 ; [clobbered] XMM temporary
+%define %%T1 %10 ; [clobbered] XMM temporary
+%define %%T2 %11 ; [clobbered] XMM temporary
+%define %%T3 %12 ; [clobbered] XMM temporary
+%define %%T4 %13 ; [clobbered] XMM temporary
+%define %%T5 %14 ; [clobbered] XMM temporary
+%define %%T6 %15 ; [clobbered] XMM temporary
+%define %%T7 %16 ; [clobbered] XMM temporary
+%define %%T8 %17 ; [clobbered] XMM temporary
+%define %%T9 %18 ; [clobbered] XMM temporary
+%define %%T10 %19 ; [clobbered] XMM temporary
+%define %%CRC_TYPE %20 ; [in] "first_crc" or "next_crc" ("no_crc" is not handled here - CRC is always updated)
+%define %%DIR %21 ; [in] "ENC" or "DEC"
+%define %%CIPH %22 ; [in] "CTR" or "NO_CTR"
+%define %%CTR_CHECK %23 ; [in/out] GP with 64bit counter (to identify overflow)
+
+%define %%CTR1 %%T3 ; counter block 1, then output text block 1
+%define %%CTR2 %%T4 ; counter block 2, then output text block 2
+%define %%CTR3 %%T5 ; counter block 3, then output text block 3
+%define %%CTR4 %%T6 ; counter block 4, then output text block 4
+
+%define %%TXT1 %%T7 ; input text block 1
+%define %%TXT2 %%T8 ; input text block 2
+%define %%TXT3 %%T9 ; input text block 3
+%define %%TXT4 %%T10 ; input text block 4
+
+%ifidn %%CIPH, CTR
+ ;; prepare counter blocks for encryption
+ vmovdqa %%T0, [rel ddq_add_1] ; T0 = increment used when there is no 64-bit wrap
+ vmovdqa %%T2, [rel byteswap_const] ; T2 = byte swap shuffle mask
+
+ ;; CTR1: copy saved CTR value as CTR1
+ vmovdqa %%CTR1, %%CTR
+
+ cmp %%CTR_CHECK, 0xffff_ffff_ffff_ffff - 4
+ ja %%_ctr_will_overflow ; unsigned: one of the next 4 increments wraps the 64-bit counter
+
+ ;; case in which 64-bit counter will not overflow
+ vpaddq %%CTR2, %%CTR1, %%T0
+ vpaddq %%CTR3, %%CTR2, %%T0
+ vpaddq %%CTR4, %%CTR3, %%T0
+ vpaddq %%CTR, %%CTR4, %%T0 ; CTR = counter for the next call
+ vpshufb %%CTR1, %%CTR1, %%T2
+ vpshufb %%CTR2, %%CTR2, %%T2
+ vpshufb %%CTR3, %%CTR3, %%T2
+ vpshufb %%CTR4, %%CTR4, %%T2
+ add %%CTR_CHECK, 4
+ jmp %%_ctr_update_done
+
+%%_ctr_will_overflow:
+ vmovdqa %%T1, [rel ddq_add_1_1] ; T1 = increment used on 64-bit wrap (constant defined outside this view)
+ ;; CTR2: perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr2_overflow
+ vpaddq %%CTR2, %%CTR1, %%T0
+ jmp %%_ctr2_overflow_done
+%%_ctr2_overflow:
+ vpaddq %%CTR2, %%CTR1, %%T1
+%%_ctr2_overflow_done:
+ vpshufb %%CTR1, %%CTR1, %%T2 ; swap CTR1 only after CTR2 has been derived from it
+
+ ;; CTR3: perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr3_overflow
+ vpaddq %%CTR3, %%CTR2, %%T0
+ jmp %%_ctr3_overflow_done
+%%_ctr3_overflow:
+ vpaddq %%CTR3, %%CTR2, %%T1
+%%_ctr3_overflow_done:
+ vpshufb %%CTR2, %%CTR2, %%T2
+
+ ;; CTR4: perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr4_overflow
+ vpaddq %%CTR4, %%CTR3, %%T0
+ jmp %%_ctr4_overflow_done
+%%_ctr4_overflow:
+ vpaddq %%CTR4, %%CTR3, %%T1
+%%_ctr4_overflow_done:
+ vpshufb %%CTR3, %%CTR3, %%T2
+
+ ;; CTR: perform 1 increment on whole 128 bits (for the next iteration)
+ add %%CTR_CHECK, 1
+ jc %%_ctr_overflow
+ vpaddq %%CTR, %%CTR4, %%T0
+ jmp %%_ctr_overflow_done
+%%_ctr_overflow:
+ vpaddq %%CTR, %%CTR4, %%T1
+%%_ctr_overflow_done:
+ vpshufb %%CTR4, %%CTR4, %%T2
+%%_ctr_update_done:
+%endif
+
+%ifidn %%CRC_TYPE, next_crc
+ ;; CRC_MUL macro could be used here but its xor affects
+ ;; performance (blocks cipher xor's) so doing CLMUL
+ ;; only here and xor is done after the cipher.
+ vpclmulqdq %%T2, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01 ; T2 (byte swap mask no longer needed) stays live until the xor below
+ vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
+%endif
+
+ ;; load plaintext/ciphertext
+ vmovdqu %%TXT1, [%%INP]
+ vmovdqu %%TXT2, [%%INP + 16]
+ vmovdqu %%TXT3, [%%INP + 32]
+ vmovdqu %%TXT4, [%%INP + 48]
+
+%ifidn %%CIPH, CTR
+ AES_ENC_ROUNDS_4 %%KP, %%N_ROUNDS, %%CTR1, %%CTR2, %%CTR3, %%CTR4, %%T0
+
+ ;; xor plaintext/ciphertext against encrypted counter blocks
+ vpxor %%CTR1, %%CTR1, %%TXT1 ; CTR1..CTR4 now hold the output text
+ vpxor %%CTR2, %%CTR2, %%TXT2
+ vpxor %%CTR3, %%CTR3, %%TXT3
+ vpxor %%CTR4, %%CTR4, %%TXT4
+%endif ;; CIPH = CTR
+
+%ifidn %%CRC_TYPE, next_crc
+ ;; Finish split CRC_MUL() operation
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%T2
+%endif
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+ ;; CRC calculation for ENCRYPTION (blocks 1 & 2)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT1
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT2
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+%else
+ ;; CRC calculation for DECRYPTION (blocks 1 & 2)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR1
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR2
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ ;; CRC calculation for NO CIPHER option (blocks 1 & 2)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT1
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT2
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+%endif ;; CIPH = CTR
+
+ ;; store ciphertext/plaintext
+%ifidn %%CIPH, CTR
+ vmovdqu [%%OUTP], %%CTR1
+ vmovdqu [%%OUTP + 16], %%CTR2
+ vmovdqu [%%OUTP + 32], %%CTR3
+ vmovdqu [%%OUTP + 48], %%CTR4
+%else ;; CIPH = NO_CTR
+ vmovdqu [%%OUTP], %%TXT1
+ vmovdqu [%%OUTP + 16], %%TXT2
+ vmovdqu [%%OUTP + 32], %%TXT3
+ vmovdqu [%%OUTP + 48], %%TXT4
+%endif ;; CIPH = CTR
+
+ ;; update BIP value
+%ifidn %%CIPH, CTR
+ ;; - always use ciphertext for BIP
+%ifidn %%DIR, ENC
+ vpxor %%T0, %%CTR1, %%CTR2 ; ENC: ciphertext is the output
+ vpxor %%T1, %%CTR3, %%CTR4
+%else
+ vpxor %%T0, %%TXT1, %%TXT2 ; DEC: ciphertext is the input
+ vpxor %%T1, %%TXT3, %%TXT4
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ vpxor %%T0, %%TXT1, %%TXT2
+ vpxor %%T1, %%TXT3, %%TXT4
+%endif ;; CIPH = CTR
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%T0
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%T1
+
+ ;; increment in/out pointers
+ add %%INP, 64
+ add %%OUTP, 64
+
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+ ;; CRC calculation for ENCRYPTION (blocks 3 & 4)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT3
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT4 ; block 4 is xor'ed only; the multiply is left to the next step
+%else
+ ;; CRC calculation for DECRYPTION (blocks 3 & 4)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR3
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR4 ; block 4 is xor'ed only; the multiply is left to the next step
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ ;; CRC calculation for NO CIPHER option (blocks 3 & 4)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT3
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT4 ; block 4 is xor'ed only; the multiply is left to the next step
+%endif ;; CIPH = CTR
+
+%endmacro ; DO_PON_4
+
+;;; ============================================================================
+;;; CIPHER and BIP specified number of bytes (no CRC): whole 16-byte blocks in a loop, then one optional partial block
+%macro CIPHER_BIP_REST 14
+%define %%NUM_BYTES %1 ; [in/clobbered] number of bytes to cipher
+%define %%DIR %2 ; [in] "ENC" or "DEC"
+%define %%CIPH %3 ; [in] "CTR" or "NO_CTR"
+%define %%PTR_IN %4 ; [in/clobbered] GPR pointer to input buffer
+%define %%PTR_OUT %5 ; [in/clobbered] GPR pointer to output buffer
+%define %%PTR_KEYS %6 ; [in] GPR pointer to expanded keys
+%define %%XBIP_IN_OUT %7 ; [in/out] XMM 128-bit BIP state
+%define %%XCTR_IN_OUT %8 ; [in/out] XMM 128-bit AES counter block
+%define %%XMMT1 %9 ; [clobbered] temporary XMM
+%define %%XMMT2 %10 ; [clobbered] temporary XMM
+%define %%XMMT3 %11 ; [clobbered] temporary XMM
+%define %%CTR_CHECK %12 ; [in/out] GP with 64bit counter (to identify overflow)
+%define %%GPT1 %13 ; [clobbered] temporary GP
+%define %%GPT2 %14 ; [clobbered] temporary GP
+
+ align 16
+%%_cipher_last_blocks:
+ cmp %%NUM_BYTES, 16
+ jb %%_partial_block_left ; fewer than 16 bytes left (unsigned compare)
+
+ DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, %%PTR_IN, %%PTR_OUT, %%XBIP_IN_OUT, \
+ no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
+ sub %%NUM_BYTES, 16
+ jz %%_bip_done ; length was a multiple of 16: no partial block
+ jmp %%_cipher_last_blocks
+
+%%_partial_block_left:
+ simd_load_avx_15_1 %%XMMT2, %%PTR_IN, %%NUM_BYTES ; helper defined outside this view; presumably loads NUM_BYTES (1..15) bytes
+
+ ;; DO_PON() is not loading nor storing the data in this case:
+ ;; XMMT2 = data in
+ ;; XMMT1 = data out
+ DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, no_load, no_store, no_bip, \
+ no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
+
+ ;; bip update for partial block (mask out bytes outside the message)
+ lea %%GPT1, [rel mask_out_top_bytes + 16]
+ sub %%GPT1, %%NUM_BYTES ; GPT1 = table + (16 - NUM_BYTES); table contents are not visible here
+ vmovdqu %%XMMT3, [%%GPT1]
+ ;; put masked cipher text into XMMT2 for BIP update
+%ifidn %%DIR, ENC
+ vpand %%XMMT2, %%XMMT1, %%XMMT3 ; ENC: cipher text is the output (XMMT1)
+%else
+ vpand %%XMMT2, %%XMMT2, %%XMMT3 ; DEC: cipher text is the input (XMMT2)
+%endif
+ vpxor %%XBIP_IN_OUT, %%XMMT2
+
+ ;; store partial bytes in the output buffer
+ simd_store_avx_15 %%PTR_OUT, %%XMMT1, %%NUM_BYTES, %%GPT1, %%GPT2 ; pointers are not advanced for the partial block
+
+%%_bip_done:
+%endmacro ; CIPHER_BIP_REST
+
+;; =============================================================================
+;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial (result is bit-inverted before return)
+
+%macro CRC32_REDUCE_128_TO_32 5
+%define %%CRC %1 ; [out] GP to store 32-bit Ethernet FCS value
+%define %%XCRC %2 ; [in/clobbered] XMM with CRC
+%define %%XT1 %3 ; [clobbered] temporary xmm register
+%define %%XT2 %4 ; [clobbered] temporary xmm register
+%define %%XT3 %5 ; [clobbered] temporary xmm register
+
+%define %%XCRCKEY %%XT3 ; XT3 holds the folding / reduction constants
+
+ ;; compute crc of a 128-bit value
+ vmovdqa %%XCRCKEY, [rel rk5] ; constants defined outside this view
+
+ ;; 64b fold
+ vpclmulqdq %%XT1, %%XCRC, %%XCRCKEY, 0x00
+ vpsrldq %%XCRC, %%XCRC, 8
+ vpxor %%XCRC, %%XCRC, %%XT1
+
+ ;; 32b fold
+ vpslldq %%XT1, %%XCRC, 4
+ vpclmulqdq %%XT1, %%XT1, %%XCRCKEY, 0x10
+ vpxor %%XCRC, %%XCRC, %%XT1
+
+%%_crc_barrett:
+ ;; Barrett reduction
+ vpand %%XCRC, [rel mask2]
+ vmovdqa %%XT1, %%XCRC
+ vmovdqa %%XT2, %%XCRC
+ vmovdqa %%XCRCKEY, [rel rk7] ; constants defined outside this view
+
+ vpclmulqdq %%XCRC, %%XCRCKEY, 0x00
+ vpxor %%XCRC, %%XT2
+ vpand %%XCRC, [rel mask]
+ vmovdqa %%XT2, %%XCRC
+ vpclmulqdq %%XCRC, %%XCRCKEY, 0x10
+ vpxor %%XCRC, %%XT2
+ vpxor %%XCRC, %%XT1
+ vpextrd DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value
+ not DWORD(%%CRC) ; final bit inversion of the 32-bit value
+%endmacro
+
+;; =============================================================================
+;; Barrett reduction from 128-bits to 32-bits modulo 0x53900000 polynomial
+;;
+;; Two folding steps with the K3 constant bring the value down to 64 bits,
+;; then the same 64-to-32 bit sequence as HEC_REDUCE_64_TO_32 is applied.
+;; Constants are loaded RIP-relative, matching every other constant load in
+;; this file.
+
+%macro HEC_REDUCE_128_TO_32 4
+%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out
+%define %%XT1 %2 ; [clobbered] temporary xmm register
+%define %%XT2 %3 ; [clobbered] temporary xmm register
+%define %%XT3 %4 ; [clobbered] temporary xmm register
+
+%define %%K3_Q %%XT1 ; K3 and Q constants (one per qword, selected by the imm8 below)
+%define %%P_RES %%XT2 ; P constant (low qword is used)
+%define %%XTMP %%XT3
+
+ ;; 128 to 64 bit reduction
+ vmovdqa %%K3_Q, [rel k3_q]
+ vmovdqa %%P_RES, [rel p_res]
+
+ vpclmulqdq %%XTMP, %%XMM_IN_OUT, %%K3_Q, 0x01 ; K3
+ vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT
+
+ vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x01 ; K3
+ vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
+
+ vpand %%XMM_IN_OUT, [rel mask_out_top_64bits]
+
+ ;; 64 to 32 bit reduction
+ vpsrldq %%XTMP, %%XMM_IN_OUT, 4
+ vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x10 ; Q
+ vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT
+ vpsrldq %%XTMP, %%XTMP, 4
+
+ vpclmulqdq %%XTMP, %%XTMP, %%P_RES, 0x00 ; P
+ vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
+%endmacro
+
+;; =============================================================================
+;; Barrett reduction from 64-bits to 32-bits modulo 0x53900000 polynomial
+;;
+;; Constants are loaded RIP-relative, matching every other constant load in
+;; this file.
+
+%macro HEC_REDUCE_64_TO_32 4
+%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out
+%define %%XT1 %2 ; [clobbered] temporary xmm register
+%define %%XT2 %3 ; [clobbered] temporary xmm register
+%define %%XT3 %4 ; [clobbered] temporary xmm register
+
+%define %%K3_Q %%XT1 ; K3 and Q constants (only Q, the high qword, is used here)
+%define %%P_RES %%XT2 ; P constant (low qword is used)
+%define %%XTMP %%XT3
+
+ vmovdqa %%K3_Q, [rel k3_q]
+ vmovdqa %%P_RES, [rel p_res]
+
+ ;; 64 to 32 bit reduction
+ vpsrldq %%XTMP, %%XMM_IN_OUT, 4
+ vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x10 ; Q
+ vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT
+ vpsrldq %%XTMP, %%XTMP, 4
+
+ vpclmulqdq %%XTMP, %%XTMP, %%P_RES, 0x00 ; P
+ vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
+%endmacro
+
+;; =============================================================================
+;; HEC compute and header update for 32-bit XGEM headers
+;;
+;; Takes the header in LE format, recomputes the 12-bit CRC over the bits
+;; above the 13-bit HEC field, merges it back and sets bit 0 to the parity
+;; of the merged header.
+;;
+;; The macro takes 6 parameters: two GP registers followed by four XMM
+;; temporaries, i.e. the XMM temporaries are parameters %3..%6 (same layout
+;; as HEC_COMPUTE_64).
+%macro HEC_COMPUTE_32 6
+%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format
+%define %%GT1 %2 ; [clobbered] temporary GP register
+%define %%XT1 %3 ; [clobbered] temporary xmm register
+%define %%XT2 %4 ; [clobbered] temporary xmm register
+%define %%XT3 %5 ; [clobbered] temporary xmm register
+%define %%XT4 %6 ; [clobbered] temporary xmm register
+
+ mov DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+ ;; shift out 13 bits of HEC value for CRC computation
+ shr DWORD(%%GT1), 13
+
+ ;; mask out current HEC value to merge with an updated HEC at the end
+ and DWORD(%%HEC_IN_OUT), 0xffff_e000
+
+ ;; prepare the message for CRC computation
+ vmovd %%XT1, DWORD(%%GT1)
+ vpslldq %%XT1, 4 ; shift left by 32-bits
+
+ HEC_REDUCE_64_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+ ;; extract 32-bit value
+ ;; - normally perform 20 bit shift right but bit 0 is a parity bit
+ vmovd DWORD(%%GT1), %%XT1
+ shr DWORD(%%GT1), (20 - 1)
+
+ ;; merge header bytes with updated 12-bit CRC value and
+ ;; compute parity
+ or DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+ popcnt DWORD(%%HEC_IN_OUT), DWORD(%%GT1) ; number of set bits in the merged header
+ and DWORD(%%HEC_IN_OUT), 1 ; keep only the parity (bit 0)
+ or DWORD(%%HEC_IN_OUT), DWORD(%%GT1) ; header + CRC + parity
+%endmacro
+
+;; =============================================================================
+;; HEC compute and header update for 64-bit XGEM headers (recomputes the CRC in the low 13-bit field and sets bit 0 to parity)
+%macro HEC_COMPUTE_64 6
+%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format
+%define %%GT1 %2 ; [clobbered] temporary GP register
+%define %%XT1 %3 ; [clobbered] temporary xmm register
+%define %%XT2 %4 ; [clobbered] temporary xmm register
+%define %%XT3 %5 ; [clobbered] temporary xmm register
+%define %%XT4 %6 ; [clobbered] temporary xmm register
+
+ mov %%GT1, %%HEC_IN_OUT
+ ;; shift out 13 bits of HEC value for CRC computation
+ shr %%GT1, 13
+
+ ;; mask out current HEC value to merge with an updated HEC at the end
+ and %%HEC_IN_OUT, 0xffff_ffff_ffff_e000
+
+ ;; prepare the message for CRC computation
+ vmovq %%XT1, %%GT1
+ vpslldq %%XT1, 4 ; shift left by 32-bits
+
+ HEC_REDUCE_128_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+ ;; extract 32-bit value
+ ;; - normally perform 20 bit shift right but bit 0 is a parity bit
+ vmovd DWORD(%%GT1), %%XT1 ; 32-bit write also clears the upper half of GT1
+ shr DWORD(%%GT1), (20 - 1)
+
+ ;; merge header bytes with updated 12-bit CRC value and
+ ;; compute parity
+ or %%GT1, %%HEC_IN_OUT
+ popcnt %%HEC_IN_OUT, %%GT1 ; number of set bits in the merged header
+ and %%HEC_IN_OUT, 1 ; keep only the parity (bit 0)
+ or %%HEC_IN_OUT, %%GT1 ; header + CRC + parity
+%endmacro
+
+;;; ============================================================================
+;;; PON stitched algorithm of AES128-CTR, CRC and BIP
+;;; - this is master macro that implements encrypt/decrypt API
+;;; - calls other macros and directly uses registers
+;;; defined at the top of the file
+%macro AES128_CTR_PON 2
+%define %%DIR %1 ; [in] direction "ENC" or "DEC"
+%define %%CIPH %2 ; [in] cipher "CTR" or "NO_CTR"
+
+ push r12
+ push r13
+ push r14
+%ifndef LINUX
+ push r15
+%endif
+
+%ifidn %%DIR, ENC
+ ;; by default write back CRC for encryption
+ mov DWORD(write_back_crc), 1
+%else
+ ;; mark decryption as finished (non-zero flag = no final decrypt pass needed)
+ mov DWORD(decrypt_not_done), 1
+%endif
+ ;; START BIP (and update HEC if encrypt direction)
+ ;; - load XGEM header (8 bytes) for BIP (not part of encrypted payload)
+ ;; - convert it into LE
+ ;; - update HEC field in the header
+ ;; - convert it into BE
+ ;; - store back the header (with updated HEC)
+ ;; - start BIP
+ ;; (free to use tmp_1, tmp_2 and tmp_3 at this stage)
+ mov tmp_2, [job + _src]
+ add tmp_2, [job + _hash_start_src_offset_in_bytes] ; tmp_2 = address of the XGEM header
+ mov tmp_3, [tmp_2] ; tmp_3 = 8 header bytes as stored in memory
+%ifidn %%DIR, ENC
+ bswap tmp_3 ; go to LE
+ HEC_COMPUTE_64 tmp_3, tmp_1, xtmp1, xtmp2, xtmp3, xtmp4
+ mov bytes_to_crc, tmp_3
+ shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits
+ bswap tmp_3 ; go back to BE
+ mov [tmp_2], tmp_3 ; header with updated HEC goes back into the source buffer
+ vmovq xbip, tmp_3 ; BIP starts with the 8 header bytes
+%else
+ vmovq xbip, tmp_3 ; BIP starts with the 8 header bytes
+ mov bytes_to_crc, tmp_3
+ bswap bytes_to_crc ; go to LE
+ shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits
+%endif
+ cmp bytes_to_crc, 4
+ ja %%_crc_not_zero
+ ;; XGEM payload shorter or equal to 4 bytes
+%ifidn %%DIR, ENC
+ ;; On encryption, do not write Ethernet FCS back into the message
+ xor DWORD(write_back_crc), DWORD(write_back_crc)
+%else
+ ;; Mark decryption as not finished
+ ;; - Ethernet FCS is not computed
+ ;; - decrypt + BIP to be done at the end
+ xor DWORD(decrypt_not_done), DWORD(decrypt_not_done)
+%endif
+ mov DWORD(bytes_to_crc), 4 ; it will be zero after the next line (avoid jmp)
+%%_crc_not_zero:
+ sub bytes_to_crc, 4 ; subtract size of the CRC itself
+
+%ifidn %%CIPH, CTR
+ ;; - read 16 bytes of IV
+ ;; - convert to little endian format
+ ;; - save least significant 8 bytes in GP register for overflow check
+ mov tmp, [job + _iv]
+ vmovdqu xcounter, [tmp]
+ vpshufb xcounter, [rel byteswap_const]
+ vmovq ctr_check, xcounter
+%endif
+
+ ;; get input buffer (after XGEM header)
+ mov p_in, [job + _src]
+ add p_in, [job + _cipher_start_src_offset_in_bytes]
+
+ ;; get output buffer
+ mov p_out, [job + _dst]
+
+%ifidn %%CIPH, CTR
+ ;; get key pointers
+ mov p_keys, [job + _aes_enc_key_expanded]
+%endif
+
+ ;; initial CRC value
+ vmovdqa xcrc, [rel init_crc_value]
+
+ ;; load CRC constants
+ vmovdqa xcrckey, [rel rk1] ; rk1 and rk2 in xcrckey
+
+ ;; get number of bytes to cipher
+%ifidn %%CIPH, CTR
+ mov num_bytes, [job + _msg_len_to_cipher_in_bytes]
+%else
+ ;; Message length to cipher is 0
+ ;; - length is obtained from message length to hash (BIP) minus XGEM header size
+ mov num_bytes, [job + _msg_len_to_hash_in_bytes]
+ sub num_bytes, 8
+%endif
+ or bytes_to_crc, bytes_to_crc
+ jz %%_crc_done ; nothing to CRC: the CRC computation is skipped entirely
+
+ cmp bytes_to_crc, 32
+ jae %%_at_least_32_bytes
+
+%ifidn %%DIR, DEC
+ ;; decrypt the buffer first
+ mov tmp, num_bytes
+ CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+ ;; correct in/out pointers - go back to start of the buffers
+ mov tmp, num_bytes
+ and tmp, -16 ; partial block handler doesn't increment pointers
+ sub p_in, tmp
+ sub p_out, tmp
+%endif ; DECRYPTION
+
+ ;; less than 32 bytes
+ cmp bytes_to_crc, 16
+ je %%_exact_16_left
+ jl %%_less_than_16_left ; signed jump, but bytes_to_crc is within 1..31 here
+ ;; load the plaintext
+%ifidn %%DIR, ENC
+ vmovdqu xtmp1, [p_in]
+%else
+ vmovdqu xtmp1, [p_out]
+%endif
+ vpxor xcrc, xtmp1 ; xor the initial crc value
+ jmp %%_crc_two_xmms ; 17..31 bytes: the tail is handled by the partial-bytes code
+
+%%_exact_16_left:
+%ifidn %%DIR, ENC
+ vmovdqu xtmp1, [p_in]
+%else
+ vmovdqu xtmp1, [p_out]
+%endif
+ vpxor xcrc, xtmp1 ; xor the initial crc value
+ jmp %%_128_done
+
+%%_less_than_16_left:
+%ifidn %%DIR, ENC
+ simd_load_avx_15_1 xtmp1, p_in, bytes_to_crc
+%else
+ simd_load_avx_15_1 xtmp1, p_out, bytes_to_crc
+%endif
+ vpxor xcrc, xtmp1 ; xor the initial crc value
+
+ lea tmp, [rel pshufb_shf_table]
+ vmovdqu xtmp1, [tmp + bytes_to_crc]
+ vpshufb xcrc, xtmp1
+ jmp %%_128_done
+
+%%_at_least_32_bytes:
+ cmp bytes_to_crc, 64
+ jb %%_crc_below_64_bytes
+
+ DO_PON_4 p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, xtmp6, \
+ xtmp7, xtmp8, xtmp9, xtmp10, xtmp11, first_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 64
+ sub bytes_to_crc, 64
+%ifidn %%DIR, ENC
+ jz %%_128_done
+%endif
+
+ align 16
+%%_main_loop_64:
+ cmp bytes_to_crc, 64
+ jb %%_main_loop
+
+ DO_PON_4 p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, xtmp6, \
+ xtmp7, xtmp8, xtmp9, xtmp10, xtmp11, next_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 64
+ sub bytes_to_crc, 64
+%ifidn %%DIR, ENC
+ jz %%_128_done
+%endif
+ jmp %%_main_loop_64
+
+%%_crc_below_64_bytes:
+ DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, first_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 16
+ sub bytes_to_crc, 16
+
+ align 16
+%%_main_loop:
+ cmp bytes_to_crc, 16
+ jb %%_exit_loop
+ DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, next_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 16
+ sub bytes_to_crc, 16
+%ifidn %%DIR, ENC
+ jz %%_128_done
+%endif
+ jmp %%_main_loop
+
+%%_exit_loop:
+
+%ifidn %%DIR, DEC
+ ;; decrypt rest of the message including CRC and optional padding
+ mov tmp, num_bytes
+
+ CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+ mov tmp, num_bytes ; correct in/out pointers - to point before cipher & BIP
+ and tmp, -16 ; partial block handler doesn't increment pointers
+ sub p_in, tmp
+ sub p_out, tmp
+
+ or bytes_to_crc, bytes_to_crc
+ jz %%_128_done
+%endif ; DECRYPTION
+
+ ;; Partial bytes left - complete CRC calculation
+%%_crc_two_xmms:
+ lea tmp, [rel pshufb_shf_table]
+ vmovdqu xtmp2, [tmp + bytes_to_crc] ; NOTE(review): index is 1..15 from the loops but 17..31 from the short path - confirm the table covers both
+ ;; @note: in case of in-place operation (default) this load is
+ ;; creating store-to-load problem.
+ ;; However, there is no easy way to address it at the moment.
+%ifidn %%DIR, ENC
+ vmovdqu xtmp1, [p_in - 16 + bytes_to_crc] ; xtmp1 = data for CRC
+%else
+ vmovdqu xtmp1, [p_out - 16 + bytes_to_crc] ; xtmp1 = data for CRC
+%endif
+ vmovdqa xtmp3, xcrc
+ vpshufb xcrc, xtmp2 ; top num_bytes with LSB xcrc
+ vpxor xtmp2, [rel mask3]
+ vpshufb xtmp3, xtmp2 ; bottom (16 - num_bytes) with MSB xcrc
+
+ ;; data bytes_to_crc (top) blended with MSB bytes of CRC (bottom)
+ vpblendvb xtmp3, xtmp1, xtmp2
+
+ ;; final CRC calculation
+ vpclmulqdq xtmp1, xcrc, xcrckey, 0x01
+ vpclmulqdq xcrc, xcrc, xcrckey, 0x10
+ vpxor xcrc, xtmp3
+ vpxor xcrc, xtmp1
+
+%%_128_done:
+ CRC32_REDUCE_128_TO_32 ethernet_fcs, xcrc, xtmp1, xtmp2, xcrckey
+
+%%_crc_done:
+ ;; @todo - store-to-load problem in ENC case (to be fixed later)
+ ;; - store CRC in input buffer and authentication tag output
+ ;; - encrypt remaining bytes
+%ifidn %%DIR, ENC
+ or DWORD(write_back_crc), DWORD(write_back_crc)
+ jz %%_skip_crc_write_back
+ mov [p_in + bytes_to_crc], DWORD(ethernet_fcs) ; FCS is written right after the CRC'ed bytes, before they get encrypted below
+%%_skip_crc_write_back:
+%endif
+ mov tmp, [job + _auth_tag_output]
+ mov [tmp + 4], DWORD(ethernet_fcs) ; NOTE(review): when bytes_to_crc was 0 ethernet_fcs was never computed in this macro - confirm callers ignore these 4 bytes
+
+ or num_bytes, num_bytes
+ jz %%_do_not_cipher_the_rest
+
+ ;; encrypt rest of the message
+ ;; - partial bytes including CRC and optional padding
+ ;; decrypt rest of the message
+ ;; - this may only happen when XGEM payload is short and padding is added
+%ifidn %%DIR, DEC
+ or DWORD(decrypt_not_done), DWORD(decrypt_not_done)
+ jnz %%_do_not_cipher_the_rest ; non-zero flag = buffer was already decrypted above
+%endif
+ CIPHER_BIP_REST num_bytes, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+%%_do_not_cipher_the_rest:
+
+ ;; finalize BIP
+ vpsrldq xtmp1, xbip, 4
+ vpsrldq xtmp2, xbip, 8
+ vpsrldq xtmp3, xbip, 12
+ vpxor xtmp1, xtmp1, xtmp2
+ vpxor xbip, xbip, xtmp3
+ vpxor xbip, xbip, xtmp1 ; low dword = XOR of all four 32-bit lanes
+ vmovd [tmp], xbip ; tmp already holds _auth_tag_output
+
+ ;; set job status
+ or dword [job + _status], STS_COMPLETED
+
+ ;; return job
+ mov rax, job
+
+%ifndef LINUX
+ pop r15
+%endif
+ pop r14
+ pop r13
+ pop r12
+%endmacro ; AES128_CTR_PON
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; submit_job_pon_enc_avx(JOB_AES_HMAC *job) - PON encrypt direction with AES-CTR; marks the job STS_COMPLETED and returns the job pointer in rax
+align 64
+MKGLOBAL(submit_job_pon_enc_avx,function,internal)
+submit_job_pon_enc_avx:
+ AES128_CTR_PON ENC, CTR
+ ret
+
+;;; submit_job_pon_dec_avx(JOB_AES_HMAC *job) - PON decrypt direction with AES-CTR; marks the job STS_COMPLETED and returns the job pointer in rax
+align 64
+MKGLOBAL(submit_job_pon_dec_avx,function,internal)
+submit_job_pon_dec_avx:
+ AES128_CTR_PON DEC, CTR
+ ret
+
+;;; submit_job_pon_enc_no_ctr_avx(JOB_AES_HMAC *job) - PON encrypt direction without cipher (HEC, CRC and BIP only); returns the job pointer in rax
+align 64
+MKGLOBAL(submit_job_pon_enc_no_ctr_avx,function,internal)
+submit_job_pon_enc_no_ctr_avx:
+ AES128_CTR_PON ENC, NO_CTR
+ ret
+
+;;; submit_job_pon_dec_no_ctr_avx(JOB_AES_HMAC *job) - PON decrypt direction without cipher (CRC and BIP only); returns the job pointer in rax
+align 64
+MKGLOBAL(submit_job_pon_dec_no_ctr_avx,function,internal)
+submit_job_pon_dec_no_ctr_avx:
+ AES128_CTR_PON DEC, NO_CTR
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm
new file mode 100644
index 000000000..b850a227b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm
@@ -0,0 +1,434 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+section .data
+default rel
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ; vpshufb control that reverses the bytes of each dword (ddq 0x0c0d0e0f08090a0b0405060700010203)
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: ; round constant loaded for steps 0-19, replicated in all four dword lanes (ddq 0x5A8279995A8279995A8279995A827999)
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ; round constant loaded for steps 20-39 (ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1)
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ; round constant loaded for steps 40-59 (ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC)
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ; round constant loaded for steps 60-79 (ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6)
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+
+;; code to compute quad SHA1 using AVX
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
+;; rbx, rsi, rdi, rbp, r12-r15 left intact
+;; This version is not safe to call from C/C++
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+; TRANSPOSE r0, r1, r2, r3, t0, t1
+; 4x4 dword "transpose" of the data in {r0..r3} using the two temps {t0, t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3} (note the rows land in t0, r1, r0, r3 - not back in r0..r3)
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+; r2 and t1 hold intermediate values on exit
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} (low halves of a and b)
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} (high halves of a and b)
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} (low halves of c and d)
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} (high halves of c and d)
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} (odd dwords of t0, t1)
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} (odd dwords of r0, r2)
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} (even dwords; must follow the r3 shuffle, which still needs the old r0)
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} (even dwords; must follow the r1 shuffle, which still needs the old t0)
+%endmacro
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; MAGIC_F0 F,B,C,D,T ;; F = D ^ (B & (C ^ D)), per dword lane; T is accepted for a uniform signature but not used
+%macro MAGIC_F0 5
+%define %%out %1
+%define %%inB %2
+%define %%inC %3
+%define %%inD %4
+%define %%unusedT %5
+ vpxor %%out, %%inD, %%inC ; out = C ^ D
+ vpand %%out, %%inB, %%out ; out = B & (C ^ D)
+ vpxor %%out, %%inD, %%out ; out = D ^ (B & (C ^ D))
+%endmacro
+
+; MAGIC_F1 F,B,C,D,T ;; F = B ^ C ^ D, per dword lane; T is accepted for a uniform signature but not used
+%macro MAGIC_F1 5
+%define %%out %1
+%define %%inB %2
+%define %%inC %3
+%define %%inD %4
+%define %%unusedT %5
+ vpxor %%out, %%inC, %%inD ; out = C ^ D
+ vpxor %%out, %%inB, %%out ; out = B ^ C ^ D
+%endmacro
+
+; MAGIC_F2 F,B,C,D,T ;; F = (B & C) | (B & D) | (C & D), computed as ((B | C) & D) | (B & C); T is clobbered
+%macro MAGIC_F2 5
+%define %%out %1
+%define %%inB %2
+%define %%inC %3
+%define %%inD %4
+%define %%scratch %5
+ vpor %%out, %%inC, %%inB ; out = B | C
+ vpand %%scratch, %%inC, %%inB ; scratch = B & C
+ vpand %%out, %%inD, %%out ; out = (B | C) & D
+ vpor %%out, %%scratch, %%out ; out = ((B | C) & D) | (B & C)
+%endmacro
+
+; MAGIC_F3 F,B,C,D,T ;; F = B ^ C ^ D -- identical to MAGIC_F1, so simply forward to it
+%macro MAGIC_F3 5
+%define %%out %1
+%define %%inB %2
+%define %%inC %3
+%define %%inD %4
+%define %%tmp %5
+ MAGIC_F1 %%out, %%inB, %%inC, %%inD, %%tmp
+%endmacro
+
+; PROLD reg, imm, tmp -- rotate every dword of reg left by imm bits, in place; tmp is clobbered
+%macro PROLD 3
+%define %%val %1
+%define %%bits %2
+%define %%scratch %3
+ vpsrld %%scratch, %%val, (32 - (%%bits)) ; bits that wrap around to the bottom
+ vpslld %%val, %%val, %%bits ; remaining bits shifted up
+ vpor %%val, %%scratch, %%val ; combine both parts
+%endmacro
+
+; non-destructive rotate: src is only read
+; PROLD_nd reg, imm, tmp, src -- reg = each dword of src rotated left by imm bits; tmp is clobbered
+%macro PROLD_nd 4
+%define %%dst %1
+%define %%bits %2
+%define %%scratch %3
+%define %%from %4
+ vpsrld %%scratch, %%from, (32 - (%%bits)) ; bits that wrap around to the bottom
+ vpslld %%dst, %%from, %%bits ; remaining bits shifted up
+ vpor %%dst, %%scratch, %%dst ; combine both parts
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT ; E += round constant (%9 is passed as a register by the callers in this file)
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 16)] ; E += W[memW], read from the 16-slot stack buffer
+ PROLD_nd %%regT,5, %%regF,%%regA ; T = A rol 5 (F is only scratch here)
+ vpaddd %%regE, %%regE,%%regT ; E += A rol 5
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT ; B = B rol 30, in place (T is scratch)
+ vpaddd %%regE, %%regE,%%regF ; E += FUN; E now holds the new "A" once the caller runs ROTATE_ARGS
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT ; E += round constant
+;; message schedule: on entry W16 must hold W[memW-16]; W15 is carried along for the next step
+ vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16] ; W14 = W[memW-14] (buffer is indexed modulo 16)
+ vpxor W16, W16, W14 ; W16 = W[-16] ^ W[-14]
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16] ; ... ^ W[-8]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16] ; ... ^ W[-3]
+;; rotate the xor result left by 1; the new W ends up in regF
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+;; store the new W over the slot of the word it replaces, then do the round proper
+ vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF ; E += W[memW]
+;; same round body as SHA1_STEP_00_15 from here on
+ PROLD_nd %%regT,5, %%regF, %%regA ; T = A rol 5 (F is scratch; the new W was already stored and added)
+ vpaddd %%regE, %%regE,%%regT ; E += A rol 5
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT ; B = B rol 30, in place
+ vpaddd %%regE,%%regE,%%regF ; E += FUN
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ must be an odd multiple of 8: 16 slots of 16 bytes for W plus 8 bytes, so rsp is 16-byte aligned after "sub rsp, FRAMESZ" given the aligned-before-call requirement stated in this file
+%define FRAMESZ 16*16 + 8
+
+%define VMOVPS vmovdqu
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+;; note: T0-T5 above share registers with G, K and AA-DD; they are used only while loading/transposing input
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E ; assembly-time renaming only, no instructions are emitted
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_ ; the register that accumulated the round result (old E) becomes the new A
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16 ; assembly-time renaming only, no instructions are emitted
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_ ; the consumed W16 register is recycled as the next W14 load target
+%endm
+
+align 32
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void sha1_mult_avx(SHA1_ARGS *args, UINT32 size_in_blocks);
+; arg 1 : arg1 (rdi on Linux, rcx otherwise) : pointer to args; digests and the four data pointers are updated in place
+; arg 2 : arg2 (rsi on Linux, rdx otherwise) : size (in blocks) ;; assumed to be >= 1; used as the loop counter and destroyed
+MKGLOBAL(sha1_mult_avx,function,internal)
+sha1_mult_avx:
+
+ sub rsp, FRAMESZ ; reserve the 16-entry W buffer addressed as [rsp + i*16]
+
+ ;; Initialize digests: one row per state word (A..E), loaded with aligned moves
+ vmovdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE]
+ vmovdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE]
+ vmovdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE]
+ vmovdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE]
+ vmovdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE]
+
+ ;; fetch the four per-lane input pointers
+ mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ]
+ mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ]
+ mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ]
+ mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ]
+
+ xor IDX, IDX ; IDX = byte offset into every lane's input; grows by 64 per block
+lloop:
+ vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK] ; reloaded every block because F is reused as FUN/scratch by the rounds
+%assign I 0
+%rep 4
+ VMOVPS T2,[inp0+IDX]
+ VMOVPS T1,[inp1+IDX]
+ VMOVPS T4,[inp2+IDX]
+ VMOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5 ; rows come out in T0, T1, T2, T3
+ vpshufb T0, T0, F ; byte-swap each dword, then store as W[I*4+0..3]
+ vmovdqa [rsp+(I*4+0)*16],T0
+ vpshufb T1, T1, F
+ vmovdqa [rsp+(I*4+1)*16],T1
+ vpshufb T2, T2, F
+ vmovdqa [rsp+(I*4+2)*16],T2
+ vpshufb T3, T3, F
+ vmovdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests (AA-DD reuse the T2-T5 registers, which are free again at this point)
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [rel K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19 (W16/W15 are primed with W[0] and W[1] for the first scheduled step)
+ vmovdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqa K, [rel K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqa K, [rel K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqa K, [rel K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+;; feed-forward: add the digest values saved before the rounds
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub arg2, 1 ; one block done; loop while blocks remain (a count of 0 on entry would wrap, hence the >= 1 assumption)
+ jne lloop
+
+ ; write out digests
+ vmovdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A
+ vmovdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B
+ vmovdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C
+ vmovdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D
+ vmovdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E
+
+ ; update input pointers: advance each lane by the total number of bytes consumed (IDX)
+ add inp0, IDX
+ mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ ;; Clear all stack containing part of message (only the 16 W slots; the XMM registers themselves are not cleared here)
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+%assign i 0
+%rep 16
+ vmovdqa [rsp + i*16], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, FRAMESZ ; release the W buffer
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm
new file mode 100644
index 000000000..090285e54
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm
@@ -0,0 +1,501 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; SHA1 code, hybrid, rolled, interleaved
+; Uses AVX instructions
+%include "include/os.asm"
+
+section .data
+default rel
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: ; vpshufb control that reverses the bytes of each dword (ddq 0x0c0d0e0f08090a0b0405060700010203)
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: ; round constant loaded into XK for rounds 00-19, replicated in all four dwords (ddq 0x5A8279995A8279995A8279995A827999)
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ; round constant loaded into XK for rounds 20-39 (ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1)
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ; round constant loaded into XK for rounds 40-59 (ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC)
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ; round constant loaded into XK for rounds 60-79 (ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6)
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+%ifdef LINUX
+%define INP rdi ; 1st arg
+%define CTX rsi ; 2nd arg
+%define REG3 edx
+%define REG4 ecx
+%else
+%define INP rcx ; 1st arg
+%define CTX rdx ; 2nd arg
+%define REG3 edi
+%define REG4 esi
+%endif
+
+%define FRAMESZ 3*16 + 1*8
+%define _RSP FRAMESZ-1*8 + rsp
+;; FRAMESZ = three 16-byte XMM save slots plus 8 bytes for the saved rsp; _RSP addresses that last slot (frame is used on non-Linux builds only)
+%define a eax
+%define b ebx
+%define c REG3
+%define d REG4
+%define e r8d
+%define T1 r9d
+%define f r10d
+%define RND r11d
+%define g r12d
+%define h r13d
+;; a..h are renamed at assembly time by ROTATE_ARGS; T1 carries W+K into each round; RND is the loop counter
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XK xmm2
+;; X0..X4 use %xdefine so that rotate_Xs can rename them at assembly time
+%xdefine X0 xmm3
+%xdefine X1 xmm4
+%xdefine X2 xmm5
+%xdefine X3 xmm6
+%xdefine X4 xmm7
+
+%define XFER xmm8
+
+%define SZ 4
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+%macro rotate_Xs 0
+%xdefine X_ X0 ; assembly-time renaming only, no instructions are emitted
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X4
+%xdefine X4 X_ ; the register formerly named X0 becomes the new X4
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h ; assembly-time renaming only, no instructions are emitted
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_ ; the register that received the round result (old h) becomes the new a
+%endm
+
+
+;; Magic functions defined in FIPS 180-1
+;;
+; MAGIC_F0 F,B,C,D,T ;; F = D ^ (B & (C ^ D)); T is accepted for a uniform signature but not used
+%macro MAGIC_F0 5
+%define %%out %1
+%define %%inB %2
+%define %%inC %3
+%define %%inD %4
+%define %%unusedT %5
+ mov %%out, %%inC ; out = C
+ xor %%out, %%inD ; out = C ^ D
+ and %%out, %%inB ; out = B & (C ^ D)
+ xor %%out, %%inD ; out = D ^ (B & (C ^ D))
+%endmacro
+
+; MAGIC_F1 F,B,C,D,T ;; F = B ^ C ^ D; T is accepted for a uniform signature but not used
+%macro MAGIC_F1 5
+%define %%out %1
+%define %%inB %2
+%define %%inC %3
+%define %%inD %4
+%define %%unusedT %5
+ mov %%out, %%inD ; out = D
+ xor %%out, %%inC ; out = D ^ C
+ xor %%out, %%inB ; out = D ^ C ^ B
+%endmacro
+
+; MAGIC_F2 F,B,C,D,T ;; F = (B & C) | (B & D) | (C & D), computed as ((B | C) & D) | (B & C); T is clobbered
+%macro MAGIC_F2 5
+%define %%out %1
+%define %%inB %2
+%define %%inC %3
+%define %%inD %4
+%define %%scratch %5
+ mov %%out, %%inB ; out = B
+ mov %%scratch, %%inB ; scratch = B
+ or %%out, %%inC ; out = B | C
+ and %%scratch, %%inC ; scratch = B & C
+ and %%out, %%inD ; out = (B | C) & D
+ or %%out, %%scratch ; out = ((B | C) & D) | (B & C)
+%endmacro
+
+; MAGIC_F3 F,B,C,D,T ;; F = B ^ C ^ D -- identical to MAGIC_F1, so simply forward to it
+%macro MAGIC_F3 5
+%define %%out %1
+%define %%inB %2
+%define %%inC %3
+%define %%inD %4
+%define %%tmp %5
+ MAGIC_F1 %%out, %%inB, %%inC, %%inD, %%tmp
+%endmacro
+
+;; ROUND MAGIC_Fi - one scalar round; input is T1 (callers load it from XFER = XK + W), T1 is clobbered
+%macro ROUND 1
+%define %%MAGIC %1
+ add e,T1 ; e += W + K
+ mov T1,a
+ rol T1,5
+ add e,T1 ; e += a rol 5
+ %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ rol b,30
+ add h,e ; h = FUN + e + (a rol 5) + W + K; becomes "a" after the rename below
+ROTATE_ARGS
+%endmacro
+
+%macro do_4i 1
+ vpaddd XFER, XK, X0 ; XFER = K + the four oldest schedule words (X0), consumed by the four rounds below; %1 = MAGIC_Fi macro
+ vpextrd T1, XFER, 0
+ ;; round 1 of 4: inlined ROUND %1, interleaved with the vector schedule
+ add e,T1
+ ;; schedule the next four words into X4 from X0..X3 (the old SCHEDULE_4)
+ vpalignr XTMP0, X1, X0, 8 ; XTMP0 = W[-14]
+ mov T1,a
+ rol T1,5
+ vpxor XTMP1, X2, X0 ; XTMP1 = W[-8] ^ W[-16]
+ add e,T1
+ vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16]
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+
+ ;; Finish low half
+ rol b,30
+ vpsrldq X4, X3, 4 ; X4 = W[-3] {xxBA}
+ add h,e
+ROTATE_ARGS
+ vpextrd T1, XFER, 1
+ ;; round 2 of 4
+ add e,T1
+ vpxor X4, X4, XTMP0
+ mov T1,a
+ rol T1,5
+ ;; rotate X4 left 1
+ vpsrld XTMP1, X4, (32-1)
+ add e,T1
+ vpslld X4, X4, 1
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ vpxor X4, X4, XTMP1 ; X4 = W[0] {xxBA}
+ rol b,30
+ add h,e
+ROTATE_ARGS
+ vpextrd T1, XFER, 2
+ ;; round 3 of 4
+ add e,T1
+ mov T1,a
+
+ ;; Finish high half
+ vpalignr XTMP1, X4, X3, 4 ; XTMP1 = w[-3] {DCxx}
+ rol T1,5
+ add e,T1
+ vpxor XTMP0, XTMP0, XTMP1
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ ;; rotate XTMP0 left 1
+ vpsrld XTMP1, XTMP0, (32-1)
+ rol b,30
+ add h,e
+ROTATE_ARGS
+ vpextrd T1, XFER, 3
+ ;; round 4 of 4
+ add e,T1
+ mov T1,a
+ vpslld XTMP0, XTMP0, 1
+ rol T1,5
+ add e,T1
+ vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx}
+ %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D)
+ ;; COMBINE HALVES
+ vshufps X4, X4, XTMP0, 11100100b ; X4 = X[0] {DCBA}
+ rol b,30
+ add h,e
+
+ rotate_Xs
+ROTATE_ARGS
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void sha1_block_avx(void *input_data, UINT32 digest[5]) - processes exactly one 64-byte block
+;; arg 1 : (in) pointer to input data (read with unaligned loads)
+;; arg 2 : (in/out) pointer to read/write digest
+MKGLOBAL(sha1_block_avx,function,internal)
+align 32
+sha1_block_avx:
+ push rbx
+ push rsi
+ push rdi
+ push r12
+ push r13
+
+ vmovdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+%ifndef LINUX
+ mov rax,rsp ; copy rsp
+ sub rsp,FRAMESZ
+ and rsp,-16 ; align stack frame
+ mov [_RSP],rax ; save copy of rsp
+ vmovdqa [rsp + 0 * 16], xmm6
+ vmovdqa [rsp + 1 * 16], xmm7
+ vmovdqa [rsp + 2 * 16], xmm8
+%endif
+
+ VMOVDQ X0, [INP + 0*16]
+ VMOVDQ X1, [INP + 1*16]
+
+ ;; load the second half of the 64-byte message block
+ VMOVDQ X2, [INP + 2*16]
+ VMOVDQ X3, [INP + 3*16]
+
+ ;; set up a-e from the digest words h0-h4
+ ;; byte swap first 16 dwords
+ mov a, [SZ*0 + CTX]
+ vpshufb X0, XTMP0
+ mov b, [SZ*1 + CTX]
+ vpshufb X1, XTMP0
+ mov c, [SZ*2 + CTX]
+ vpshufb X2, XTMP0
+ mov d, [SZ*3 + CTX]
+ vpshufb X3, XTMP0
+ mov e, [SZ*4 + CTX]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 00-19
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqa XK, [rel K00_19]
+ mov RND, 3 ; entering at loop1_5 gives 1 + 2 + 2 = 5 do_4i expansions = 20 rounds
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop1_5
+align 16
+loop1:
+
+ do_4i MAGIC_F0
+
+loop1_5:
+ do_4i MAGIC_F0
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ vmovdqa X0, X2 ; physical moves - NOTE(review): these appear to realign the registers with the assembly-time X names for the back-edge; confirm
+ vmovdqa X2, X4
+ vmovdqa X4, X1
+ vmovdqa X1, X3
+
+ sub RND, 1
+ jne loop1
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 00-19
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 20-39
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqa XK, [rel K20_39]
+ mov RND, 3 ; same 5 x do_4i loop shape as rounds 00-19
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop2_5
+align 16
+loop2:
+
+ do_4i MAGIC_F1
+
+loop2_5:
+ do_4i MAGIC_F1
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ vmovdqa X0, X2
+ vmovdqa X2, X4
+ vmovdqa X4, X1
+ vmovdqa X1, X3
+
+ sub RND, 1
+ jne loop2
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 20-39
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 40-59
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqa XK, [rel K40_59]
+ mov RND, 3 ; same 5 x do_4i loop shape as rounds 00-19
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ ROTATE_ARGS
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ jmp loop3_5
+align 16
+loop3:
+
+ do_4i MAGIC_F2
+
+loop3_5:
+ do_4i MAGIC_F2
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ vmovdqa X0, X2
+ vmovdqa X2, X4
+ vmovdqa X4, X1
+ vmovdqa X1, X3
+
+ sub RND, 1
+ jne loop3
+
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+ rotate_Xs
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; end rounds 40-59
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; do rounds 60-79: one last do_4i (4 rounds plus the final schedule words), then 16 plain rounds
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqa XK, [rel K60_79]
+
+ do_4i MAGIC_F3
+
+ vpaddd XFER, XK, X0 ; no more scheduling: consume X0..X3 four words at a time
+ vpextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ vpaddd XFER, XK, X1
+ vpextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ vpaddd XFER, XK, X2
+ vpextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ vpaddd XFER, XK, X3
+ vpextrd T1, XFER, 0
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 1
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 2
+ ROUND MAGIC_F3
+ vpextrd T1, XFER, 3
+ ROUND MAGIC_F3
+
+ ;; update result digest h0-h4 (feed-forward: add the working variables into the digest in memory)
+ add [SZ*0 + CTX], a
+ add [SZ*1 + CTX], b
+ add [SZ*2 + CTX], c
+ add [SZ*3 + CTX], d
+ add [SZ*4 + CTX], e
+
+%ifndef LINUX
+ vmovdqa xmm8, [rsp + 2 * 16]
+ vmovdqa xmm7, [rsp + 1 * 16]
+ vmovdqa xmm6, [rsp + 0 * 16]
+
+%ifdef SAFE_DATA
+ ;; Clear potential sensitive data stored in stack (the three XMM save slots)
+ vpxor xmm0, xmm0
+ vmovdqa [rsp + 0 * 16], xmm0
+ vmovdqa [rsp + 1 * 16], xmm0
+ vmovdqa [rsp + 2 * 16], xmm0
+%endif
+
+ mov rsp,[_RSP] ; restore the pre-alignment stack pointer saved in the prologue
+%endif ;; LINUX
+
+ pop r13
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbx
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm
new file mode 100644
index 000000000..57d997dd3
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm
@@ -0,0 +1,33 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define FUNC sha224_block_avx ; name given to the routine assembled from the included file, which only picks its own default when FUNC is undefined
+
+%include "avx/sha256_one_block_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm
new file mode 100644
index 000000000..9c96f036b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm
@@ -0,0 +1,553 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, with 4 lanes per block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "include/os.asm"
+
+section .data
+default rel
+align 64
+K256: ; table of 64 dword round constants (16 rows of 4)
+ dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ; vpshufb control that reverses the bytes of each dword (ddq 0x0c0d0e0f08090a0b0405060700010203)
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+; shuffle xBxA -> 00BA (dwords 0 and 2 packed into the low qword; the 0xFF selectors zero the high qword)
+_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100
+ dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
+
+; shuffle xDxC -> DC00 (dwords 0 and 2 packed into the high qword; the 0xFF selectors zero the low qword)
+_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF
+ dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
+
+section .text
+
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+%macro MY_ROR 2
+ shld %1, %1, (32 - (%2)) ; 32-bit rotate right of %1 by %2, done as a double-shift left by 32 - %2
+%endmacro
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Unaligned 16-byte load from mem into xmm, followed by a byte shuffle of xmm through byte_flip_mask
+%macro COPY_XMM_AND_BSWAP 3
+ VMOVDQ %1, %2 ; xmm = 16 bytes at mem
+ vpshufb %1, %1, %3 ; reorder bytes as selected by the mask
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+;; X0..X3 hold the 16 most recent message-schedule dwords (renamed at assembly time by rotate_Xs)
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XTMP4 xmm8
+%define XFER xmm9
+%define XTMP5 xmm11
+
+%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK xmm13
+
+%ifdef LINUX
+%define CTX rsi ; 2nd arg
+%define INP rdi ; 1st arg
+
+%define SRND rdi ; clobbers INP
+%define c ecx
+%define d r8d
+%define e edx
+%else
+%define CTX rdx ; 2nd arg
+%define INP rcx ; 1st arg
+
+%define SRND rcx ; clobbers INP
+%define c edi
+%define d esi
+%define e r8d
+
+%endif
+%define TBL rbp
+%define a eax
+%define b ebx
+;; a..h are the eight working variables; ROTATE_ARGS renames them at assembly time after every round
+%define f r9d
+%define g r10d
+%define h r11d
+;; y0..y2 are per-round scalar temporaries
+%define y0 r13d
+%define y1 r14d
+%define y2 r15d
+
+
+struc STACK
+%ifndef LINUX
+_XMM_SAVE: reso 7 ; seven 16-byte slots, present on non-Linux builds only (presumably for callee-saved XMM registers - confirm against the prologue)
+%endif
+_XFER: reso 1 ; one 16-byte slot; the rounds read its dwords as [rsp + _XFER + i*4] (k + w)
+endstruc
+
+%ifndef FUNC
+%define FUNC sha256_block_avx ; default name; a file that defines FUNC before including this one (e.g. the SHA-224 wrapper) overrides it
+%endif
+
+; rotate_Xs
+; Rotate values of symbols X0...X3 (assembly-time renaming only; emits no instructions)
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_ ; the register formerly named X0 becomes the new X3
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h (assembly-time renaming only; emits no instructions)
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_ ; the register formerly named h becomes the new a
+%endm
+
+%macro FOUR_ROUNDS_AND_SCHED 0
+ ;; compute s0 four at a time and s1 two at a time
+ ;; compute W[-16] + W[-7] 4 at a time
+ ;vmovdqa XTMP0, X3
+ mov y0, e ; y0 = e
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7]
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ ;vmovdqa XTMP1, X1
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+ vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ;; compute s0
+ vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15]
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH
+
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+
+ vpsrld XTMP2, XTMP1, 7
+
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+
+ vpslld XTMP3, XTMP1, (32-7)
+
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+
+ vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7
+
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+
+
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+
+ vpsrld XTMP2, XTMP1,18
+
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+
+ vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3
+
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+
+ vpslld XTMP1, XTMP1, (32-18)
+
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+
+ vpxor XTMP3, XTMP3, XTMP1
+
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+
+ vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18
+
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+
+ vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0
+
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ ;; compute low s1
+ vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
+
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+
+ ;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
+
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+
+ vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
+
+ xor y2, g ; y2 = f^g
+
+ vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA}
+
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+
+ vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA}
+
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ vpxor XTMP2, XTMP2, XTMP3
+ add y2, y0 ; y2 = S1 + CH
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH
+ vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ ;; compute high s1
+ vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC}
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+ ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
+ mov y0, e ; y0 = e
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ ;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC}
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+
+ vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC}
+
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+
+ vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC}
+
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+
+ vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC}
+
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+
+ vpxor XTMP2, XTMP2, XTMP3
+
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH
+ vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+;; DO_ROUND n
+;; Performs one SHA-256 round on the working variables a..h without
+;; scheduling any new message words.
+;; input is [rsp + _XFER + %1 * 4] (dword n of the K+W values last stored at _XFER)
+;; Scratch registers: y0, y1, y2. Ends with ROTATE_ARGS, so the symbols a..h
+;; name different registers for whatever code follows the macro.
+;; NOTE: ">>" in the comments below denotes the operation done by MY_ROR
+;; (presumably a rotate right, going by the macro name - it is defined earlier
+;; in the file).
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ MY_ROR y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ MY_ROR y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13))
+ MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ MY_ROR y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void FUNC(void *input_data, UINT32 digest[8])
+;; arg 1 : pointer to input data (64 bytes are read, need not be aligned
+;; if COPY_XMM_AND_BSWAP uses an unaligned load - TODO confirm)
+;; arg 2 : pointer to digest (8 dwords, updated in place)
+;;
+;; Processes exactly one block: 48 rounds with message scheduling (loop1)
+;; followed by 16 plain rounds (loop2). No block count is read by the body.
+;; Saves and restores rbx, rbp, r13-r15 (plus rsi, rdi and xmm6-xmm13 on
+;; non-Linux builds).
+;; With SAFE_DATA defined, the stack slots written by this function are
+;; zeroed before returning.
+section .text
+MKGLOBAL(FUNC,function,internal)
+align 32
+FUNC:
+ push rbx
+%ifndef LINUX
+ push rsi
+ push rdi
+%endif
+ push rbp
+ push r13
+ push r14
+ push r15
+
+ sub rsp,STACK_size
+%ifndef LINUX
+ ;; xmm6-xmm13 are saved here and restored at done_hash
+ vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
+ vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
+ vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
+ vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
+ vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
+ vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
+ vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
+ vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
+%endif
+
+ ;; load initial digest
+ mov a, [4*0 + CTX]
+ mov b, [4*1 + CTX]
+ mov c, [4*2 + CTX]
+ mov d, [4*3 + CTX]
+ mov e, [4*4 + CTX]
+ mov f, [4*5 + CTX]
+ mov g, [4*6 + CTX]
+ mov h, [4*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ vmovdqa SHUF_00BA, [rel _SHUF_00BA]
+ vmovdqa SHUF_DC00, [rel _SHUF_DC00]
+
+ lea TBL,[rel K256]
+
+ ;; byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+
+ ;; schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov SRND, 3
+align 16
+loop1:
+ ;; each step: store K+W for four rounds at _XFER, then run the rounds
+ ;; while scheduling the next four message dwords
+ vpaddd XFER, X0, [TBL + 0*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd XFER, X0, [TBL + 1*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd XFER, X0, [TBL + 2*16]
+ vmovdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ vpaddd XFER, X0, [TBL + 3*16]
+ vmovdqa [rsp + _XFER], XFER
+ add TBL, 4*16
+ FOUR_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
+ ;; final 16 rounds: the message words are already in X0..X3,
+ ;; so no further scheduling is done (2 iterations of 8 rounds)
+ mov SRND, 2
+loop2:
+ vpaddd XFER, X0, [TBL + 0*16]
+ vmovdqa [rsp + _XFER], XFER
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ vpaddd XFER, X1, [TBL + 1*16]
+ vmovdqa [rsp + _XFER], XFER
+ add TBL, 2*16
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ ;; move the remaining two vectors into place for the second iteration
+ vmovdqa X0, X2
+ vmovdqa X1, X3
+
+ sub SRND, 1
+ jne loop2
+
+ ;; add the working variables into the digest in memory
+ add [4*0 + CTX], a
+ add [4*1 + CTX], b
+ add [4*2 + CTX], c
+ add [4*3 + CTX], d
+ add [4*4 + CTX], e
+ add [4*5 + CTX], f
+ add [4*6 + CTX], g
+ add [4*7 + CTX], h
+
+done_hash:
+%ifndef LINUX
+ vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
+ vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
+ vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
+ vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
+ vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
+ vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
+ vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
+ vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
+%ifdef SAFE_DATA
+ ;; Clear potential sensitive data stored in stack
+ vpxor xmm0, xmm0
+ vmovdqa [rsp + _XMM_SAVE + 0 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 1 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 2 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 3 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 4 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 5 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 6 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 7 * 16], xmm0
+%endif
+%endif ;; LINUX
+
+%ifdef SAFE_DATA
+ ;; The K+W transfer slot still holds the last four message-schedule
+ ;; dwords (plus round constants) of this block; wipe it on every OS.
+ ;; xmm0 is a scratch register here (already used for the wipe above).
+ vpxor xmm0, xmm0
+ vmovdqa [rsp + _XFER], xmm0
+%endif
+
+ add rsp, STACK_size
+
+ pop r15
+ pop r14
+ pop r13
+ pop rbp
+%ifndef LINUX
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+
+ ret
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm
new file mode 100644
index 000000000..dddc5df28
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm
@@ -0,0 +1,33 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, producing 2 message-schedule qwords per step
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Name the entry point before pulling in the shared one-block body:
+;; the included file only supplies its own default for FUNC when FUNC is
+;; not already defined.
+%define FUNC sha384_block_avx
+
+%include "avx/sha512_one_block_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm
new file mode 100644
index 000000000..040518e76
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm
@@ -0,0 +1,473 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; This code schedules 1 block at a time, producing 2 message-schedule qwords per step
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "include/os.asm"
+
+;; Load instruction used for the input block
+%define VMOVDQ vmovdqu ;; assume buffers not aligned
+
+;; Entry point name; a file that includes this one may define FUNC first
+;; to build the same body under a different symbol.
+%ifndef FUNC
+%define FUNC sha512_block_avx
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; MY_ROR reg, amount
+; Rotate a 64-bit general purpose register right by "amount" bits.
+; Implemented as a double-shift left of the register with itself by
+; (64 - amount), which yields the same register contents.
+%macro MY_ROR 2
+%define %%reg %1
+ shld %%reg, %%reg, (64-(%2))
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load 16 bytes from mem into xmm (using VMOVDQ), then permute the bytes
+; of xmm with byte_flip_mask.
+%macro COPY_XMM_AND_BSWAP 3
+%define %%dst %1
+%define %%src %2
+%define %%mask %3
+ VMOVDQ %%dst, %%src
+ vpshufb %%dst, %%dst, %%mask
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; X0..X7: the 16 message-schedule qwords, two per XMM register
+;; (renamed after every scheduling step by rotate_Xs)
+%define X0 xmm4
+%define X1 xmm5
+%define X2 xmm6
+%define X3 xmm7
+%define X4 xmm8
+%define X5 xmm9
+%define X6 xmm10
+%define X7 xmm11
+
+;; vector temporaries for the message schedule; XFER carries K+W to the stack
+%define XTMP0 xmm0
+%define XTMP1 xmm1
+%define XTMP2 xmm2
+%define XTMP3 xmm3
+%define XFER xmm13
+
+%define BYTE_FLIP_MASK xmm12
+
+%ifdef LINUX
+%define CTX rsi ; 2nd arg
+%define INP rdi ; 1st arg
+
+;; SRND reuses the INP register: FUNC loads all input before setting SRND
+%define SRND rdi ; clobbers INP
+%define c rcx
+%define d r8
+%define e rdx
+%else
+%define CTX rdx ; 2nd arg
+%define INP rcx ; 1st arg
+
+;; SRND reuses the INP register: FUNC loads all input before setting SRND
+%define SRND rcx ; clobbers INP
+%define c rdi
+%define d rsi
+%define e r8
+
+%endif
+;; TBL: pointer into the K512 round-constant table
+%define TBL rbp
+;; a..h: working variables (renamed after every round by ROTATE_ARGS)
+%define a rax
+%define b rbx
+
+%define f r9
+%define g r10
+%define h r11
+
+;; y0..y2: scalar scratch registers used inside each round
+%define y0 r13
+%define y1 r14
+%define y2 r15
+
+;; Stack frame layout; FUNC addresses the fields relative to rsp after
+;; "sub rsp,STACK_size"
+struc STACK
+%ifndef LINUX
+_XMM_SAVE: reso 8 ; save area for xmm6-xmm13 (eight 16-byte slots)
+%endif
+_XFER: reso 1 ; one 16-byte slot: the two K+W qwords read by the scalar rounds
+endstruc
+
+
+; rotate_Xs
+; Rotate values of symbols X0...X7
+; Purely a preprocessor renaming (no instructions are emitted): afterwards
+; X0 names the register previously called X1, ..., and X7 names the old X0.
+; X_ is used as the temporary symbol for the swap.
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X4
+%xdefine X4 X5
+%xdefine X5 X6
+%xdefine X6 X7
+%xdefine X7 X_
+%endm
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+; Purely a preprocessor renaming (no instructions are emitted): afterwards
+; a names the register previously called h, b the old a, ..., h the old g.
+; TMP_ is used as the temporary symbol for the swap.
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+;; TWO_ROUNDS_AND_SCHED
+;; Runs two SHA-512 rounds using the two K+W qwords stored at [rsp + _XFER],
+;; interleaved with the vector computation of the next two message-schedule
+;; qwords, which replace the contents of X0. Ends with two ROTATE_ARGS (one
+;; per round) and one rotate_Xs.
+;; Scratch: y0, y1, y2, XTMP0..XTMP3.
+;; In the scalar comments ">>" denotes the rotate right done by MY_ROR.
+%macro TWO_ROUNDS_AND_SCHED 0
+
+ vpalignr XTMP0, X5, X4, 8 ; XTMP0 = W[-7]
+ ;; compute s0 and s1 two at a time
+ ;; compute W[-16] + W[-7] 2 at a time
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ MY_ROR y0, (41-18) ; y0 = e >> (41-18)
+ vpaddq XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ xor y0, e ; y0 = e ^ (e >> (41-18))
+ mov y2, f ; y2 = f
+ MY_ROR y1, (39-34) ; y1 = a >> (39-34)
+ ;; compute s0
+ vpalignr XTMP1, X1, X0, 8 ; XTMP1 = W[-15]
+ xor y1, a ; y1 = a ^ (a >> (39-34))
+ MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14))
+ vpsllq XTMP2, XTMP1, (64-1)
+ xor y2, g ; y2 = f^g
+ MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28))
+ vpsrlq XTMP3, XTMP1, 1
+ xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+ and y2, e ; y2 = (f^g)&e
+ MY_ROR y0, 14 ; y0 = S1 = (e>>14) ^ (e>>18) ^ (e>>41)
+ vpor XTMP2, XTMP2, XTMP3 ; XTMP2 = W[-15] ror 1
+ xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ vpsrlq XTMP3, XTMP1, 8
+ add y2, [rsp + _XFER + 0*8] ; y2 = k + w + S1 + CH
+ MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+ mov y0, a ; y0 = a
+ vpsllq X0, XTMP1, (64-8) ; W[-16] in X0 is no longer needed: reuse X0 as a temp
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ vpor X0, X0, XTMP3 ; X0 = W[-15] ror 8
+ add d, h ; d = d + t1
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ vpsrlq XTMP1, XTMP1, 7 ; XTMP1 = W[-15] >> 7
+ add h, y1 ; h = t1 + S0
+ or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
+ vpxor XTMP1, XTMP1, XTMP2 ; XTMP1 = (W[-15] >> 7) ^ (W[-15] ror 1)
+ add h, y0 ; h = t1 + S0 + MAJ
+ vpxor XTMP1, XTMP1, X0 ; XTMP1 = s0
+
+
+ROTATE_ARGS
+ ;; compute s1
+ vpaddq XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ MY_ROR y0, (41-18) ; y0 = e >> (41-18)
+ vpsllq XTMP3, X7, (64-19)
+ xor y0, e ; y0 = e ^ (e >> (41-18))
+ mov y2, f ; y2 = f
+ MY_ROR y1, (39-34) ; y1 = a >> (39-34)
+ vpsrlq X0, X7, 19
+ xor y1, a ; y1 = a ^ (a >> (39-34))
+ MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14))
+ vpor XTMP3, XTMP3, X0 ; XTMP3 = W[-2] ror 19
+ xor y2, g ; y2 = f^g
+ MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28))
+ vpsllq XTMP2, X7, (64-61)
+ xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+ and y2, e ; y2 = (f^g)&e
+ MY_ROR y0, 14 ; y0 = S1 = (e>>14) ^ (e>>18) ^ (e>>41)
+ vpsrlq XTMP1, X7, 61
+ xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ vpor XTMP2, XTMP2, XTMP1 ; XTMP2 = W[-2] ror 61
+ add y2, [rsp + _XFER + 1*8] ; y2 = k + w + S1 + CH
+ MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+ mov y0, a ; y0 = a
+ vpsrlq XTMP1, X7, 6 ; XTMP1 = W[-2] >> 6
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ vpxor XTMP1, XTMP1, XTMP2 ; XTMP1 = (W[-2] >> 6) ^ (W[-2] ror 61)
+ add d, h ; d = d + t1
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ vpxor X0, XTMP3, XTMP1 ; X0 = s1
+ add h, y1 ; h = t1 + S0
+ or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
+ add h, y0 ; h = t1 + S0 + MAJ
+ vpaddq X0, X0, XTMP0 ; X0 = {W[1], W[0]}
+
+ROTATE_ARGS
+rotate_Xs
+%endm
+
+;; DO_ROUND n
+;; Performs one SHA-512 round on the working variables a..h without
+;; scheduling any new message words.
+;; input is [rsp + _XFER + %1 * 8] (qword n of the K+W pair stored at _XFER)
+;; Scratch: y0, y1, y2. Ends with ROTATE_ARGS.
+;; In the comments ">>" denotes the rotate right done by MY_ROR.
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ MY_ROR y0, (41-18) ; y0 = e >> (41-18)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (41-18))
+ MY_ROR y1, (39-34) ; y1 = a >> (39-34)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (39-34))
+ MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14))
+ xor y2, g ; y2 = f^g
+ xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14))
+ MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28))
+ MY_ROR y0, 14 ; y0 = S1 = (e>>14) ^ (e>>18) ^ (e>>41)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39)
+ add y2, [rsp + _XFER + %1*8] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + t1
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = t1 + S0
+ or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c)
+ add h, y0 ; h = t1 + S0 + MAJ
+ ROTATE_ARGS
+%endm
+
+section .data
+default rel
+align 64
+;; Round constants: 80 qwords, read 16 bytes (two constants) at a time
+;; through TBL by FUNC
+K512:
+ dq 0x428a2f98d728ae22,0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538,0x59f111f1b605d019
+ dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242,0x12835b0145706fbe
+ dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235,0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ dq 0x983e5152ee66dfab,0xa831c66d2db43210
+ dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ dq 0x06ca6351e003826f,0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6,0x92722c851482353b
+ dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+ dq 0xd192e819d6ef5218,0xd69906245565a910
+ dq 0xf40e35855771202a,0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ dq 0x90befffa23631e28,0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ dq 0xca273eceea26619c,0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae,0x1b710b35131c471b
+ dq 0x28db77f523047d84,0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+align 16
+;; vpshufb control that reverses the byte order inside each of the two qwords
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void FUNC(void *input_data, UINT64 digest[8])
+;; arg 1 : pointer to input data (128 bytes are read; need not be aligned)
+;; arg 2 : pointer to digest (8 qwords, updated in place)
+;;
+;; Processes exactly one block: 64 rounds with message scheduling (loop1)
+;; followed by 16 plain rounds (loop2/loop2a).
+;; Saves and restores rbx, rbp, r13-r15 (plus rsi, rdi and xmm6-xmm13 on
+;; non-Linux builds).
+;; With SAFE_DATA defined, the stack slots written by this function are
+;; zeroed before returning.
+section .text
+MKGLOBAL(FUNC,function,internal)
+align 32
+FUNC:
+ push rbx
+%ifndef LINUX
+ push rsi
+ push rdi
+%endif
+ push rbp
+ push r13
+ push r14
+ push r15
+
+ sub rsp,STACK_size
+%ifndef LINUX
+ ;; xmm6-xmm13 are saved here and restored at done_hash
+ vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6
+ vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7
+ vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8
+ vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9
+ vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10
+ vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11
+ vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12
+ vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13
+%endif
+
+ ;; load initial digest
+ mov a, [8*0 + CTX]
+ mov b, [8*1 + CTX]
+ mov c, [8*2 + CTX]
+ mov d, [8*3 + CTX]
+ mov e, [8*4 + CTX]
+ mov f, [8*5 + CTX]
+ mov g, [8*6 + CTX]
+ mov h, [8*7 + CTX]
+
+ vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+ lea TBL,[rel K512]
+
+ ;; byte swap first 16 qwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X4, [INP + 4*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X5, [INP + 5*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X6, [INP + 6*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X7, [INP + 7*16], BYTE_FLIP_MASK
+
+ ;; schedule 64 input qwords, by doing 4 iterations of 16 rounds
+ ;; (INP is dead from here on: SRND shares its register)
+ mov SRND, 4
+align 16
+loop1:
+
+ ;; each step: store K+W for two rounds at _XFER, then run the rounds
+ ;; while scheduling the next two message qwords
+%assign i 0
+%rep 7
+ vpaddq XFER, X0, [TBL + i*16]
+ vmovdqa [rsp + _XFER], XFER
+ TWO_ROUNDS_AND_SCHED
+%assign i (i+1)
+%endrep
+
+ vpaddq XFER, X0, [TBL + 7*16]
+ vmovdqa [rsp + _XFER], XFER
+ add TBL, 8*16
+ TWO_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
+ ;; final 16 rounds: the message words are already in X0..X7,
+ ;; so no further scheduling is done (2 iterations of 8 rounds)
+ mov SRND, 2
+ jmp loop2a
+loop2:
+ ;; second iteration: move the upper four vectors into X0..X3
+ vmovdqa X0, X4
+ vmovdqa X1, X5
+ vmovdqa X2, X6
+ vmovdqa X3, X7
+
+loop2a:
+ vpaddq X0, X0, [TBL + 0*16]
+ vmovdqa [rsp + _XFER], X0
+ DO_ROUND 0
+ DO_ROUND 1
+
+ vpaddq X1, X1, [TBL + 1*16]
+ vmovdqa [rsp + _XFER], X1
+ DO_ROUND 0
+ DO_ROUND 1
+
+ vpaddq X2, X2, [TBL + 2*16]
+ vmovdqa [rsp + _XFER], X2
+ DO_ROUND 0
+ DO_ROUND 1
+
+ vpaddq X3, X3, [TBL + 3*16]
+ vmovdqa [rsp + _XFER], X3
+ add TBL, 4*16
+ DO_ROUND 0
+ DO_ROUND 1
+
+ sub SRND, 1
+ jne loop2
+
+ ;; add the working variables into the digest in memory
+ add [8*0 + CTX], a
+ add [8*1 + CTX], b
+ add [8*2 + CTX], c
+ add [8*3 + CTX], d
+ add [8*4 + CTX], e
+ add [8*5 + CTX], f
+ add [8*6 + CTX], g
+ add [8*7 + CTX], h
+
+done_hash:
+%ifndef LINUX
+ vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16]
+ vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16]
+ vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16]
+ vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16]
+ vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16]
+ vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16]
+ vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16]
+ vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16]
+%ifdef SAFE_DATA
+ ;; Clear potential sensitive data stored in stack
+ vpxor xmm0, xmm0
+ vmovdqa [rsp + _XMM_SAVE + 0 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 1 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 2 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 3 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 4 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 5 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 6 * 16], xmm0
+ vmovdqa [rsp + _XMM_SAVE + 7 * 16], xmm0
+%endif
+%endif ;; LINUX
+
+%ifdef SAFE_DATA
+ ;; The K+W transfer slot still holds the last two message-schedule
+ ;; qwords (plus round constants) of this block; wipe it on every OS.
+ ;; xmm0 (XTMP0) is a scratch register and holds nothing live here.
+ vpxor xmm0, xmm0
+ vmovdqa [rsp + _XFER], xmm0
+%endif
+
+ add rsp, STACK_size
+
+ pop r15
+ pop r14
+ pop r13
+ pop rbp
+%ifndef LINUX
+ pop rdi
+ pop rsi
+%endif
+ pop rbx
+
+ ret
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm
new file mode 100644
index 000000000..d7d712e2c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm
@@ -0,0 +1,381 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute SHA512 by-2 using AVX
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, r8-r11 and the INP_SIZE argument register (rsi on Linux, rdx on Windows); ymm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+extern K512_2
+
+section .data
+default rel
+
+align 32
+; one from sha512_rorx
+; this does the big endian to little endian conversion
+; over a quad word
+; (32 bytes are defined here; sha512_x2_avx loads only the first 16 into an
+; XMM register)
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ ;ddq 0x18191a1b1c1d1e1f1011121314151617
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+
+section .text
+
+%ifdef LINUX ; Linux definitions
+%define arg1 rdi
+%define arg2 rsi
+%else ; Windows definitions
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+; IDX: byte offset into both input buffers
+; ROUND: byte offset into the constant table (advanced by SZ2 per round)
+; TBL: base address of the constant table K512_2
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+; input pointers of the two lanes
+%define inp0 r9
+%define inp1 r10
+
+; working variables a..h, one qword per lane in each XMM register
+; (renamed after every round by ROTATE_ARGS)
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+; round temporaries
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+; transpose temporaries; note TT4/TT5 name the same registers as a2/a1
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+; T1 names the same register as TT0; TMP is the scratch used by the
+; two-argument PRORQ / three-argument PRORQ_nd forms
+%define T1 xmm14
+%define TMP xmm15
+
+
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2 ; value of ROUND once all 80 rounds are done
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16 ; ring of the 16 most recent message words (both lanes), indexed by (i & 0xf)
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS ; digest saved at the start of each block
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+; load instruction used for the input data
+%define VMOVPD vmovupd
+
+; TRANSPOSE r0, r1, t0
+; 2x2 qword transpose of two XMM registers.
+; On entry:
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; On exit:
+; r0 = {b0, a0}
+; t0 = {b1, a1}
+; r1 is not written.
+%macro TRANSPOSE 3
+%define %%row0 %1
+%define %%row1 %2
+%define %%high %3
+ vshufpd %%high, %%row0, %%row1, 11b ; high = upper qword of each input
+ vshufpd %%row0, %%row0, %%row1, 00b ; row0 = lower qword of each input
+%endmacro
+
+
+; ROTATE_ARGS
+; Rotate the symbols a...h by preprocessor renaming only (no instructions
+; are emitted): afterwards a names the register previously called h,
+; b the old a, ..., h the old g.
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; In-place packed rotate right of each qword in reg by imm bits,
+; built from two shifts and an or; tmp is overwritten.
+%macro PRORQ 3
+%define %%val %1
+%define %%amt %2
+%define %%scratch %3
+ vpsllq %%scratch, %%val, (64-(%%amt))
+ vpsrlq %%val, %%val, %%amt
+ vpor %%val, %%val, %%scratch
+%endmacro
+
+; PRORQ_nd reg, imm, tmp, src
+; Non-destructive form: reg = each qword of src rotated right by imm bits;
+; tmp is overwritten, src is not written.
+%macro PRORQ_nd 4
+%define %%out %1
+%define %%amt %2
+%define %%scratch %3
+%define %%in %4
+ vpsllq %%scratch, %%in, (64-(%%amt))
+ vpsrlq %%out, %%in, %%amt
+ vpor %%out, %%out, %%scratch
+%endmacro
+
+; PRORQ dst/src, amt
+; Two-argument form: same as the three-argument PRORQ with TMP as scratch.
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt
+; Three-argument form: same as the four-argument PRORQ_nd with TMP as
+; scratch (note the different argument order).
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
+
+
+
+;; ROUND_00_15 T1, i
+;; One SHA-512 round for both lanes.
+;; %1 (T1): register holding the message word W for this round; it is stored
+;; into the _DATA ring at slot (i & 0xf) and then overwritten with W + K
+;; %2 (i): round index, used only to select the _DATA slot
+;; a...h are passed implicitly in preprocessor symbols and renamed by
+;; ROTATE_ARGS at the end. Advances ROUND by SZ2. Scratch: a0, a1, a2, TMP.
+;; In the comments ">>" denotes a rotate right (PRORQ / PRORQ_nd).
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 4)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0 ; h = h + ch + W + K + sigma1
+
+ vpaddq d, d, h ; d = d + h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 6)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + sigma1 + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + sigma1 + maj + sigma0
+ ROTATE_ARGS
+
+%endm
+
+
+;; ROUND_16_XX T1, i
+;; Computes the next message word from the _DATA ring (slots i-16, i-15,
+;; i-7 and i-2, all taken modulo 16) into T1, then runs ROUND_00_15 on it,
+;; which also stores it back into the ring at slot (i & 0xf).
+;; %1 (T1): register receiving the new message word
+;; %2 (i): round index
+;; a...h are passed implicitly in preprocessor symbols.
+;; Scratch: a0, a1, a2, TMP.
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA] ; T1 = W[i-15]
+ vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA] ; a1 = W[i-2]
+ vmovdqa a0, %%T1 ; a0 = W[i-15]
+ PRORQ %%T1, 8-1 ; T1 = W[i-15] ror 7
+ vmovdqa a2, a1 ; a2 = W[i-2]
+ PRORQ a1, 61-19 ; a1 = W[i-2] ror 42
+ vpxor %%T1, %%T1, a0 ; T1 = W[i-15] ^ (W[i-15] ror 7)
+ PRORQ %%T1, 1 ; T1 = (W[i-15] ror 1) ^ (W[i-15] ror 8)
+ vpxor a1, a1, a2 ; a1 = W[i-2] ^ (W[i-2] ror 42)
+ PRORQ a1, 19 ; a1 = (W[i-2] ror 19) ^ (W[i-2] ror 61)
+ vpsrlq a0, a0, 7 ; a0 = W[i-15] >> 7 (logical shift)
+ vpxor %%T1, %%T1, a0 ; T1 = s0
+ vpsrlq a2, a2, 6 ; a2 = W[i-2] >> 6 (logical shift)
+ vpxor a1, a1, a2 ; a1 = s1
+ vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA] ; T1 = s0 + W[i-16]
+ vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA] ; a1 = s1 + W[i-7]
+ vpaddq %%T1, %%T1, a1 ; T1 = new message word
+
+ ROUND_00_15 %%T1, %%i
+
+%endm
+
+
+
+;; SHA512_ARGS:
+;; UINT128 digest[8]; // transposed digests
+;; UINT8 *data_ptr[2];
+;;
+
+;; void sha512_x2_avx(SHA512_ARGS *args, UINT64 msg_size_in_blocks)
+;; arg 1 : STATE : pointer args
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+;; Hashes INP_SIZE blocks from each of the two lanes in parallel, writes the
+;; updated (still transposed) digest back to STATE and advances both data
+;; pointers in STATE past the consumed input.
+;; INP_SIZE is decremented in place; a value of 0 on entry is not handled
+;; (the loop body runs before the count is tested).
+MKGLOBAL(sha512_x2_avx,function,internal)
+align 32
+sha512_x2_avx:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[rel K512_2]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ ;; one iteration per message block
+
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ2], a
+ vmovdqa [rsp + _DIGEST + 1*SZ2], b
+ vmovdqa [rsp + _DIGEST + 2*SZ2], c
+ vmovdqa [rsp + _DIGEST + 3*SZ2], d
+ vmovdqa [rsp + _DIGEST + 4*SZ2], e
+ vmovdqa [rsp + _DIGEST + 5*SZ2], f
+ vmovdqa [rsp + _DIGEST + 6*SZ2], g
+ vmovdqa [rsp + _DIGEST + 7*SZ2], h
+
+ ;; rounds 0..15: each pass loads 16 bytes from both lanes, transposes
+ ;; them into two vectors and runs two rounds
+%assign i 0
+%rep 8
+ ;; load up the shuffler for little-endian to big-endian format
+ vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ VMOVPD TT2,[inp1+IDX+i*16]
+
+ TRANSPOSE TT0, TT2, TT1
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+
+;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes)
+ add IDX, 8 * 16
+
+;; i is 8 here, so this sets i to 32; the round macros only use i modulo 16,
+;; which makes 32 equivalent to 16
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+ ;; 16 rounds per pass; repeated until ROUND reaches ROUNDS (80 rounds)
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ2]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ2]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ2]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ2]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ2]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ2]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ2]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a
+ vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b
+ vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c
+ vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d
+ vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e
+ vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f
+ vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g
+ vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ ; (IDX holds the total number of bytes consumed per lane)
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ ;; Clear stack frame ((16 + 8)*16 bytes)
+ ;; i.e. both the _DATA ring and the saved _DIGEST
+%ifdef SAFE_DATA
+ vpxor xmm0, xmm0
+%assign i 0
+%rep (16+NUM_SHA512_DIGEST_WORDS)
+ vmovdqa [rsp + i*SZ2], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, STACK_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm
new file mode 100644
index 000000000..c1895a3f5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm
@@ -0,0 +1,391 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute quad SHA256 using AVX
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12
+;; Windows preserves: rcx rsi rdi rbp r13 r14 r15
+;;
+;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12
+;; Linux preserves: rcx rdx rdi rbp r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+extern K256_4
+
+; arg1/arg2 name the first two integer argument registers for the ABI
+; selected at build time, so the code below is written once for both.
+%ifdef LINUX
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+; IDX   - byte offset into every lane's input buffer
+; ROUND - byte offset into the constant table TBL (advanced by SZ4 per round)
+; TBL   - base address of K256_4
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+; One input pointer per lane
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+; Working variables a..h, one 32-bit word per lane in each register.
+; ROTATE_ARGS re-binds these names after every round.
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+; Round-function scratch registers
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+; Transpose scratch registers.
+; NOTE: TT4/TT5 are the same registers as a2/a1, and TT0 is the same
+; register as T1, so these names must not be live at the same time.
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 64*SZ4 ; value of ROUND once all 64 rounds are done
+
+; Define stack usage
+; _DATA   : 16 vector slots addressed with (i & 0xf) by the round macros
+; _DIGEST : copy of the digest taken at the start of each block
+struc STACK
+_DATA: resb SZ4 * 16
+_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+; Unaligned load used for the message data
+%define VMOVPS vmovups
+
+; TRANSPOSE r0, r1, r2, r3, t0, t1
+; Transpose the 4x4 matrix of dwords held in {r0..r3} using temps {t0, t1}.
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+; r2 and t1 are left holding intermediate values and are not part of the
+; result. The instruction order matters: each source register is read for
+; the last time before it is overwritten.
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+
+
+; ROTATE_ARGS
+; Re-bind the preprocessor names a..h by one position instead of moving data:
+; the register that was h becomes a, a becomes b, ... and g becomes h.
+; Emits no instructions. %xdefine expands its value at definition time, so
+; the assignments must stay in this order, with TMP_ saving the old h first.
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+; Rotate every 32-bit lane of reg right by imm bits, in place.
+;   %1 = reg (in/out), %2 = imm (rotate count), %3 = tmp (clobbered)
+%macro PRORD 3
+ vpslld %3, %1, (32-(%2)) ; tmp = bits that wrap around to the top
+ vpsrld %1, %1, %2 ; reg = bits that stay, shifted down
+ vpor %1, %1, %3 ; merge both halves
+%endmacro
+
+; PRORD_nd reg, imm, tmp, src
+; Non-destructive rotate: reg = src with every 32-bit lane rotated right by
+; imm bits. src is not written (provided it is not also passed as reg or tmp).
+;   %1 = reg (out), %2 = imm (rotate count), %3 = tmp (clobbered), %4 = src
+%macro PRORD_nd 4
+ vpslld %3, %4, (32-(%2)) ; tmp = bits that wrap around to the top
+ vpsrld %1, %4, %2 ; reg = bits that stay, shifted down
+ vpor %1, %1, %3 ; merge both halves
+%endmacro
+
+; PRORD dst/src, amt
+; Two-operand form: in-place rotate right by amt, with TMP as the scratch
+; register (TMP is clobbered).
+%macro PRORD 2
+ vpslld TMP, %1, (32-(%2))
+ vpsrld %1, %1, %2
+ vpor %1, %1, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+; Three-operand form: dst = src rotated right by amt, with TMP as the scratch
+; register (TMP is clobbered).
+%macro PRORD_nd 3
+ vpslld TMP, %2, (32-(%3))
+ vpsrld %1, %2, %3
+ vpor %1, %1, TMP
+%endmacro
+
+;; ROUND_00_15 T1, i
+;; One SHA-256 round on all lanes.
+;;   %1 = T1 : register holding the message word W for this round (clobbered)
+;;   %2 = i  : round index; W is saved to the _DATA slot (i & 0xf)
+;; The working variables a...h are passed implicitly as preprocessor symbols
+;; and are re-bound by ROTATE_ARGS at the end. Advances ROUND by SZ4.
+;; Clobbers a0, a1, a2 and TMP (via PRORD / PRORD_nd).
+;; "ror" below means rotate right within each 32-bit lane.
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e ror 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e ror 25)
+ vmovdqa [SZ4*(%%i&0xf) + rsp + _DATA], %%T1 ; save W for the message schedule
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e ror 5)
+ PRORD a0, 6 ; sig1: a0 = (e ror 6) ^ (e ror 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a ror 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a ror 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0 ; h = h + ch + W + K + sigma1
+
+ vpaddd d, d, h ; d = d + (h + ch + W + K + sigma1)
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a ror 11)
+ PRORD a2, 2 ; sig0: a2 = (a ror 2) ^ (a ror 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + sigma1 + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + sigma1 + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; ROUND_16_XX T1, i
+;; Message-schedule step followed by one round.
+;;   %1 = T1 : register that receives the new message word (clobbered)
+;;   %2 = i  : round index; earlier words are read from _DATA slots (i-k)&0xf
+;; Computes W[i] = s0(W[i-15]) + W[i-16] + s1(W[i-2]) + W[i-7], where
+;;   s0(x) = (x ror 7) ^ (x ror 18) ^ (x >> 3)
+;;   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
+;; then hands W[i] to ROUND_00_15, which stores it and runs the round.
+;; The working variables a...h are passed implicitly as preprocessor symbols.
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA] ; T1 = W[i-15]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA] ; a1 = W[i-2]
+ vmovdqa a0, %%T1 ; a0 = W[i-15]
+ PRORD %%T1, 18-7 ; T1 = W[i-15] ror 11
+ vmovdqa a2, a1 ; a2 = W[i-2]
+ PRORD a1, 19-17 ; a1 = W[i-2] ror 2
+ vpxor %%T1, %%T1, a0 ; T1 = x ^ (x ror 11), x = W[i-15]
+ PRORD %%T1, 7 ; T1 = (x ror 7) ^ (x ror 18)
+ vpxor a1, a1, a2 ; a1 = y ^ (y ror 2), y = W[i-2]
+ PRORD a1, 17 ; a1 = (y ror 17) ^ (y ror 19)
+ vpsrld a0, a0, 3 ; a0 = x >> 3
+ vpxor %%T1, %%T1, a0 ; T1 = s0(W[i-15])
+ vpsrld a2, a2, 10 ; a2 = y >> 10
+ vpxor a1, a1, a2 ; a1 = s1(W[i-2])
+ vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA] ; + W[i-16]
+ vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA] ; + W[i-7]
+ vpaddd %%T1, %%T1, a1 ; T1 = W[i]
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+section .data
+default rel
+align 16
+; vpshufb control mask that reverses the byte order inside each 32-bit lane;
+; applied to every message word as it is loaded.
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+;; SHA256_ARGS:
+;; UINT128 digest[8]; // transposed digests
+;; UINT8 *data_ptr[4];
+;;
+
+;; void sha_256_mult_avx(SHA256_ARGS *args, UINT64 num_blocks);
+;; arg 1 : STATE : pointer args
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+;; Processes num_blocks 64-byte blocks from each of the 4 lanes in parallel.
+;; On return the transposed digest in *args is updated and every data_ptr[]
+;; entry has been advanced past the consumed input.
+;; Clobbers: rax, rbx, r8-r12, INP_SIZE, xmm0-15, flags. The outer calling
+;; routine is responsible for saving/restoring GP and XMM registers.
+;; Stack: STACK_size bytes; the vmovdqa accesses to [rsp + ...] rely on rsp
+;; being 16-byte aligned after the sub (see "Stack must be aligned" above).
+;;
+MKGLOBAL(sha_256_mult_avx,function,internal)
+align 16
+sha_256_mult_avx:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqa a,[STATE+0*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa b,[STATE+1*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa c,[STATE+2*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa d,[STATE+3*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa e,[STATE+4*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa f,[STATE+5*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa g,[STATE+6*SHA256_DIGEST_ROW_SIZE]
+ vmovdqa h,[STATE+7*SHA256_DIGEST_ROW_SIZE]
+
+ lea TBL,[rel K256_4]
+
+ ;; load the address of each of the 4 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ]
+ mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ]
+ mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ]
+
+ xor IDX, IDX ; IDX = bytes consumed so far in every lane
+lloop:
+ xor ROUND, ROUND ; restart at the first round constant
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+ ;; Rounds 0..15: each iteration loads 16 bytes from every lane,
+ ;; transposes them into 4 message words, byte-swaps and runs 4 rounds.
+ ;; TMP is reloaded every iteration because the round macros clobber it.
+%assign i 0
+%rep 4
+ vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPS TT2,[inp0+IDX+i*16]
+ VMOVPS TT1,[inp1+IDX+i*16]
+ VMOVPS TT4,[inp2+IDX+i*16]
+ VMOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4 ; one 64-byte message block consumed per lane
+
+%assign i (i*4)
+
+ ;; Rounds 16..63: the 16 unrolled rounds below are executed repeatedly
+ ;; until ROUND reaches ROUNDS.
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ4]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqa [STATE+0*SHA256_DIGEST_ROW_SIZE],a
+ vmovdqa [STATE+1*SHA256_DIGEST_ROW_SIZE],b
+ vmovdqa [STATE+2*SHA256_DIGEST_ROW_SIZE],c
+ vmovdqa [STATE+3*SHA256_DIGEST_ROW_SIZE],d
+ vmovdqa [STATE+4*SHA256_DIGEST_ROW_SIZE],e
+ vmovdqa [STATE+5*SHA256_DIGEST_ROW_SIZE],f
+ vmovdqa [STATE+6*SHA256_DIGEST_ROW_SIZE],g
+ vmovdqa [STATE+7*SHA256_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ ; (same PTR_SZ stride as the loads above, instead of a hard-coded 8)
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha256 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha256 + 1*PTR_SZ], inp1
+ add inp2, IDX
+ mov [STATE + _data_ptr_sha256 + 2*PTR_SZ], inp2
+ add inp3, IDX
+ mov [STATE + _data_ptr_sha256 + 3*PTR_SZ], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+%ifdef SAFE_DATA
+ ;; Clear stack frame ((16 + 8)*16 bytes)
+ ;; xmm0 may be overwritten here: the digest was already written back.
+ vpxor xmm0, xmm0
+%assign i 0
+%rep (16+NUM_SHA256_DIGEST_WORDS)
+ vmovdqa [rsp + i*SZ4], xmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, STACK_size
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/snow3g_avx.c b/src/spdk/intel-ipsec-mb/avx/snow3g_avx.c
new file mode 100644
index 000000000..8c6995fb8
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/snow3g_avx.c
@@ -0,0 +1,42 @@
+/*******************************************************************************
+ Copyright (c) 2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/*
+ * Bind the generic SNOW3G_* names to their AVX-suffixed symbols and then
+ * include the common header. NOTE(review): presumably snow3g_common.h
+ * defines the functions under these names - confirm against that header.
+ */
+#define AVX
+#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_avx
+#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_avx
+#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_avx
+#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_avx
+#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_avx
+#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_avx
+#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_avx
+#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_avx
+#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_avx
+#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_avx
+#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_avx
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
+
+#include "include/snow3g_common.h"
diff --git a/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm b/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm
new file mode 100755
index 000000000..e7c6bad8a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm
@@ -0,0 +1,1146 @@
+;;
+;; Copyright (c) 2009-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+
+extern lookup_8bit_avx
+
+section .data
+default rel
+align 64
+S0:
+db 0x3e,0x72,0x5b,0x47,0xca,0xe0,0x00,0x33,0x04,0xd1,0x54,0x98,0x09,0xb9,0x6d,0xcb
+db 0x7b,0x1b,0xf9,0x32,0xaf,0x9d,0x6a,0xa5,0xb8,0x2d,0xfc,0x1d,0x08,0x53,0x03,0x90
+db 0x4d,0x4e,0x84,0x99,0xe4,0xce,0xd9,0x91,0xdd,0xb6,0x85,0x48,0x8b,0x29,0x6e,0xac
+db 0xcd,0xc1,0xf8,0x1e,0x73,0x43,0x69,0xc6,0xb5,0xbd,0xfd,0x39,0x63,0x20,0xd4,0x38
+db 0x76,0x7d,0xb2,0xa7,0xcf,0xed,0x57,0xc5,0xf3,0x2c,0xbb,0x14,0x21,0x06,0x55,0x9b
+db 0xe3,0xef,0x5e,0x31,0x4f,0x7f,0x5a,0xa4,0x0d,0x82,0x51,0x49,0x5f,0xba,0x58,0x1c
+db 0x4a,0x16,0xd5,0x17,0xa8,0x92,0x24,0x1f,0x8c,0xff,0xd8,0xae,0x2e,0x01,0xd3,0xad
+db 0x3b,0x4b,0xda,0x46,0xeb,0xc9,0xde,0x9a,0x8f,0x87,0xd7,0x3a,0x80,0x6f,0x2f,0xc8
+db 0xb1,0xb4,0x37,0xf7,0x0a,0x22,0x13,0x28,0x7c,0xcc,0x3c,0x89,0xc7,0xc3,0x96,0x56
+db 0x07,0xbf,0x7e,0xf0,0x0b,0x2b,0x97,0x52,0x35,0x41,0x79,0x61,0xa6,0x4c,0x10,0xfe
+db 0xbc,0x26,0x95,0x88,0x8a,0xb0,0xa3,0xfb,0xc0,0x18,0x94,0xf2,0xe1,0xe5,0xe9,0x5d
+db 0xd0,0xdc,0x11,0x66,0x64,0x5c,0xec,0x59,0x42,0x75,0x12,0xf5,0x74,0x9c,0xaa,0x23
+db 0x0e,0x86,0xab,0xbe,0x2a,0x02,0xe7,0x67,0xe6,0x44,0xa2,0x6c,0xc2,0x93,0x9f,0xf1
+db 0xf6,0xfa,0x36,0xd2,0x50,0x68,0x9e,0x62,0x71,0x15,0x3d,0xd6,0x40,0xc4,0xe2,0x0f
+db 0x8e,0x83,0x77,0x6b,0x25,0x05,0x3f,0x0c,0x30,0xea,0x70,0xb7,0xa1,0xe8,0xa9,0x65
+db 0x8d,0x27,0x1a,0xdb,0x81,0xb3,0xa0,0xf4,0x45,0x7a,0x19,0xdf,0xee,0x78,0x34,0x60
+
+S1:
+db 0x55,0xc2,0x63,0x71,0x3b,0xc8,0x47,0x86,0x9f,0x3c,0xda,0x5b,0x29,0xaa,0xfd,0x77
+db 0x8c,0xc5,0x94,0x0c,0xa6,0x1a,0x13,0x00,0xe3,0xa8,0x16,0x72,0x40,0xf9,0xf8,0x42
+db 0x44,0x26,0x68,0x96,0x81,0xd9,0x45,0x3e,0x10,0x76,0xc6,0xa7,0x8b,0x39,0x43,0xe1
+db 0x3a,0xb5,0x56,0x2a,0xc0,0x6d,0xb3,0x05,0x22,0x66,0xbf,0xdc,0x0b,0xfa,0x62,0x48
+db 0xdd,0x20,0x11,0x06,0x36,0xc9,0xc1,0xcf,0xf6,0x27,0x52,0xbb,0x69,0xf5,0xd4,0x87
+db 0x7f,0x84,0x4c,0xd2,0x9c,0x57,0xa4,0xbc,0x4f,0x9a,0xdf,0xfe,0xd6,0x8d,0x7a,0xeb
+db 0x2b,0x53,0xd8,0x5c,0xa1,0x14,0x17,0xfb,0x23,0xd5,0x7d,0x30,0x67,0x73,0x08,0x09
+db 0xee,0xb7,0x70,0x3f,0x61,0xb2,0x19,0x8e,0x4e,0xe5,0x4b,0x93,0x8f,0x5d,0xdb,0xa9
+db 0xad,0xf1,0xae,0x2e,0xcb,0x0d,0xfc,0xf4,0x2d,0x46,0x6e,0x1d,0x97,0xe8,0xd1,0xe9
+db 0x4d,0x37,0xa5,0x75,0x5e,0x83,0x9e,0xab,0x82,0x9d,0xb9,0x1c,0xe0,0xcd,0x49,0x89
+db 0x01,0xb6,0xbd,0x58,0x24,0xa2,0x5f,0x38,0x78,0x99,0x15,0x90,0x50,0xb8,0x95,0xe4
+db 0xd0,0x91,0xc7,0xce,0xed,0x0f,0xb4,0x6f,0xa0,0xcc,0xf0,0x02,0x4a,0x79,0xc3,0xde
+db 0xa3,0xef,0xea,0x51,0xe6,0x6b,0x18,0xec,0x1b,0x2c,0x80,0xf7,0x74,0xe7,0xff,0x21
+db 0x5a,0x6a,0x54,0x1e,0x41,0x31,0x92,0x35,0xc4,0x33,0x07,0x0a,0xba,0x7e,0x0e,0x34
+db 0x88,0xb1,0x98,0x7c,0xf3,0x3d,0x60,0x6c,0x7b,0xca,0xd3,0x1f,0x32,0x65,0x04,0x28
+db 0x64,0xbe,0x85,0x9b,0x2f,0x59,0x8a,0xd7,0xb0,0x25,0xac,0xaf,0x12,0x03,0xe2,0xf2
+
+; 16 word-sized constants, one per LFSR cell; key_expand_4 reads them
+; through rbx as the middle (15-bit) field of each initial LFSR value.
+EK_d:
+dw 0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF,
+dw 0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC
+
+; Per-lane mask keeping the low 31 bits; loaded into xmm12 (MASK31) with an
+; aligned load. NOTE: there is no explicit align directive here, so the
+; 16-byte alignment relies on the sizes of the tables placed before it.
+mask31:
+dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
+
+; vpshufb lookup table: entry n is the 4-bit value n with its bits reversed
+align 16
+bit_reverse_table_l:
+db 0x00, 0x08, 0x04, 0x0c, 0x02, 0x0a, 0x06, 0x0e, 0x01, 0x09, 0x05, 0x0d, 0x03, 0x0b, 0x07, 0x0f
+
+; Same as above with the reversed nibble placed in the high half of the byte
+align 16
+bit_reverse_table_h:
+db 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0
+
+; Mask selecting the low nibble of every byte
+align 16
+bit_reverse_and_table:
+db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
+
+; Mask keeping the low 64 bits of an XMM register
+align 16
+data_mask_64bits:
+dd 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+
+; Entry n has its n most significant bits set (n = 0..7)
+bit_mask_table:
+db 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
+
+
+section .text
+align 64
+
+; Byte offsets with a 4-byte stride, starting after 16 dword entries.
+; NOTE(review): presumably the single-buffer state layout - these names are
+; not referenced in the 4-buffer code below; confirm before relying on them.
+%define OFFSET_FR1 (16*4)
+%define OFFSET_FR2 (17*4)
+%define OFFSET_BRC_X0 (18*4)
+%define OFFSET_BRC_X1 (19*4)
+%define OFFSET_BRC_X2 (20*4)
+%define OFFSET_BRC_X3 (21*4)
+
+; Register holding the mask31 constant for add_mod31 / rot_mod31
+%define MASK31 xmm12
+
+; Byte offsets into the 4-lane state addressed through rax: the 16 LFSR
+; cells come first (16 bytes each = 4 lanes x 4 bytes), followed by R1, R2
+; and the bit-reorganization outputs X0..X3, 16 bytes each.
+%define OFS_R1 (16*(4*4))
+%define OFS_R2 (OFS_R1 + (4*4))
+%define OFS_X0 (OFS_R2 + (4*4))
+%define OFS_X1 (OFS_X0 + (4*4))
+%define OFS_X2 (OFS_X1 + (4*4))
+%define OFS_X3 (OFS_X2 + (4*4))
+
+; Stack space needed by FUNC_SAVE for xmm6-xmm15 (Windows only)
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET XMM_STORAGE
+
+; FUNC_SAVE
+; Function prologue: pushes r12-r15 (plus rdi/rsi on win64), remembers the
+; resulting rsp in r14, then reserves VARIABLE_OFFSET bytes and aligns rsp
+; down to 64 bytes. On win64 xmm6-xmm15 are also stored in the new frame.
+; r14 must stay untouched until FUNC_RESTORE, which uses it to recover rsp.
+%macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+%ifidn __OUTPUT_FORMAT__, win64
+ push rdi
+ push rsi
+%endif
+ mov r14, rsp ; r14 = rsp before the aligned frame is carved out
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm7
+ vmovdqu [rsp + 2*16],xmm8
+ vmovdqu [rsp + 3*16],xmm9
+ vmovdqu [rsp + 4*16],xmm10
+ vmovdqu [rsp + 5*16],xmm11
+ vmovdqu [rsp + 6*16],xmm12
+ vmovdqu [rsp + 7*16],xmm13
+ vmovdqu [rsp + 8*16],xmm14
+ vmovdqu [rsp + 9*16],xmm15
+%endif
+%endmacro
+
+
+; FUNC_RESTORE
+; Function epilogue matching FUNC_SAVE: reloads xmm6-xmm15 on win64, takes
+; rsp back from r14 and pops the GP registers in reverse push order.
+; Requires rsp and r14 to still hold the values FUNC_SAVE left in them.
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 9*16]
+ vmovdqu xmm14, [rsp + 8*16]
+ vmovdqu xmm13, [rsp + 7*16]
+ vmovdqu xmm12, [rsp + 6*16]
+ vmovdqu xmm11, [rsp + 5*16]
+ vmovdqu xmm10, [rsp + 4*16]
+ vmovdqu xmm9, [rsp + 3*16]
+ vmovdqu xmm8, [rsp + 2*16]
+ vmovdqu xmm7, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+%endif
+ mov rsp, r14 ; drop the aligned frame
+%ifidn __OUTPUT_FORMAT__, win64
+ pop rsi
+ pop rdi
+%endif
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;
+;; make_u31 Rt, Ke, Ek, Iv
+;;
+;; Pack three fields into the 32-bit register Rt by shifting them in from
+;; the top, lowest field first:
+;;   Rt = (low 9 bits of Ke) : (low 15 bits of Ek) : (low 8 bits of Iv)
+;; %1 = Rt (out), %2 = Ke, %3 = Ek, %4 = Iv (inputs, not modified)
+;;
+%macro make_u31 4
+ xor %1, %1 ; start from zero
+ shrd %1, %4, 8 ; Iv -> bits 31..24
+ shrd %1, %3, 15 ; Ek -> bits 31..17, Iv moves to bits 16..9
+ shrd %1, %2, 9 ; Ke -> bits 31..23, Ek to 22..8, Iv to 7..0
+%endmacro
+
+
+;
+; bits_reorg4()
+;
+; Bit reorganization for 4 lanes: builds X0..X3 from eight LFSR cells and
+; stores them at OFS_X0..OFS_X3.
+;
+; params
+; %1 - round number (selects where cell 0 currently sits in the circular LFSR)
+; rax - LFSR pointer
+; uses
+; xmm0-xmm3, xmm5, xmm7, xmm9, xmm11, xmm14, xmm15
+; return
+; [rax + OFS_X0 .. OFS_X3] updated
+;
+%macro bits_reorg4 1
+ ;
+ ; xmm15 = LFSR_S15
+ ; xmm14 = LFSR_S14
+ ; xmm11 = LFSR_S11
+ ; xmm9 = LFSR_S9
+ ; xmm7 = LFSR_S7
+ ; xmm5 = LFSR_S5
+ ; xmm2 = LFSR_S2
+ ; xmm0 = LFSR_S0
+ ;
+ vmovdqa xmm15, [rax + ((15 + %1) % 16)*16]
+ vmovdqa xmm14, [rax + ((14 + %1) % 16)*16]
+ vmovdqa xmm11, [rax + ((11 + %1) % 16)*16]
+ vmovdqa xmm9, [rax + (( 9 + %1) % 16)*16]
+ vmovdqa xmm7, [rax + (( 7 + %1) % 16)*16]
+ vmovdqa xmm5, [rax + (( 5 + %1) % 16)*16]
+ vmovdqa xmm2, [rax + (( 2 + %1) % 16)*16]
+ vmovdqa xmm0, [rax + (( 0 + %1) % 16)*16]
+
+ ; X0 = bits 30..15 of S15 in the high half, bits 15..0 of S14 in the low half
+ vpxor xmm1, xmm1
+ vpslld xmm15, 1 ; move bits 30..15 of S15 up to 31..16
+ vpblendw xmm3, xmm14, xmm1, 0xAA ; xmm3 = S14 with high words zeroed
+ vpblendw xmm15, xmm3, xmm15, 0xAA ; high words from S15<<1, low from S14
+
+ vmovdqa [rax + OFS_X0], xmm15 ; BRC_X0
+ ; X1 = (S11 << 16) | (S9 >> 15)
+ vpslld xmm11, 16
+ vpsrld xmm9, 15
+ vpor xmm11, xmm9
+ vmovdqa [rax + OFS_X1], xmm11 ; BRC_X1
+ ; X2 = (S7 << 16) | (S5 >> 15)
+ vpslld xmm7, 16
+ vpsrld xmm5, 15
+ vpor xmm7, xmm5
+ vmovdqa [rax + OFS_X2], xmm7 ; BRC_X2
+ ; X3 = (S2 << 16) | (S0 >> 15)
+ vpslld xmm2, 16
+ vpsrld xmm0, 15
+ vpor xmm2, xmm0
+ vmovdqa [rax + OFS_X3], xmm2 ; BRC_X3
+%endmacro
+
+; lookup_single_sbox table, idx_val
+;
+; Replace idx_val with the zero-extended byte found at table[idx_val].
+; Without SAFE_LOOKUP this is a single indexed byte load. With SAFE_LOOKUP
+; the lookup goes through lookup_8bit_avx(table, idx_val, 256) and every
+; register that call may change is saved and restored around it, so the
+; only register modified on exit is idx_val in both builds.
+%macro lookup_single_sbox 2
+%define %%table %1 ; [in] Pointer to table to look up
+%define %%idx_val %2 ; [in/out] Index to look up and returned value (rcx, rdx, r8, r9)
+
+%ifdef SAFE_LOOKUP
+ ;; Save all registers used in lookup_8bit (xmm0-5, r9,r10)
+ ;; and registers for param passing and return (4 regs, OS dependent)
+ ;; (6*16 + 6*8 = 144 bytes)
+ sub rsp, 144
+
+ vmovdqu [rsp], xmm0
+ vmovdqu [rsp + 16], xmm1
+ vmovdqu [rsp + 32], xmm2
+ vmovdqu [rsp + 48], xmm3
+ vmovdqu [rsp + 64], xmm4
+ vmovdqu [rsp + 80], xmm5
+ mov [rsp + 96], r9
+ mov [rsp + 104], r10
+
+%ifdef LINUX
+ mov [rsp + 112], rdi
+ mov [rsp + 120], rsi
+ mov [rsp + 128], rdx
+ mov rdi, %%table
+ mov rsi, %%idx_val
+ mov rdx, 256
+%else
+ ;; The register that carries idx_val is about to be overwritten with
+ ;; the result, so it is deliberately not saved.
+%ifnidni %%idx_val, rcx
+ mov [rsp + 112], rcx
+%endif
+%ifnidni %%idx_val, rdx
+ mov [rsp + 120], rdx
+%endif
+%ifnidni %%idx_val, r8
+ mov [rsp + 128], r8
+%endif
+
+ ;; rdx is loaded before rcx so that idx_val == rcx is read before
+ ;; rcx is replaced by the table pointer
+ mov rdx, %%idx_val
+ mov rcx, %%table
+ mov r8, 256
+%endif
+ mov [rsp + 136], rax
+
+ call lookup_8bit_avx
+
+ ;; Restore all registers
+ vmovdqu xmm0, [rsp]
+ vmovdqu xmm1, [rsp + 16]
+ vmovdqu xmm2, [rsp + 32]
+ vmovdqu xmm3, [rsp + 48]
+ vmovdqu xmm4, [rsp + 64]
+ vmovdqu xmm5, [rsp + 80]
+ mov r9, [rsp + 96]
+ mov r10, [rsp + 104]
+
+%ifdef LINUX
+ mov rdi, [rsp + 112]
+ mov rsi, [rsp + 120]
+ mov rdx, [rsp + 128]
+%else
+ ;; Each restore is guarded by the same condition as its save above
+%ifnidni %%idx_val, rcx
+ mov rcx, [rsp + 112]
+%endif
+%ifnidni %%idx_val, rdx
+ mov rdx, [rsp + 120]
+%endif
+%ifnidni %%idx_val, r8
+ mov r8, [rsp + 128]
+%endif
+%endif
+
+ ;; Move returned value from lookup function, before restoring rax
+ mov DWORD(%%idx_val), eax
+ mov rax, [rsp + 136]
+
+ add rsp, 144
+
+%else ;; SAFE_LOOKUP
+
+ movzx DWORD(%%idx_val), BYTE [%%table + %%idx_val]
+
+%endif ;; SAFE_LOOKUP
+%endmacro
+
+;
+; sbox_lkup()
+;
+; Run the four bytes of one 32-bit lane through the S-box tables and store
+; the recombined dword into the state. Bytes 0 and 2 are looked up through
+; rsi, bytes 1 and 3 through rdi.
+;
+; params
+; %1 R1/R2 table offset
+; %2 R1/R2 entry offset
+; %3 xmm reg name
+; uses
+; rcx,rdx,r8,r9,r10 (clobbered); rsi,rdi (table pointers) and rax (state
+; pointer) are read only
+; return
+; [rax + %1 + %2*4] = substituted dword
+;
+%macro sbox_lkup 3
+ vpextrb rcx, %3, (0 + (%2 * 4))
+ lookup_single_sbox rsi, rcx
+ vpextrb rdx, %3, (1 + (%2 * 4))
+ lookup_single_sbox rdi, rdx
+
+ xor r10, r10
+ vpextrb r8, %3, (2 + (%2 * 4))
+ lookup_single_sbox rsi, r8
+ vpextrb r9, %3, (3 + (%2 * 4))
+ lookup_single_sbox rdi, r9
+
+ ; Shift the four result bytes into r10d from the top, lowest byte first,
+ ; so the byte from rcx ends in bits 7..0 and the byte from r9 in bits 31..24
+ shrd r10d, ecx, 8
+ shrd r10d, edx, 8
+ shrd r10d, r8d, 8
+ shrd r10d, r9d, 8
+ mov [rax + %1 + (%2 * 4)], r10d
+%endmacro
+
+
+;
+; rot_mod32 dst, src, n
+;
+; dst = src rotated left by n bits within each 32-bit lane.
+; %1 = dst (out), %2 = src, %3 = n (immediate)
+; clobbers xmm7
+;
+%macro rot_mod32 3
+%define %%dst %1
+%define %%src %2
+%define %%cnt %3
+ vpslld %%dst, %%src, %%cnt ; bits that stay, shifted up
+ vpsrld xmm7, %%src, (32 - %%cnt) ; bits that wrap around to the bottom
+ vpor %%dst, %%dst, xmm7
+%endmacro
+
+
+;
+; nonlin_fun4()
+;
+; Nonlinear function F for 4 lanes: optionally computes W, then updates
+; R1 and R2 in the state from X1/X2 through the linear transforms L1/L2
+; and the S-box lookups.
+;
+; params
+; %1 == 1, then calculate W
+; rax - state pointer (X0..X2, R1, R2 are read; R1, R2 are written)
+; rsi, rdi - S-box table pointers consumed by sbox_lkup
+; uses
+; xmm0-xmm7, rcx, rdx, r8, r9, r10
+; return
+; xmm0 = W value, updates F_R1[] / F_R2[]
+;
+%macro nonlin_fun4 1
+
+%if (%1 == 1)
+ vmovdqa xmm0, [rax + OFS_X0]
+ vpxor xmm0, [rax + OFS_R1]
+ vpaddd xmm0, [rax + OFS_R2] ; W = (BRC_X0 ^ F_R1) + F_R2
+%endif
+ ;
+ vmovdqa xmm1, [rax + OFS_R1]
+ vmovdqa xmm2, [rax + OFS_R2]
+ vpaddd xmm1, [rax + OFS_X1] ; W1 = F_R1 + BRC_X1
+ vpxor xmm2, [rax + OFS_X2] ; W2 = F_R2 ^ BRC_X2
+ ;
+
+ ; Swap halves between W1 and W2:
+ ; P = (W1 << 16) | (W2 >> 16), Q = (W2 << 16) | (W1 >> 16)
+ vpslld xmm3, xmm1, 16
+ vpsrld xmm4, xmm1, 16
+ vpslld xmm5, xmm2, 16
+ vpsrld xmm6, xmm2, 16
+ vpor xmm1, xmm3, xmm6 ; xmm1 = P
+ vpor xmm2, xmm4, xmm5 ; xmm2 = Q
+
+ ; L1(P) = P ^ rotl(P,2) ^ rotl(P,10) ^ rotl(P,18) ^ rotl(P,24)
+ rot_mod32 xmm3, xmm1, 2
+ rot_mod32 xmm4, xmm1, 10
+ rot_mod32 xmm5, xmm1, 18
+ rot_mod32 xmm6, xmm1, 24
+ vpxor xmm1, xmm3
+ vpxor xmm1, xmm4
+ vpxor xmm1, xmm5
+ vpxor xmm1, xmm6 ; XMM1 = U = L1(P)
+
+ sbox_lkup OFS_R1, 0, xmm1 ; F_R1[0]
+ sbox_lkup OFS_R1, 1, xmm1 ; F_R1[1]
+ sbox_lkup OFS_R1, 2, xmm1 ; F_R1[2]
+ sbox_lkup OFS_R1, 3, xmm1 ; F_R1[3]
+ ; L2(Q) = Q ^ rotl(Q,8) ^ rotl(Q,14) ^ rotl(Q,22) ^ rotl(Q,30)
+ rot_mod32 xmm3, xmm2, 8
+ rot_mod32 xmm4, xmm2, 14
+ rot_mod32 xmm5, xmm2, 22
+ rot_mod32 xmm6, xmm2, 30
+ vpxor xmm2, xmm3
+ vpxor xmm2, xmm4
+ vpxor xmm2, xmm5
+ vpxor xmm2, xmm6 ; XMM2 = V = L2(Q)
+ ;
+
+ sbox_lkup OFS_R2, 0, xmm2 ; F_R2[0]
+ sbox_lkup OFS_R2, 1, xmm2 ; F_R2[1]
+ sbox_lkup OFS_R2, 2, xmm2 ; F_R2[2]
+ sbox_lkup OFS_R2, 3, xmm2 ; F_R2[3]
+%endmacro
+
+
+;
+; store_kstr4()
+;
+; XOR W with X3 and write one keystream dword per lane, then advance the
+; four output pointers by 4 bytes.
+;
+; params
+; the four keystream pointers sit on top of the stack, pKeyStr4 first
+; (they must have been pushed in the order pKeyStr1..pKeyStr4)
+; rax - state pointer (X3 is read)
+; uses
+; xmm0 as input
+; rcx, rdx, r8, r9, r12-r15 (clobbered)
+; return
+; updated pointers are pushed back in the same order
+;
+%macro store_kstr4 0
+ vpxor xmm0, [rax + OFS_X3]
+ vpextrd r15d, xmm0, 3
+ pop r9 ; *pKeyStr4
+ vpextrd r14d, xmm0, 2
+ pop r8 ; *pKeyStr3
+ vpextrd r13d, xmm0, 1
+ pop rdx ; *pKeyStr2
+ vpextrd r12d, xmm0, 0
+ pop rcx ; *pKeyStr1
+ mov [r9], r15d
+ mov [r8], r14d
+ mov [rdx], r13d
+ mov [rcx], r12d
+ add rcx, 4
+ add rdx, 4
+ add r8, 4
+ add r9, 4
+ push rcx
+ push rdx
+ push r8
+ push r9
+%endmacro
+
+
+;
+; add_mod31 acc, addend
+; Per 32-bit lane: acc = acc + addend, then the carry out of bit 30 is
+; folded back into the low 31 bits (one reduction step mod 2^31-1).
+; params
+; %1 - acc (in/out)
+; %2 - addend
+; uses
+; xmm2, MASK31
+; return
+; %1
+%macro add_mod31 2
+%define %%acc %1
+%define %%addend %2
+ vpaddd %%acc, %%acc, %%addend ; plain 32-bit sum
+ vpsrld xmm2, %%acc, 31 ; xmm2 = bit 31 of the sum (0 or 1)
+ vpand %%acc, %%acc, MASK31 ; keep the low 31 bits
+ vpaddd %%acc, %%acc, xmm2 ; add the carry back in
+%endmacro
+
+
+;
+; rot_mod31 val, n
+; Per 32-bit lane: rotate the 31-bit value left by n bits, i.e. multiply
+; by 2^n and reduce mod (2^31-1).
+; params
+; %1 - val (in/out)
+; %2 - n, number of bits
+; uses
+; xmm2, MASK31
+; return
+; %1
+%macro rot_mod31 2
+%define %%val %1
+%define %%cnt %2
+ vpslld xmm2, %%val, %%cnt ; part shifted up
+ vpsrld %%val, %%val, (31 - %%cnt) ; part wrapping around to the bottom
+ vpor %%val, %%val, xmm2
+ vpand %%val, %%val, MASK31 ; drop bit 31
+%endmacro
+
+
+;
+; lfsr_updt4()
+;
+; One LFSR step for 4 lanes. Computes, mod (2^31-1),
+;   xmm0 + (1 + 2^8)*S0 + 2^20*S4 + 2^21*S10 + 2^17*S13 + 2^15*S15
+; and writes it over the cell that currently holds S0. The LFSR is used as
+; a circular buffer indexed by (cell + round) % 16, so no cells are moved.
+;
+; params
+; %1 - round number
+; rax - LFSR pointer
+; uses
+; xmm0 as input (ZERO or W)
+; xmm1, xmm2, xmm3, xmm4, xmm10, xmm13, xmm15 (clobbered), MASK31
+; return
+; xmm0 = new LFSR value (also stored to the state)
+;
+%macro lfsr_updt4 1
+ ;
+ ; xmm1 = LFSR_S0
+ ; xmm4 = LFSR_S4
+ ; xmm10 = LFSR_S10
+ ; xmm13 = LFSR_S13
+ ; xmm15 = LFSR_S15
+ ;
+ vpxor xmm3, xmm3 ; xmm3 is zeroed but not read again in this macro
+ vmovdqa xmm1, [rax + (( 0 + %1) % 16)*16]
+ vmovdqa xmm4, [rax + (( 4 + %1) % 16)*16]
+ vmovdqa xmm10, [rax + ((10 + %1) % 16)*16]
+ vmovdqa xmm13, [rax + ((13 + %1) % 16)*16]
+ vmovdqa xmm15, [rax + ((15 + %1) % 16)*16]
+
+ ; Calculate LFSR feedback
+ add_mod31 xmm0, xmm1 ; + S0
+ rot_mod31 xmm1, 8
+ add_mod31 xmm0, xmm1 ; + 2^8 * S0
+ rot_mod31 xmm4, 20
+ add_mod31 xmm0, xmm4 ; + 2^20 * S4
+ rot_mod31 xmm10, 21
+ add_mod31 xmm0, xmm10 ; + 2^21 * S10
+ rot_mod31 xmm13, 17
+ add_mod31 xmm0, xmm13 ; + 2^17 * S13
+ rot_mod31 xmm15, 15
+ add_mod31 xmm0, xmm15 ; + 2^15 * S15
+
+
+
+ vmovdqa [rax + (( 0 + %1) % 16)*16], xmm0
+
+ ; The slot that held S0 now holds the new value; with the round number
+ ; advanced by one it is addressed as S15.
+%endmacro
+
+
+;
+; key_expand_4 index, lane
+;
+; Build the initial values of LFSR cells index and index+1 for one lane.
+; Each value packs key byte, EK_d constant and IV byte via make_u31.
+;
+; params
+; %1 - index of the first of the two LFSR cells
+; %2 - lane number (0..3)
+; rdi - key bytes, rsi - IV bytes, rbx - constant words, rax - LFSR pointer
+; uses
+; r8d-r15d (clobbered)
+;
+%macro key_expand_4 2
+ movzx r8d, byte [rdi + (%1 + 0)]
+ movzx r9d, word [rbx + ((%1 + 0)*2)]
+ movzx r10d, byte [rsi + (%1 + 0)]
+ make_u31 r11d, r8d, r9d, r10d
+ mov [rax + (((%1 + 0)*16)+(%2*4))], r11d
+
+ movzx r12d, byte [rdi + (%1 + 1)]
+ movzx r13d, word [rbx + ((%1 + 1)*2)]
+ movzx r14d, byte [rsi + (%1 + 1)]
+ make_u31 r15d, r12d, r13d, r14d
+ mov [rax + (((%1 + 1)*16)+(%2*4))], r15d
+%endmacro
+
+
+;;
+;; asm_ZucInitialization_4_avx(pKe, pIv, pState)
+;;
+;; Initializes the 4-lane ZUC state: loads key/IV material into the LFSR
+;; for each lane, then runs 32 initialization rounds plus one more round.
+;;
+;; pKe    - pointer to an array of 4 key pointers
+;; pIv    - pointer to an array of 4 IV pointers
+;; pState - pointer to the 4-lane state (LFSR, R1, R2, X0..X3)
+;;
+;; The state/key/IV pointers are kept on the stack and re-read when needed,
+;; because the macros below clobber rcx, rdx and r8-r15.
+;; NOTE(review): R1/R2 are read in the first round but not written by this
+;; function beforehand - presumably the caller zeroes them; confirm.
+;; xmm registers are not saved here.
+;;
+MKGLOBAL(asm_ZucInitialization_4_avx,function,internal)
+asm_ZucInitialization_4_avx:
+
+%ifdef LINUX
+ %define pKe rdi
+ %define pIv rsi
+ %define pState rdx
+%else
+ %define pKe rcx
+ %define pIv rdx
+ %define pState r8
+%endif
+
+ ; Save non-volatile registers
+ push rbx
+ push rdi
+ push rsi
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdx
+
+ lea rax, [pState] ; load pointer to LFSR
+ push pState ; Save LFSR Pointer to stack
+
+ ; setup the key pointer for first buffer key expand
+ mov rbx, [pKe] ; load the pointer to the first key into rbx
+
+ push pKe ; save the key array pointer to the stack
+ lea rdi, [rbx] ; rdi = pointer to the first key
+
+
+ ; setup the IV pointer for first buffer key expand
+ mov rcx, [pIv] ; load the pointer to the first IV
+ push pIv ; save the IV array pointer to the stack
+ lea rsi, [rcx] ; rsi = pointer to the first IV
+
+ lea rbx, [EK_d] ; load D variables
+
+ ; Expand key packet 1
+ key_expand_4 0, 0
+ key_expand_4 2, 0
+ key_expand_4 4, 0
+ key_expand_4 6, 0
+ key_expand_4 8, 0
+ key_expand_4 10, 0
+ key_expand_4 12, 0
+ key_expand_4 14, 0
+
+
+ ;second packet key expand here - reset pointers
+ pop rdx ; get IV array pointer from Stack
+ mov rcx, [rdx+8] ; load pointer to IV 2 from the array
+ lea rsi, [rcx] ; rsi = pointer to IV 2
+
+ pop rbx ; get Key array pointer from Stack
+ mov rcx, [rbx+8] ; load pointer to key 2 from the array
+ lea rdi, [rcx] ; rdi = pointer to Key 2
+
+ push rbx ; save Key pointer
+ push rdx ; save IV pointer
+
+ lea rbx, [EK_d]
+
+ ; Expand key packet 2
+ key_expand_4 0, 1
+ key_expand_4 2, 1
+ key_expand_4 4, 1
+ key_expand_4 6, 1
+ key_expand_4 8, 1
+ key_expand_4 10, 1
+ key_expand_4 12, 1
+ key_expand_4 14, 1
+
+
+
+ ;Third packet key expand here - reset pointers
+ pop rdx ; get IV array pointer from Stack
+ mov rcx, [rdx+16] ; load pointer to IV 3 from the array
+ lea rsi, [rcx] ; rsi = pointer to IV 3
+
+ pop rbx ; get Key array pointer from Stack
+ mov rcx, [rbx+16] ; load pointer to key 3 from the array
+ lea rdi, [rcx] ; rdi = pointer to Key 3
+
+ push rbx ; save Key pointer
+ push rdx ; save IV pointer
+ lea rbx, [EK_d]
+ ; Expand key packet 3
+ key_expand_4 0, 2
+ key_expand_4 2, 2
+ key_expand_4 4, 2
+ key_expand_4 6, 2
+ key_expand_4 8, 2
+ key_expand_4 10, 2
+ key_expand_4 12, 2
+ key_expand_4 14, 2
+
+
+
+ ;fourth packet key expand here - reset pointers
+ ; (the key/IV array pointers are not pushed back: this is their last use)
+ pop rdx ; get IV array pointer from Stack
+ mov rcx, [rdx+24] ; load pointer to IV 4 from the array
+ lea rsi, [rcx] ; rsi = pointer to IV 4
+
+ pop rbx ; get Key array pointer from Stack
+ mov rcx, [rbx+24] ; load pointer to key 4 from the array
+ lea rdi, [rcx] ; rdi = pointer to Key 4
+ lea rbx, [EK_d]
+ ; Expand key packet 4
+ key_expand_4 0, 3
+ key_expand_4 2, 3
+ key_expand_4 4, 3
+ key_expand_4 6, 3
+ key_expand_4 8, 3
+ key_expand_4 10, 3
+ key_expand_4 12, 3
+ key_expand_4 14, 3
+
+ ; Set R1 and R2 to zero
+ ;xor r10, r10
+ ;xor r11, r11
+
+
+
+ ; Load read-only registers
+ lea rdi, [S0] ; used by sbox_lkup() macro
+ lea rsi, [S1]
+ vmovdqa xmm12, [mask31]
+
+ ; Shift LFSR 32-times, update state variables
+ ; The state pointer is reloaded from the stack into rax before each macro
+%assign N 0
+%rep 32
+ pop rdx
+ lea rax, [rdx]
+ push rdx
+
+ bits_reorg4 N
+ nonlin_fun4 1
+ vpsrld xmm0,1 ; Shift out LSB of W
+
+ pop rdx
+ lea rax, [rdx]
+ push rdx
+
+ lfsr_updt4 N ; W (xmm0) used in LFSR update - not set to zero
+%assign N N+1
+%endrep
+
+ ; And once more, initial round from keygen phase = 33 times
+ pop rdx
+ lea rax, [rdx]
+ push rdx
+
+ bits_reorg4 0
+ nonlin_fun4 0
+
+ pop rdx ; last use of the saved state pointer - not pushed back
+ lea rax, [rdx]
+
+ vpxor xmm0, xmm0 ; W is discarded in this round: feed zero to the LFSR
+ lfsr_updt4 0
+
+
+
+ ; Restore non-volatile registers
+ pop rdx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ pop rbx
+
+ ret
+;
+;
+;
+;;
+;; void asm_ZucGenKeystream64B_4_avx(state4_t *pSta, u32* pKeyStr1, u32* pKeyStr2, u32* pKeyStr3, u32* pKeyStr4);
+;;
+;; Generates 64 bytes of keystream for each of the 4 lanes (16 rounds of
+;; one dword per lane) and updates the state in place.
+;;
+;; WIN64
+;; RCX - pSta
+;; RDX - pKeyStr1
+;; R8 - pKeyStr2
+;; R9 - pKeyStr3
+;; Stack - pKeyStr4
+;;
+;; LIN64
+;; RDI - pSta
+;; RSI - pKeyStr1
+;; RDX - pKeyStr2
+;; RCX - pKeyStr3
+;; R8 - pKeyStr4
+;;
+;; xmm registers are not saved here.
+;;
+MKGLOBAL(asm_ZucGenKeystream64B_4_avx,function,internal)
+asm_ZucGenKeystream64B_4_avx:
+
+%ifdef LINUX
+ %define pState rdi
+ %define pKS1 rsi
+ %define pKS2 rdx
+ %define pKS3 rcx
+ %define pKS4 r8
+%else
+ %define pState rcx
+ %define pKS1 rdx
+ %define pKS2 r8
+ %define pKS3 r9
+ %define pKS4 rax
+%endif
+
+%ifndef LINUX
+ ; Must be read before any push: the offset is relative to rsp at entry
+ ; (return address + 4 register-argument slots)
+ mov rax, [rsp + 8*5] ; 5th parameter from stack
+%endif
+
+ ; Save non-volatile registers
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+
+%ifndef LINUX
+ push rdi
+ push rsi
+%endif
+ ; Store 4 keystream pointers on the stack
+ ; (store_kstr4 pops them in reverse order, so pKS4 must be pushed last)
+
+ push pKS1
+ push pKS2
+ push pKS3
+ push pKS4
+
+
+ ; Load state pointer in RAX
+ mov rax, pState
+
+
+ ; Load read-only registers
+ lea rdi, [S0] ; used by sbox_lkup() macro
+ lea rsi, [S1]
+ vmovdqa xmm12, [mask31]
+
+ ; Generate 64B of keystream in 16 rounds
+ ; N runs 1..16; the LFSR macros index cells modulo 16
+%assign N 1
+%rep 16
+ bits_reorg4 N
+ nonlin_fun4 1
+ store_kstr4
+ vpxor xmm0, xmm0 ; the LFSR update takes a zero input here
+ lfsr_updt4 N
+%assign N N+1
+%endrep
+
+ ; Take keystream pointers off (#push = #pops)
+ pop rax
+ pop rax
+ pop rax
+ pop rax
+
+%ifndef LINUX
+ pop rsi
+ pop rdi
+%endif
+
+ ; Restore non-volatile registers
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+
+
+;;
+;; extern uint32_t asm_Eia3RemainderAVX(const void *ks, const void *data, uint64_t n_bits)
+;;
+;; Returns authentication update value to be XOR'ed with current authentication tag
+;;
+;; WIN64
+;; RCX - KS (key stream pointer)
+;; RDX - DATA (data pointer)
+;; R8 - N_BITS (number data bits to process)
+;; LIN64
+;; RDI - KS (key stream pointer)
+;; RSI - DATA (data pointer)
+;; RDX - N_BITS (number data bits to process)
+;;
+align 64
+MKGLOBAL(asm_Eia3RemainderAVX,function,internal)
+asm_Eia3RemainderAVX:
+ ;; Argument-to-register aliases for the active calling convention
+%ifdef LINUX
+ %define KS rdi
+ %define DATA rsi
+ %define N_BITS rdx
+%else
+ %define KS rcx
+ %define DATA rdx
+ %define N_BITS r8
+%endif
+ FUNC_SAVE ; NOTE(review): macro defined outside this view - presumably preserves callee-saved registers; confirm
+
+ vmovdqa xmm5, [bit_reverse_table_l] ; lookup table applied to the high nibbles below
+ vmovdqa xmm6, [bit_reverse_table_h] ; lookup table applied to the low nibbles below
+ vmovdqa xmm7, [bit_reverse_and_table] ; mask separating low nibbles (vpand) from high nibbles (vpandn)
+ vmovdqa xmm10, [data_mask_64bits] ; mask applied to the reversed data (presumably keeps its low 64 bits - table not visible here)
+ vpxor xmm9, xmm9 ; xmm9 = 0: XOR accumulator for all carry-less products
+ ;; Up to three unrolled rounds, each consuming 128 data bits while N_BITS >= 128
+%rep 3
+ cmp N_BITS, 128
+ jb Eia3RoundsAVX_dq_end
+
+ ;; read 16 bytes and reverse bits
+ vmovdqu xmm0, [DATA]
+ vmovdqa xmm1, xmm0
+ vpand xmm1, xmm7 ; xmm1 = data & mask (low nibbles)
+
+ vmovdqa xmm2, xmm7
+ vpandn xmm2, xmm0 ; xmm2 = data & ~mask (high nibbles)
+ vpsrld xmm2, 4 ; move the high nibbles down into index position
+
+ vmovdqa xmm8, xmm6 ; bit reverse low nibbles (use high table)
+ vpshufb xmm8, xmm1
+
+ vmovdqa xmm4, xmm5 ; bit reverse high nibbles (use low table)
+ vpshufb xmm4, xmm2
+
+ vpor xmm8, xmm4
+ ; xmm8 - bit reversed data bytes
+
+ ;; ZUC authentication part
+ ;; - 4x32 data bits
+ ;; - set up KS
+ vmovdqu xmm3, [KS + (0*4)]
+ vmovdqu xmm4, [KS + (2*4)]
+ vpshufd xmm0, xmm3, 0x61 ; dwords {1,0,2,1}: each qword = two adjacent KS words, earlier word in the upper half
+ vpshufd xmm1, xmm4, 0x61
+
+ ;; - set up DATA
+ vmovdqa xmm2, xmm8
+ vpand xmm2, xmm10
+ vpshufd xmm3, xmm2, 0xdc ; dwords {0,3,1,3}: one data dword in the low half of each qword
+ vmovdqa xmm4, xmm3
+
+ vpsrldq xmm8, 8 ; bring the upper 8 data bytes down (zero fill from the top)
+ vpshufd xmm13, xmm8, 0xdc
+ vmovdqa xmm14, xmm13
+
+ ;; - clmul
+ ;; - xor the results from 4 32-bit words together
+ vpclmulqdq xmm3, xmm0, 0x00 ; low qword x low qword
+ vpclmulqdq xmm4, xmm0, 0x11 ; high qword x high qword
+ vpclmulqdq xmm13, xmm1, 0x00
+ vpclmulqdq xmm14, xmm1, 0x11
+
+ vpxor xmm3, xmm4
+ vpxor xmm13, xmm14
+ vpxor xmm9, xmm3
+ vpxor xmm9, xmm13
+ lea DATA, [DATA + 16] ; advance past the 16 data bytes just consumed
+ lea KS, [KS + 16] ; advance the keystream window by four 32-bit words
+ sub N_BITS, 128
+%endrep
+Eia3RoundsAVX_dq_end:
+ ;; Up to three unrolled rounds, each consuming 32 data bits while N_BITS >= 32
+%rep 3
+ cmp N_BITS, 32
+ jb Eia3RoundsAVX_dw_end
+
+ ;; swap dwords in KS
+ vmovq xmm1, [KS]
+ vpshufd xmm4, xmm1, 0xf1 ; dwords {1,0,3,3}: low qword = KS[0] in the upper half, KS[1] in the lower half
+
+ ;; bit-reverse 4 bytes of data
+ vmovdqa xmm2, xmm7
+ vmovd xmm0, [DATA]
+ vmovdqa xmm1, xmm0
+ vpand xmm1, xmm2
+
+ vpandn xmm2, xmm0
+ vpsrld xmm2, 4
+
+ vmovdqa xmm0, xmm6 ; bit reverse low nibbles (use high table)
+ vpshufb xmm0, xmm1
+
+ vmovdqa xmm3, xmm5 ; bit reverse high nibbles (use low table)
+ vpshufb xmm3, xmm2
+
+ vpor xmm0, xmm3
+
+ ;; rol & xor
+ vpclmulqdq xmm0, xmm4, 0
+ vpxor xmm9, xmm0
+
+ lea DATA, [DATA + 4]
+ lea KS, [KS + 4]
+ sub N_BITS, 32
+%endrep
+
+Eia3RoundsAVX_dw_end:
+ vmovq rax, xmm9
+ shr rax, 32 ; rax = bits 63:32 of the accumulated clmul result
+
+ or N_BITS, N_BITS ; sets ZF when no data bits remain
+ jz Eia3RoundsAVX_byte_loop_end
+
+ ;; get 64-bit key stream for the last data bits (less than 32)
+ mov KS, [KS] ; KS now holds keystream bits, no longer a pointer
+
+ ;; process remaining data bytes and bits
+Eia3RoundsAVX_byte_loop:
+ or N_BITS, N_BITS
+ jz Eia3RoundsAVX_byte_loop_end
+
+ cmp N_BITS, 8
+ jb Eia3RoundsAVX_byte_partial
+
+ movzx r11, byte [DATA] ; full byte: all 8 bits are message bits
+ sub N_BITS, 8
+ jmp Eia3RoundsAVX_byte_read
+
+Eia3RoundsAVX_byte_partial:
+ ;; process remaining bits (up to 7)
+ lea r11, [bit_mask_table]
+ movzx r10, byte [r11 + N_BITS] ; mask selected by the number of remaining bits
+ movzx r11, byte [DATA]
+ and r11, r10 ; drop the bits beyond the end of the message
+ xor N_BITS, N_BITS ; nothing left after this byte
+Eia3RoundsAVX_byte_read:
+ ;; For each data bit, MSB first: XOR KS into rax when the bit is set, then rotate KS left by one
+%assign DATATEST 0x80
+%rep 8
+ xor r10, r10
+ test r11, DATATEST
+ cmovne r10, KS ; r10 = KS if the tested data bit is 1, else 0 (branch-free)
+ xor rax, r10
+ rol KS, 1
+%assign DATATEST (DATATEST >> 1)
+%endrep ; byte boundary
+ lea DATA, [DATA + 1]
+ jmp Eia3RoundsAVX_byte_loop
+
+Eia3RoundsAVX_byte_loop_end:
+
+ ;; eax - holds the return value at this stage
+ FUNC_RESTORE
+
+ ret
+
+;;
+;;extern uint32_t asm_Eia3Round64BAVX(uint32_t T, const void *KS, const void *DATA)
+;;
+;; Updates authentication tag T based on keystream KS and DATA.
+;; - it processes 64 bytes of DATA
+;; - reads data in 16 byte chunks and bit reverses them
+;; - reads and re-arranges KS
+;; - employs clmul for the XOR & ROL part
+;; - the caller copies the top 64 bytes of KS to the bottom (for the next round)
+;;
+;; WIN64
+;; RCX - T
+;; RDX - KS pointer to key stream (2 x 64 bytes)
+;; R8 - DATA pointer to data
+;; LIN64
+;; RDI - T
+;; RSI - KS pointer to key stream (2 x 64 bytes)
+;; RDX - DATA pointer to data
+;;
+align 64
+MKGLOBAL(asm_Eia3Round64BAVX,function,internal)
+asm_Eia3Round64BAVX:
+ ;; Argument-to-register aliases for the active calling convention
+%ifdef LINUX
+ %define T edi
+ %define KS rsi
+ %define DATA rdx
+%else
+ %define T ecx
+ %define KS rdx
+ %define DATA r8
+%endif
+
+ FUNC_SAVE ; NOTE(review): macro defined outside this view - presumably preserves callee-saved registers; confirm
+
+ vmovdqa xmm5, [bit_reverse_table_l] ; lookup table applied to the high nibbles below
+ vmovdqa xmm6, [bit_reverse_table_h] ; lookup table applied to the low nibbles below
+ vmovdqa xmm7, [bit_reverse_and_table] ; mask separating low nibbles (vpand) from high nibbles (vpandn)
+ vmovdqa xmm10, [data_mask_64bits] ; mask applied to the reversed data (presumably keeps its low 64 bits - table not visible here)
+
+ vpxor xmm9, xmm9 ; xmm9 = XOR accumulator (the I == 0 round below overwrites it directly)
+%assign I 0
+%rep 4
+ ;; read 16 bytes and reverse bits
+ vmovdqu xmm0, [DATA + 16*I]
+ vpand xmm1, xmm0, xmm7 ; xmm1 = data & mask (low nibbles)
+
+ vpandn xmm2, xmm7, xmm0 ; xmm2 = data & ~mask (high nibbles)
+ vpsrld xmm2, 4 ; move the high nibbles down into index position
+
+ vpshufb xmm8, xmm6, xmm1 ; bit reverse low nibbles (use high table)
+ vpshufb xmm4, xmm5, xmm2 ; bit reverse high nibbles (use low table)
+
+ vpor xmm8, xmm4
+ ; xmm8 - bit reversed data bytes
+
+ ;; ZUC authentication part
+ ;; - 4x32 data bits
+ ;; - set up KS
+%if I != 0
+ vmovdqa xmm11, xmm12 ; reuse the 16 KS bytes loaded as the upper half in the previous iteration
+ vmovdqu xmm12, [KS + (I*16) + (4*4)]
+%else
+ vmovdqu xmm11, [KS + (I*16) + (0*4)]
+ vmovdqu xmm12, [KS + (I*16) + (4*4)]
+%endif
+ vpalignr xmm13, xmm12, xmm11, 8 ; xmm13 = high qword of xmm11 followed by low qword of xmm12
+ vpshufd xmm2, xmm11, 0x61 ; dwords {1,0,2,1}: each qword = two adjacent KS words, earlier word in the upper half
+ vpshufd xmm3, xmm13, 0x61
+
+ ;; - set up DATA
+ vpand xmm13, xmm10, xmm8
+ vpshufd xmm0, xmm13, 0xdc ; dwords {0,3,1,3}: one data dword in the low half of each qword
+
+ vpsrldq xmm8, 8 ; bring the upper 8 data bytes down (zero fill from the top)
+ vpshufd xmm1, xmm8, 0xdc
+
+ ;; - clmul
+ ;; - xor the results from 4 32-bit words together
+%if I != 0
+ vpclmulqdq xmm13, xmm0, xmm2, 0x00
+ vpclmulqdq xmm14, xmm0, xmm2, 0x11
+ vpclmulqdq xmm15, xmm1, xmm3, 0x00
+ vpclmulqdq xmm8, xmm1, xmm3, 0x11
+
+ vpxor xmm13, xmm14
+ vpxor xmm15, xmm8
+ vpxor xmm9, xmm13
+ vpxor xmm9, xmm15
+%else
+ vpclmulqdq xmm9, xmm0, xmm2, 0x00 ; first round writes the accumulator directly instead of XORing into it
+ vpclmulqdq xmm13, xmm0, xmm2, 0x11
+ vpclmulqdq xmm14, xmm1, xmm3, 0x00
+ vpclmulqdq xmm15, xmm1, xmm3, 0x11
+
+ vpxor xmm14, xmm15
+ vpxor xmm9, xmm13
+ vpxor xmm9, xmm14
+%endif
+
+
+%assign I (I + 1)
+%endrep
+
+ ;; - update T
+ vmovq rax, xmm9
+ shr rax, 32 ; keep bits 63:32 of the accumulator
+ xor eax, T ; fold into the incoming tag; the updated tag is returned in eax
+
+ FUNC_RESTORE
+
+ ret
+
+
+;----------------------------------------------------------------------------------------
+;----------------------------------------------------------------------------------------
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/zuc_avx_top.c b/src/spdk/intel-ipsec-mb/avx/zuc_avx_top.c
new file mode 100755
index 000000000..b3ba2de81
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/zuc_avx_top.c
@@ -0,0 +1,548 @@
+/*******************************************************************************
+ Copyright (c) 2009-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/*-----------------------------------------------------------------------
+* zuc_avx_top.c
+*-----------------------------------------------------------------------
+* An implementation of ZUC, the core algorithm for the
+* 3GPP Confidentiality and Integrity algorithms.
+*
+*-----------------------------------------------------------------------*/
+
+#include <string.h>
+
+#include "include/zuc_internal.h"
+#include "include/wireless_common.h"
+#include "include/save_xmms.h"
+#include "include/clear_regs_mem.h"
+#include "intel-ipsec-mb.h"
+
+#define SAVE_XMMS save_xmms_avx
+#define RESTORE_XMMS restore_xmms_avx
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
+
+/*
+ * ZUC-EEA3 cipher over a single buffer.
+ *
+ * Initialises a ZUC state from pKey/pIv, then XORs the generated keystream
+ * into pBufferIn 64 bytes at a time, writing the result to pBufferOut.
+ * A trailing partial block (1..63 bytes) is processed through zero-padded
+ * 64-byte scratch buffers so that the 64-byte XOR primitive never touches
+ * memory beyond the caller's buffers.
+ *
+ * pKey       - cipher key, passed straight to asm_ZucInitialization()
+ * pIv        - initialisation vector, passed to asm_ZucInitialization()
+ * pBufferIn  - input data, 'length' bytes
+ * pBufferOut - output data, 'length' bytes
+ * length     - number of bytes to process
+ *
+ * No argument validation is done here; the public wrappers do that.
+ */
+static inline
+void _zuc_eea3_1_buffer_avx(const void *pKey,
+                            const void *pIv,
+                            const void *pBufferIn,
+                            void *pBufferOut,
+                            const uint32_t length)
+{
+        DECLARE_ALIGNED(ZucState_t zucState, 64);
+        /* buffer to store 64 bytes of keystream */
+        DECLARE_ALIGNED(uint8_t keyStream[64], 64);
+        /* scratch blocks for the trailing partial block */
+        DECLARE_ALIGNED(uint8_t tempSrc[64], 64);
+        DECLARE_ALIGNED(uint8_t tempDst[64], 64);
+
+        const uint64_t *pIn64 = NULL;
+        const uint8_t *pIn8 = NULL;
+        uint8_t *pOut8 = NULL;
+        uint64_t *pOut64 = NULL, *pKeyStream64 = NULL;
+        uint64_t *pTemp64 = NULL, *pdstTemp64 = NULL;
+
+        uint32_t numKeyStreamsPerPkt = length/ ZUC_KEYSTR_LEN;
+        uint32_t numBytesLeftOver = length % ZUC_KEYSTR_LEN;
+
+        /* need to set the LFSR state to zero */
+        memset(&zucState, 0, sizeof(ZucState_t));
+
+        /* initialize the zuc state */
+        asm_ZucInitialization(pKey, pIv, &(zucState));
+
+        /* Loop Over all the Quad-Words in input buffer and XOR with the 64bits
+         * of generated keystream */
+        pOut64 = (uint64_t *) pBufferOut;
+        pIn64 = (const uint64_t *) pBufferIn;
+
+        while (numKeyStreamsPerPkt--) {
+                /* Generate the key stream 64 bytes at a time */
+                asm_ZucGenKeystream64B((uint32_t *) &keyStream[0], &zucState);
+
+                /* XOR The Keystream generated with the input buffer here */
+                pKeyStream64 = (uint64_t *) keyStream;
+                asm_XorKeyStream64B_avx(pIn64, pOut64, pKeyStream64);
+                pIn64 += 8;
+                pOut64 += 8;
+        }
+
+        /* Check for remaining 0 to 63 bytes */
+        pIn8 = (const uint8_t *) pBufferIn;
+        pOut8 = (uint8_t *) pBufferOut;
+        if(numBytesLeftOver) {
+                asm_ZucGenKeystream64B((uint32_t *) &keyStream[0], &zucState);
+
+                /* copy the remaining bytes into temporary buffer and XOR with
+                 * the 64-bytes of keystream. Then copy on the valid bytes back
+                 * to the output buffer */
+
+                memcpy(&tempSrc[0], &pIn8[length - numBytesLeftOver],
+                       numBytesLeftOver);
+                /* zero-pad so the 64-byte XOR never reads uninitialised
+                 * stack (same as the 4-buffer implementation) */
+                memset(&tempSrc[numBytesLeftOver], 0,
+                       64 - numBytesLeftOver);
+                pKeyStream64 = (uint64_t *) &keyStream[0];
+                pTemp64 = (uint64_t *) &tempSrc[0];
+                pdstTemp64 = (uint64_t *) &tempDst[0];
+
+                asm_XorKeyStream64B_avx(pTemp64, pdstTemp64, pKeyStream64);
+                memcpy(&pOut8[length - numBytesLeftOver], &tempDst[0],
+                       numBytesLeftOver);
+
+        }
+#ifdef SAFE_DATA
+        /* Clear sensitive data in stack. The scratch blocks are included:
+         * tempSrc holds the message tail and tempDst holds keystream XORed
+         * with the zero padding, i.e. raw keystream bytes. */
+        clear_mem(keyStream, sizeof(keyStream));
+        clear_mem(tempSrc, sizeof(tempSrc));
+        clear_mem(tempDst, sizeof(tempDst));
+        clear_mem(&zucState, sizeof(zucState));
+#endif
+}
+
+/*
+ * ZUC-EEA3 cipher over four buffers in parallel.
+ *
+ * All four lanes are processed together, 64 bytes per lane per iteration,
+ * for as many whole 64-byte blocks as the shortest buffer contains. The
+ * remainder of every buffer is then finished one lane at a time by copying
+ * that lane's state out of the 4-lane state into a single-packet state.
+ *
+ * pKey, pIv             - per-lane key / IV pointers
+ * pBufferIn, pBufferOut - per-lane input / output buffers
+ * length                - per-lane length in bytes
+ *
+ * No argument validation is done here; the public wrappers do that.
+ */
+static inline
+void _zuc_eea3_4_buffer_avx(const void * const pKey[4],
+                            const void * const pIv[4],
+                            const void * const pBufferIn[4],
+                            void *pBufferOut[4],
+                            const uint32_t length[4])
+{
+        DECLARE_ALIGNED(ZucState4_t state, 64);
+        DECLARE_ALIGNED(ZucState_t singlePktState, 64);
+        unsigned int i = 0;
+        unsigned int j = 0;
+        /* Calculate the minimum input packet size */
+        uint32_t bytes1 = (length[0] < length[1] ?
+                           length[0] : length[1]);
+        uint32_t bytes2 = (length[2] < length[3] ?
+                           length[2] : length[3]);
+        /* min number of bytes */
+        uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2;
+        uint32_t numKeyStreamsPerPkt = bytes/ZUC_KEYSTR_LEN;
+        uint32_t remainBytes[4] = {0};
+        DECLARE_ALIGNED(uint8_t keyStr1[64], 64);
+        DECLARE_ALIGNED(uint8_t keyStr2[64], 64);
+        DECLARE_ALIGNED(uint8_t keyStr3[64], 64);
+        DECLARE_ALIGNED(uint8_t keyStr4[64], 64);
+        /* scratch blocks for a trailing partial block */
+        DECLARE_ALIGNED(uint8_t tempSrc[64], 64);
+        DECLARE_ALIGNED(uint8_t tempDst[64], 64);
+        /* structure to store the 4 keys */
+        DECLARE_ALIGNED(ZucKey4_t keys, 64);
+        /* structure to store the 4 IV's */
+        DECLARE_ALIGNED(ZucIv4_t ivs, 64);
+        uint32_t numBytesLeftOver = 0;
+        const uint8_t *pTempBufInPtr = NULL;
+        uint8_t *pTempBufOutPtr = NULL;
+
+        const uint64_t *pIn64_0 = NULL;
+        const uint64_t *pIn64_1 = NULL;
+        const uint64_t *pIn64_2 = NULL;
+        const uint64_t *pIn64_3 = NULL;
+        uint64_t *pOut64_0 = NULL;
+        uint64_t *pOut64_1 = NULL;
+        uint64_t *pOut64_2 = NULL;
+        uint64_t *pOut64_3 = NULL;
+        uint64_t *pTempSrc64 = NULL;
+        uint64_t *pTempDst64 = NULL;
+        uint64_t *pKeyStream64 = NULL;
+
+        /* rounded down minimum length */
+        bytes = numKeyStreamsPerPkt * ZUC_KEYSTR_LEN;
+
+        /* Need to set the LFSR state to zero */
+        memset(&state, 0, sizeof(ZucState4_t));
+
+        /* Calculate the number of bytes left over for each packet */
+        for (i=0; i< 4; i++)
+                remainBytes[i] = length[i] - bytes;
+
+        /* Setup the Keys */
+        keys.pKey1 = pKey[0];
+        keys.pKey2 = pKey[1];
+        keys.pKey3 = pKey[2];
+        keys.pKey4 = pKey[3];
+
+        /* setup the IV's */
+        ivs.pIv1 = pIv[0];
+        ivs.pIv2 = pIv[1];
+        ivs.pIv3 = pIv[2];
+        ivs.pIv4 = pIv[3];
+
+        asm_ZucInitialization_4_avx( &keys, &ivs, &state);
+
+        pOut64_0 = (uint64_t *) pBufferOut[0];
+        pOut64_1 = (uint64_t *) pBufferOut[1];
+        pOut64_2 = (uint64_t *) pBufferOut[2];
+        pOut64_3 = (uint64_t *) pBufferOut[3];
+
+        pIn64_0 = (const uint64_t *) pBufferIn[0];
+        pIn64_1 = (const uint64_t *) pBufferIn[1];
+        pIn64_2 = (const uint64_t *) pBufferIn[2];
+        pIn64_3 = (const uint64_t *) pBufferIn[3];
+
+        /* Loop for 64 bytes at a time generating 4 key-streams per loop */
+        while (numKeyStreamsPerPkt) {
+                /* Generate 64 bytes at a time */
+                asm_ZucGenKeystream64B_4_avx(&state,
+                                             (uint32_t *) keyStr1,
+                                             (uint32_t *) keyStr2,
+                                             (uint32_t *) keyStr3,
+                                             (uint32_t *) keyStr4);
+
+                /* XOR the KeyStream with the input buffers and store in output
+                 * buffer*/
+                pKeyStream64 = (uint64_t *) keyStr1;
+                asm_XorKeyStream64B_avx(pIn64_0, pOut64_0, pKeyStream64);
+                pIn64_0 += 8;
+                pOut64_0 += 8;
+
+                pKeyStream64 = (uint64_t *) keyStr2;
+                asm_XorKeyStream64B_avx(pIn64_1, pOut64_1, pKeyStream64);
+                pIn64_1 += 8;
+                pOut64_1 += 8;
+
+                pKeyStream64 = (uint64_t *) keyStr3;
+                asm_XorKeyStream64B_avx(pIn64_2, pOut64_2, pKeyStream64);
+                pIn64_2 += 8;
+                pOut64_2 += 8;
+
+                pKeyStream64 = (uint64_t *) keyStr4;
+                asm_XorKeyStream64B_avx(pIn64_3, pOut64_3, pKeyStream64);
+                pIn64_3 += 8;
+                pOut64_3 += 8;
+
+                /* Update keystream count */
+                numKeyStreamsPerPkt--;
+
+        }
+
+        /* process each packet separately for the remaining bytes */
+        for (i = 0; i < 4; i++) {
+                if (remainBytes[i]) {
+                        /* need to copy the zuc state to single packet state:
+                         * lane i of each of the 16 LFSR words ... */
+                        for (j = 0; j < 16; j++)
+                                singlePktState.lfsrState[j] =
+                                        state.lfsrState[j][i];
+
+                        /* ... plus lane i of the R1/R2 and X0..X3 words */
+                        singlePktState.fR1 = state.fR1[i];
+                        singlePktState.fR2 = state.fR2[i];
+
+                        singlePktState.bX0 = state.bX0[i];
+                        singlePktState.bX1 = state.bX1[i];
+                        singlePktState.bX2 = state.bX2[i];
+                        singlePktState.bX3 = state.bX3[i];
+
+                        numKeyStreamsPerPkt = remainBytes[i] / ZUC_KEYSTR_LEN;
+                        numBytesLeftOver = remainBytes[i] % ZUC_KEYSTR_LEN;
+
+                        pTempBufInPtr = pBufferIn[i];
+                        pTempBufOutPtr = pBufferOut[i];
+
+                        /* update the output and input pointers here to point
+                         * to the i'th buffers */
+                        pOut64_0 = (uint64_t *) &pTempBufOutPtr[length[i] -
+                                                                remainBytes[i]];
+                        pIn64_0 = (const uint64_t *) &pTempBufInPtr[length[i] -
+                                                                    remainBytes[i]];
+
+                        while (numKeyStreamsPerPkt--) {
+                                /* Generate the key stream 64 bytes at a time */
+                                asm_ZucGenKeystream64B((uint32_t *) keyStr1,
+                                                       &singlePktState);
+                                pKeyStream64 = (uint64_t *) keyStr1;
+                                asm_XorKeyStream64B_avx(pIn64_0, pOut64_0,
+                                                        pKeyStream64);
+                                pIn64_0 += 8;
+                                pOut64_0 += 8;
+                        }
+
+
+                        /* Check for remaining 0 to 63 bytes */
+                        if (numBytesLeftOver) {
+                                asm_ZucGenKeystream64B((uint32_t *) keyStr1,
+                                                       &singlePktState);
+                                uint32_t offset = length[i] - numBytesLeftOver;
+
+                                /* copy the remaining bytes into temporary
+                                 * buffer and XOR with the 64-bytes of
+                                 * keystream. Then copy on the valid bytes back
+                                 * to the output buffer */
+                                memcpy(&tempSrc[0], &pTempBufInPtr[offset],
+                                       numBytesLeftOver);
+                                memset(&tempSrc[numBytesLeftOver], 0,
+                                       64 - numBytesLeftOver);
+
+                                pKeyStream64 = (uint64_t *) &keyStr1[0];
+                                pTempSrc64 = (uint64_t *) &tempSrc[0];
+                                pTempDst64 = (uint64_t *) &tempDst[0];
+                                asm_XorKeyStream64B_avx(pTempSrc64, pTempDst64,
+                                                        pKeyStream64);
+
+                                memcpy(&pTempBufOutPtr[offset],
+                                       &tempDst[0], numBytesLeftOver);
+                        }
+                }
+        }
+#ifdef SAFE_DATA
+        /* Clear sensitive data in stack. The scratch blocks are included:
+         * tempSrc holds a message tail and, because of the zero padding,
+         * the tail of tempDst holds raw keystream bytes. */
+        clear_mem(keyStr1, sizeof(keyStr1));
+        clear_mem(keyStr2, sizeof(keyStr2));
+        clear_mem(keyStr3, sizeof(keyStr3));
+        clear_mem(keyStr4, sizeof(keyStr4));
+        clear_mem(tempSrc, sizeof(tempSrc));
+        clear_mem(tempDst, sizeof(tempDst));
+        clear_mem(&singlePktState, sizeof(singlePktState));
+        clear_mem(&state, sizeof(state));
+        clear_mem(&keys, sizeof(keys));
+        clear_mem(&ivs, sizeof(ivs));
+#endif
+}
+
+/*
+ * Public single-buffer ZUC-EEA3 entry point.
+ *
+ * Optionally validates the arguments (SAFE_PARAM), runs the cipher and
+ * optionally scrubs scratch registers afterwards (SAFE_DATA). On non-Linux
+ * builds the XMM registers are saved on entry and restored on exit.
+ */
+void zuc_eea3_1_buffer_avx(const void *pKey,
+                           const void *pIv,
+                           const void *pBufferIn,
+                           void *pBufferOut,
+                           const uint32_t length)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+        /* Bail out on any NULL pointer or on a length outside the
+         * supported range */
+        if (!pKey || !pIv || !pBufferIn || !pBufferOut ||
+            length < ZUC_MIN_LEN || length > ZUC_MAX_LEN)
+                return;
+#endif
+        _zuc_eea3_1_buffer_avx(pKey, pIv, pBufferIn, pBufferOut, length);
+
+#ifdef SAFE_DATA
+        /* Scrub anything sensitive left behind in scratch registers */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif
+}
+
+/*
+ * Public four-buffer ZUC-EEA3 entry point.
+ *
+ * Optionally validates the argument arrays and each of their four entries
+ * (SAFE_PARAM), runs the 4-lane cipher and optionally scrubs scratch
+ * registers afterwards (SAFE_DATA). On non-Linux builds the XMM registers
+ * are saved on entry and restored on exit.
+ */
+void zuc_eea3_4_buffer_avx(const void * const pKey[4],
+                           const void * const pIv[4],
+                           const void * const pBufferIn[4],
+                           void *pBufferOut[4],
+                           const uint32_t length[4])
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif
+#ifdef SAFE_PARAM
+        unsigned int lane = 0;
+
+        /* The argument arrays themselves must be present */
+        if (!pKey || !pIv || !pBufferIn || !pBufferOut || !length)
+                return;
+
+        /* Every lane needs non-NULL pointers and a supported length */
+        while (lane < 4) {
+                if (!pKey[lane] || !pIv[lane] ||
+                    !pBufferIn[lane] || !pBufferOut[lane] ||
+                    length[lane] < ZUC_MIN_LEN || length[lane] > ZUC_MAX_LEN)
+                        return;
+                lane++;
+        }
+#endif
+
+        _zuc_eea3_4_buffer_avx(pKey, pIv, pBufferIn, pBufferOut, length);
+
+#ifdef SAFE_DATA
+        /* Scrub anything sensitive left behind in scratch registers */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif
+}
+
+/*
+ * Public N-buffer ZUC-EEA3 entry point.
+ *
+ * Buffers are consumed in order: groups of four go through the 4-lane
+ * implementation, and the final 0..3 buffers go through the single-buffer
+ * implementation. Argument validation (SAFE_PARAM) and register scrubbing
+ * (SAFE_DATA) are optional; on non-Linux builds the XMM registers are saved
+ * on entry and restored on exit.
+ */
+void zuc_eea3_n_buffer_avx(const void * const pKey[], const void * const pIv[],
+                           const void * const pBufferIn[], void *pBufferOut[],
+                           const uint32_t length[],
+                           const uint32_t numBuffers)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif
+
+        unsigned int idx;
+
+#ifdef SAFE_PARAM
+        /* The argument arrays themselves must be present */
+        if (!pKey || !pIv || !pBufferIn || !pBufferOut || !length)
+                return;
+
+        /* Every buffer needs non-NULL pointers and a supported length */
+        for (idx = 0; idx < numBuffers; idx++) {
+                if (!pKey[idx] || !pIv[idx] ||
+                    !pBufferIn[idx] || !pBufferOut[idx] ||
+                    length[idx] < ZUC_MIN_LEN || length[idx] > ZUC_MAX_LEN)
+                        return;
+        }
+#endif
+
+        /* Whole groups of four buffers: 4-lane implementation */
+        for (idx = 0; numBuffers - idx >= 4; idx += 4)
+                _zuc_eea3_4_buffer_avx(&pKey[idx],
+                                       &pIv[idx],
+                                       &pBufferIn[idx],
+                                       &pBufferOut[idx],
+                                       &length[idx]);
+
+        /* Whatever is left (0 to 3 buffers): one at a time */
+        for (; idx < numBuffers; idx++)
+                _zuc_eea3_1_buffer_avx(pKey[idx],
+                                       pIv[idx],
+                                       pBufferIn[idx],
+                                       pBufferOut[idx],
+                                       length[idx]);
+
+#ifdef SAFE_DATA
+        /* Scrub anything sensitive left behind in scratch registers */
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif
+}
+
+/*
+ * Rotate the 64-bit value u left by r bit positions.
+ *
+ * The count is reduced modulo 64 and the complementary right-shift count is
+ * masked as well, so r == 0 (or any multiple of 64) returns u unchanged
+ * instead of evaluating u >> 64, which is undefined behaviour in C.
+ * Results for r in 1..63 are identical to the plain shift/or formulation.
+ */
+static inline uint64_t rotate_left(uint64_t u, size_t r)
+{
+        r &= 63;
+        return (u << r) | (u >> ((64 - r) & 63));
+}
+
+/*
+ * Read 8 bytes from ptr as a uint64_t in host byte order.
+ *
+ * memcpy() is used rather than dereferencing a cast pointer: the caller in
+ * this file passes the address of a uint32_t array element, which is only
+ * guaranteed to be 4-byte aligned and is not a uint64_t object, so a direct
+ * 64-bit dereference would be a misaligned / aliasing access.
+ */
+static inline uint64_t load_uint64(const void *ptr)
+{
+        uint64_t value;
+
+        memcpy(&value, ptr, sizeof(value));
+        return value;
+}
+
+/*
+ * Public single-buffer ZUC-EIA3 (integrity) entry point.
+ *
+ * Computes the 32-bit MAC over lengthInBits bits of pBufferIn and stores it,
+ * byte-swapped via bswap4(), in *pMacI. The message is consumed in 64-byte
+ * blocks while at least 512 bits remain; the rest goes through the remainder
+ * routine. keyStream holds two 64-byte keystream blocks: the lower one is
+ * the block currently in use, the upper one is the block generated next.
+ */
+void zuc_eia3_1_buffer_avx(const void *pKey,
+                           const void *pIv,
+                           const void *pBufferIn,
+                           const uint32_t lengthInBits,
+                           uint32_t *pMacI)
+{
+#ifndef LINUX
+        DECLARE_ALIGNED(uint128_t xmm_save[10], 16);
+
+        SAVE_XMMS(xmm_save);
+#endif
+        DECLARE_ALIGNED(ZucState_t zucState, 64);
+        DECLARE_ALIGNED(uint32_t keyStream[16 * 2], 64);
+        const uint32_t bitsPerBlock = ZUC_KEYSTR_LEN * 8;
+        /* the key-stream is 2 words longer than the input message */
+        const uint32_t ksBits = lengthInBits + (2 * ZUC_WORD);
+        uint32_t ksWords = (ksBits + 31) / ZUC_WORD;
+        uint32_t bitsLeft = lengthInBits;
+        uint32_t tag = 0;
+        const uint8_t *pData = (const uint8_t *) pBufferIn;
+
+#ifdef SAFE_PARAM
+        /* Bail out on any NULL pointer or on a length outside the
+         * supported range */
+        if (!pKey || !pIv || !pBufferIn || !pMacI ||
+            lengthInBits < ZUC_MIN_LEN || lengthInBits > ZUC_MAX_LEN)
+                return;
+#endif
+
+        memset(&zucState, 0, sizeof(ZucState_t));
+
+        asm_ZucInitialization(pKey, pIv, &(zucState));
+        asm_ZucGenKeystream64B(&keyStream[0], &zucState);
+
+        /* consume the message one full 64-byte block at a time */
+        for (; bitsLeft >= bitsPerBlock; pData += ZUC_KEYSTR_LEN) {
+                bitsLeft -= bitsPerBlock;
+                ksWords -= (bitsPerBlock / 32);
+                /* next key stream: a full 64 bytes if more message
+                 * follows, otherwise only the final 8 bytes */
+                if (bitsLeft != 0)
+                        asm_ZucGenKeystream64B(&keyStream[16], &zucState);
+                else
+                        asm_ZucGenKeystream8B(&keyStream[16], &zucState);
+                tag = asm_Eia3Round64BAVX(tag, &keyStream[0], pData);
+                /* slide the upper block down for the next round */
+                memcpy(&keyStream[0], &keyStream[16], 16 * sizeof(uint32_t));
+        }
+
+        /*
+         * With more than 14 ZUC words (double words) still to process,
+         * up to another 2 ZUC words (8B) of keystream are required
+         */
+        if (bitsLeft > (14 * 32))
+                asm_ZucGenKeystream8B(&keyStream[16], &zucState);
+        tag ^= asm_Eia3RemainderAVX(&keyStream[0], pData, bitsLeft);
+        tag ^= rotate_left(load_uint64(&keyStream[bitsLeft / 32]),
+                           bitsLeft % 32);
+
+        /* fold in the last key stream word and store the MAC-I */
+        uint32_t lastWord = keyStream[ksWords - 1];
+        *pMacI = bswap4(tag ^ lastWord);
+
+#ifdef SAFE_DATA
+        /* Scrub sensitive data from the stack and scratch registers */
+        clear_mem(keyStream, sizeof(keyStream));
+        clear_mem(&zucState, sizeof(zucState));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+#ifndef LINUX
+        RESTORE_XMMS(xmm_save);
+#endif
+}
diff --git a/src/spdk/intel-ipsec-mb/avx2/gcm128_avx_gen4.asm b/src/spdk/intel-ipsec-mb/avx2/gcm128_avx_gen4.asm
new file mode 100644
index 000000000..924602b63
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/gcm128_avx_gen4.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2017-2018, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%include "avx2/gcm_avx_gen4.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx2/gcm192_avx_gen4.asm b/src/spdk/intel-ipsec-mb/avx2/gcm192_avx_gen4.asm
new file mode 100644
index 000000000..7295d5b74
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/gcm192_avx_gen4.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2017-2018, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM192_MODE 1
+%include "avx2/gcm_avx_gen4.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx2/gcm256_avx_gen4.asm b/src/spdk/intel-ipsec-mb/avx2/gcm256_avx_gen4.asm
new file mode 100644
index 000000000..bf2a89cb9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/gcm256_avx_gen4.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2017-2018, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%include "avx2/gcm_avx_gen4.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx2/gcm_avx_gen4.asm b/src/spdk/intel-ipsec-mb/avx2/gcm_avx_gen4.asm
new file mode 100644
index 000000000..88697d9d1
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/gcm_avx_gen4.asm
@@ -0,0 +1,3641 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in the paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+; The details of the implementation are explained in:
+; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; Per the definition in the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any aadLen length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "include/clear_regs.asm"
+%include "include/gcm_defines.asm"
+%include "include/gcm_keys_avx2_avx512.asm"
+%include "include/memcpy.asm"
+
+%ifndef GCM128_MODE ; at least one GCMxxx_MODE must be defined before this file is included
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx_gen4.asm!"
+%endif
+%endif
+%endif
+; NROUNDS counts the vaesenc rounds issued between the round-0 key XOR and the final vaesenclast.
+;; Decide on AES-GCM key size to compile for; FN_NAME(x,y) pastes aes_gcm_<x>_<bits><y>avx_gen4.
+%ifdef GCM128_MODE
+%define NROUNDS 9 ; AES-128
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4
+%endif
+
+%ifdef GCM192_MODE
+%define NROUNDS 11 ; AES-192
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4
+%endif
+
+%ifdef GCM256_MODE
+%define NROUNDS 13 ; AES-256
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4
+%endif
+
+section .text
+default rel
+
+; Size of 4 pushed 64-bit registers (8 bytes each); the pushes themselves are not in this part of the file.
+%define STACK_OFFSET 8*4
+; TMP2..TMP8 are byte offsets from rsp into the local scratch area (used as [rsp + TMPn]).
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7 ; seven 16-byte slots, TMP2..TMP8
+; NOTE(review): 16*10 fits ten XMM registers - presumably the win64 callee-saved xmm6-xmm15; confirm at the save site.
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+; Sum of both storage areas. The expansion is not parenthesised, so use it only in additive expressions.
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: %%GH (A) and %%HK (B), 128 bits each, bit-reflected. %%HK is only read.
+; Output: %%GH = A*B*x mod poly (i.e. >>1). %%T1-%%T3 are clobbered; %%T4/%%T5 are never referenced.
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input:
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes, in/out: multiplicand, overwritten with the reduced product
+%define %%HK %2 ; 16 Bytes, in: hash key operand
+%define %%T1 %3 ; xmm temp: high 128 bits of the 256-bit product
+%define %%T2 %4 ; xmm temp: a0*b0, then reduction scratch
+%define %%T3 %5 ; xmm temp: middle terms, then the POLY2 constant
+%define %%T4 %6 ; not referenced in the macro body
+%define %%T5 %7 ; not referenced in the macro body
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; four 64x64 carry-less products (a = %%GH, b = %%HK; index 1 = high qword, 0 = low qword)
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxor %%GH, %%GH, %%T3 ; %%GH = a1*b0 + a0*b1 (middle terms)
+
+ ; split the middle terms across the high (%%T1) and low (%%GH) halves of the product
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxor %%T1, %%T1, %%T3
+ vpxor %%GH, %%GH, %%T2 ; <%%T1:%%GH> now holds the unreduced 256-bit product
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+%endmacro
+
+
+; PRECOMPUTE: starting from HK, repeatedly multiply by HK with GHASH_MUL and store the
+; results at [GDATA + HashKey_2] .. [GDATA + HashKey_8]. This macro body contains no
+; stores to Hashkey_i_k fields. Clobbers %%T1, %%T3, %%T4 and %%T5; HK is only read.
+%macro PRECOMPUTE 8
+%define %%GDATA %1 ; base pointer for the HashKey_N stores
+%define %%HK %2 ; xmm: hash key operand handed to every GHASH_MUL
+%define %%T1 %3 ; xmm temp
+%define %%T2 %4 ; passed in a GHASH_MUL slot that GHASH_MUL never references
+%define %%T3 %5 ; xmm temp
+%define %%T4 %6 ; xmm temp
+%define %%T5 %7 ; xmm: running power of the hash key
+%define %%T6 %8 ; passed in a GHASH_MUL slot that GHASH_MUL never references
+
+ ; %%T5 starts as a copy of HK and is multiplied by HK once per stored power
+ vmovdqa %%T5, %%HK
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 (OUTPUT all zero, no memory access) if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH); the macro writes to neither.
+; Output: The packed xmm register (OUTPUT), bytes past LENGTH zeroed. The three temps and flags are clobbered.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2 ; address of the first input byte
+%define %%LENGTH %3 ; number of valid bytes at INPUT
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5 ; bytes still to be read by the byte loops
+%define %%TMP1 %6 ; collects the trailing bytes, earliest byte in the lowest position
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH ; one past the last valid byte; the loops walk backwards from here
+ xor %%TMP1, %%TMP1
+
+ ;; vpinsrq does not alter the flags, so the je after it still tests this cmp (exactly 8 bytes).
+ cmp %%COUNTER, 8
+ jl %%_short_input
+ vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_short_input: ;Fewer than 8 bytes and nothing read yet
+ ;; Zero length: keep the all-zero OUTPUT instead of stepping back to the byte at INPUT-1.
+ test %%COUNTER, %%COUNTER
+ jz %%_done
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the key data holding HashKey .. HashKey_8 (GDATA_KEY).
+; Output: The hash of the data (AAD_HASH). All XTMP and T arguments are scratch.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 15
+%define %%A_IN %1 ; in: pointer to the AAD bytes
+%define %%A_LEN %2 ; in: AAD length in bytes
+%define %%AAD_HASH %3 ; out: xmm register receiving the hash (zeroed first)
+%define %%GDATA_KEY %4 ; in: base for the HashKey / HashKey_N offsets
+%define %%XTMP0 %5 ; xmm temp: current data block (hash key in the <16-byte tail path)
+%define %%XTMP1 %6 ; xmm temp: accumulated a1*b1 products (high half)
+%define %%XTMP2 %7 ; xmm temp: accumulated a0*b0 products (low half)
+%define %%XTMP3 %8 ; xmm temp: accumulated a1*b0 + a0*b1 products (middle)
+%define %%XTMP4 %9 ; xmm temp: scratch product
+%define %%XTMP5 %10 ; xmm temp: current hash key power, then POLY2
+%define %%T1 %11 ; GPR temp: read pointer into the AAD
+%define %%T2 %12 ; GPR temp: AAD bytes still to hash
+%define %%T3 %13 ; GPR temp: pointer to the current hash key power; scratch in the tail path
+%define %%T4 %14 ; GPR temp: scratch for READ_SMALL_DATA_INPUT
+%define %%T5 %15 ; GPR temp: scratch for READ_SMALL_DATA_INPUT
+ ;; A_LEN = 0 is not special-cased here: both length compares fall through to the
+ ;; READ_SMALL_DATA_INPUT path with LENGTH = 0, so the result relies on that macro's zero-length handling.
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+%%_get_AAD_loop128: ; hash 8 blocks (128 bytes) per iteration with a single reduction
+ cmp %%T2, 128
+ jl %%_exit_AAD_loop128
+
+ vmovdqu %%XTMP0, [%%T1 + 16*0]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vpxor %%XTMP0, %%AAD_HASH ; fold the running hash into the first block of the chunk
+
+ vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
+ vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = a1*b1
+ vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = a0*b0
+ vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = a1*b0
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%XTMP4 = a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%XTMP3 = a1*b0 + a0*b1
+ ;; blocks 1..7 of the chunk pair with HashKey_7 .. HashKey_1 (i = block index, j = key power)
+%assign i 1
+%assign j 7
+%rep 7
+ vmovdqu %%XTMP0, [%%T1 + 16*i]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = %%XTMP1 + a1*b1
+ vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = %%XTMP2 + a0*b0
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = %%XTMP3 + a1*b0 + a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+%assign i (i + 1)
+%assign j (j - 1)
+%endrep
+
+ vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
+ vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+ vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%XTMP1(high):%%XTMP2(low)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%XTMP5, [rel POLY2]
+ vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+ vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L %%XTMP0 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+ vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+ vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%AAD_HASH
+
+ sub %%T2, 128
+ je %%_CALC_AAD_done ; nothing left after this chunk
+
+ add %%T1, 128
+ jmp %%_get_AAD_loop128
+
+%%_exit_AAD_loop128: ; fewer than 128 bytes remain
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+ ;; NOTE(review): the arithmetic below assumes the HashKey_N slots are 16 bytes apart with higher powers at lower offsets - the layout comes from an include.
+ ;; calculate hash_key position to start with
+ mov %%T3, %%T2
+ and %%T3, -16 ; 1 to 7 blocks possible here
+ neg %%T3
+ add %%T3, HashKey_1 + 16
+ lea %%T3, [%%GDATA_KEY + %%T3]
+
+ vmovdqu %%XTMP0, [%%T1]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vpxor %%XTMP0, %%AAD_HASH ; fold the running hash into the first remaining block
+
+ vmovdqu %%XTMP5, [%%T3]
+ vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = a1*b1
+ vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = a0*b0
+ vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = a1*b0
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%XTMP4 = a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%XTMP3 = a1*b0 + a0*b1
+
+ add %%T3, 16 ; move to next hashkey
+ add %%T1, 16 ; move to next data block
+ sub %%T2, 16
+ cmp %%T2, 16
+ jl %%_AAD_reduce
+
+%%_AAD_blocks: ; one more full block per iteration, products accumulated unreduced
+ vmovdqu %%XTMP0, [%%T1]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vmovdqu %%XTMP5, [%%T3]
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = %%XTMP1 + a1*b1
+ vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = %%XTMP2 + a0*b0
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = %%XTMP3 + a1*b0 + a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+
+ add %%T3, 16 ; move to next hashkey
+ add %%T1, 16
+ sub %%T2, 16
+ cmp %%T2, 16
+ jl %%_AAD_reduce
+ jmp %%_AAD_blocks
+
+%%_AAD_reduce:
+ vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
+ vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+ vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%XTMP1(high):%%XTMP2(low)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%XTMP5, [rel POLY2]
+ vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+ vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L %%XTMP0 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+ vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+ vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%AAD_HASH
+
+ or %%T2, %%T2 ; sets ZF when no tail bytes remain
+ je %%_CALC_AAD_done
+
+%%_get_small_AAD_block: ; fewer than 16 bytes remain
+ vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long. Does nothing when PBlockLen in GDATA_CTX is 0.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
+; the running hash register (AAD_HASH) and whether encoding or decoding (ENC_DEC)
+; Output: bytes for the pending partial block (CYPH_PLAIN_OUT), DATA_OFFSET advanced past them, updated AAD_HASH and GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1 ; in: base for the HashKey load
+%define %%GDATA_CTX %2 ; in/out: context holding PBlockLen, PBlockEncKey and AadHash
+%define %%CYPH_PLAIN_OUT %3 ; in: output buffer pointer
+%define %%PLAIN_CYPH_IN %4 ; in: input buffer pointer
+%define %%PLAIN_CYPH_LEN %5 ; in: input length in bytes
+%define %%DATA_OFFSET %6 ; in/out: offset into the buffers, advanced by the bytes written
+%define %%AAD_HASH %7 ; in/out: xmm register with the running hash
+%define %%ENC_DEC %8 ; DEC selects the decrypt path, anything else the encrypt path
+
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If 16 or more bytes of data, just fill the xmm register
+ jmp %%_data_read
+ ;; NOTE(review): the 16-byte load above ignores DATA_OFFSET while the short path below adds it - confirm callers enter with DATA_OFFSET = 0.
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [rel SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1 ; keep the ciphertext: it is what gets hashed when decrypting
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward by the shortfall
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [rel SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+ ;; block is now complete: run the multiply and clear the pending length
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1: ; block still not full: only grow PBlockLen, no multiply yet
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN ; presumably a memory operand in the win64 build (add mem, mem cannot be encoded) - confirm at the call site
+ add [%%GDATA_CTX + PBlockLen], rax
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_dec_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 is negative here, so this moves the mask pointer forward by the shortfall
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [rel SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+ ;; block is now complete: run the multiply and clear the pending length
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2: ; block still not full: only grow PBlockLen, no multiply yet
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN ; presumably a memory operand in the win64 build (add mem, mem cannot be encoded) - confirm at the call site
+ add [%%GDATA_CTX + PBlockLen], rax
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_encode_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [rel SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill: ; block not filled: every input byte is written out
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+ ;; more than 8 bytes: store the low qword at once, then continue with the high qword
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left: ; byte-wise store of the remaining r13 bytes (r13 >= 1 here)
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+; GHASH_SINGLE_MUL: multiply one block (CIPHER) by the key loaded from [GDATA + HASHKEY] and keep the
+; unreduced partial products in STATE_11 / STATE_00 / STATE_MID. No reduction is performed here.
+%macro GHASH_SINGLE_MUL 9
+%define %%GDATA %1 ; in: base pointer for the key load
+%define %%HASHKEY %2 ; in: offset added to GDATA
+%define %%CIPHER %3 ; in: xmm block to multiply (only read)
+%define %%STATE_11 %4 ; in/out: a1*b1 accumulator
+%define %%STATE_00 %5 ; in/out: a0*b0 accumulator
+%define %%STATE_MID %6 ; in/out: a1*b0 + a0*b1 accumulator
+%define %%T1 %7 ; xmm temp: the loaded key
+%define %%T2 %8 ; xmm temp: scratch product
+%define %%FIRST %9 ; the token first overwrites the accumulators; anything else XORs into them
+
+ vmovdqu %%T1, [%%GDATA + %%HASHKEY]
+%ifidn %%FIRST, first
+ vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%STATE_11 = a1*b1
+ vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%STATE_00 = a0*b0
+ vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%STATE_MID = a1*b0
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T2 = a0*b1
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%else
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
+ vpxor %%STATE_11, %%STATE_11, %%T2 ; %%STATE_11 += a1*b1
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
+ vpxor %%STATE_00, %%STATE_00, %%T2 ; %%STATE_00 += a0*b0
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
+ vpxor %%STATE_MID, %%STATE_MID, %%T2 ; %%STATE_MID += a1*b0
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
+ vpxor %%STATE_MID, %%STATE_MID, %%T2 ; %%STATE_MID += a0*b1
+%endif
+
+%endmacro
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; Updated AAD_HASH is returned in %%T3
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA_KEY %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%T2 %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ ;; Move AAD_HASH to temp reg
+ vmovdqu %%T2, %%XMM8
+ ;; Start AES for %%num_initial_blocks blocks
+ ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+%if(%%num_initial_blocks>0)
+vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%endif ; %if(%%num_initial_blocks>0)
+
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; Write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+%assign i (i+1)
+%endrep
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%assign i (9-%%num_initial_blocks)
+%if(%%num_initial_blocks>0)
+ vmovdqa %%T3, reg(i)
+%assign i (i+1)
+%endif
+%if(%%num_initial_blocks>1)
+%rep %%num_initial_blocks-1
+ vmovdqu [rsp + TMP %+ i], reg(i)
+%assign i (i+1)
+%endrep
+%endif
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Prepare 8 counter blocks and perform rounds of AES cipher on
+ ;; them, load plain/cipher text and store cipher/plain text.
+ ;; Stitch GHASH computation in between AES rounds.
+ vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR Y0
+ vpaddd %%XMM2, %%CTR, [rel TWO] ; INCR Y0
+ vpaddd %%XMM3, %%XMM1, [rel TWO] ; INCR Y0
+ vpaddd %%XMM4, %%XMM2, [rel TWO] ; INCR Y0
+ vpaddd %%XMM5, %%XMM3, [rel TWO] ; INCR Y0
+ vpaddd %%XMM6, %%XMM4, [rel TWO] ; INCR Y0
+ vpaddd %%XMM7, %%XMM5, [rel TWO] ; INCR Y0
+ vpaddd %%XMM8, %%XMM6, [rel TWO] ; INCR Y0
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+ vpxor %%XMM1, %%XMM1, %%T_key
+ vpxor %%XMM2, %%XMM2, %%T_key
+ vpxor %%XMM3, %%XMM3, %%T_key
+ vpxor %%XMM4, %%XMM4, %%T_key
+ vpxor %%XMM5, %%XMM5, %%T_key
+ vpxor %%XMM6, %%XMM6, %%T_key
+ vpxor %%XMM7, %%XMM7, %%T_key
+ vpxor %%XMM8, %%XMM8, %%T_key
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+
+%define %%T4_2 %%T4
+%if(%%num_initial_blocks>0)
+ ;; Hash in AES state
+ ;; T2 - incoming AAD hash
+ vpxor %%T2, %%T3
+
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*1]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*2]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>1)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*3]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*4]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>2)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>3)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*5]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*6]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>4)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*7]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*8]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>5)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*9]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%ifndef GCM128_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*10]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>6)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%ifdef GCM128_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*10]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif
+
+%ifdef GCM192_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*11]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*12]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif
+%ifdef GCM256_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*11]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*12]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>7)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%ifdef GCM256_MODE ; GCM256
+ vmovdqu %%T_key, [%%GDATA_KEY+16*13]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*14]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif ; GCM256 mode
+
+%if(%%num_initial_blocks>0)
+ vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
+ vpxor %%T4, %%T6, %%T4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; First phase of the reduction
+ vmovdqa %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ ;; First phase of the reduction complete
+ vpxor %%T4, %%T4, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T4, 0x00
+ ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+ vpsrldq %%T2, %%T2, 4
+
+ vpclmulqdq %%T4, %%T3, %%T4, 0x10
+ ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+ vpslldq %%T4, %%T4, 4
+ ;; Second phase of the reduction complete
+ vpxor %%T4, %%T4, %%T2
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; The result is in %%T3
+ vpxor %%T3, %%T1, %%T4
+%else
+ ;; The hash should end up in T3
+ vmovdqa %%T3, %%T2
+%endif
+
+ ;; Final hash is now in T3
+%if %%num_initial_blocks > 0
+ ;; NOTE: obsolete in case %%num_initial_blocks = 0
+ sub %%LENGTH, 16*%%num_initial_blocks
+%endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+%if %%num_initial_blocks > 0
+ ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0
+ ;; This macro is executed for length 128 and up,
+ ;; zero length is checked in GCM_ENC_DEC.
+ ;; If the last block is partial then the xor will be done later
+ ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
+ ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128
+ cmp %%LENGTH, 128
+ jl %%_initial_skip_last_word_write
+%endif
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ ;; Update %%LENGTH with the number of blocks processed
+ sub %%LENGTH, 16
+ add %%DATA_OFFSET, 16
+%%_initial_skip_last_word_write:
+ sub %%LENGTH, 128-16
+ add %%DATA_OFFSET, 128-16
+
+ vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
+ ;; Combine GHASHed value with the corresponding ciphertext
+ vpxor %%XMM1, %%XMM1, %%T3
+ vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+;;; INITIAL_BLOCKS_PARTIAL: INITIAL_BLOCKS variant with support for a partial final block.
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count. In: hash in %%XMM8, counter in %%CTR. Out: updated hash in %%T3. Also writes r12 and the PBlockLen / PBlockEncKey context fields.
+%macro INITIAL_BLOCKS_PARTIAL 25
+%define %%GDATA_KEY %1 ; base for the AES round keys (16*n) and the HashKey_k entries
+%define %%GDATA_CTX %2 ; context base; PBlockLen and PBlockEncKey are stored through it
+%define %%CYPH_PLAIN_OUT %3 ; output buffer base
+%define %%PLAIN_CYPH_IN %4 ; input buffer base
+%define %%LENGTH %5 ; remaining byte count; decremented in place
+%define %%DATA_OFFSET %6 ; byte offset into the in/out buffers; advanced in place
+%define %%num_initial_blocks %7 ; original note: can be 1, 2, 3, 4, 5, 6 or 7 (not 0); the body also has explicit handling for 8
+%define %%T1 %8 ; scratch; accumulates the high (0x11) GHASH products
+%define %%T2 %9 ; scratch; holds the incoming hash copied from %%XMM8
+%define %%T3 %10 ; scratch; receives the final hash
+%define %%T4 %11 ; scratch; accumulates the low (0x00) GHASH products
+%define %%T5 %12 ; scratch; holds the current hash key
+%define %%CTR %13 ; counter block; incremented by ONE for each block
+%define %%XMM1 %14 ; %%XMM1..%%XMM7 are not referenced by name in this macro; blocks are
+%define %%XMM2 %15 ; addressed as reg(i) instead.
+%define %%XMM3 %16 ; NOTE(review): presumably reg(1)..reg(8) are the same registers as
+%define %%XMM4 %17 ; %%XMM1..%%XMM8 - confirm against the reg() definition and the callers.
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21 ; in: incoming hash (copied to %%T2 on entry)
+%define %%T6 %22 ; scratch; accumulates the middle (0x01 / 0x10) GHASH products
+%define %%T_key %23 ; scratch; holds the current AES round key
+%define %%ENC_DEC %24 ; compared against DEC: when DEC the input block is kept in reg(i) for hashing
+%define %%INSTANCE_TYPE %25 ; compared against multi_call to select how a partial final block is hashed
+
+%assign i (8-%%num_initial_blocks) ; NOTE: overwritten by the %assign below before i is used
+ ;; Move AAD_HASH to temp reg
+ vmovdqu %%T2, %%XMM8
+ ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ ;; Compute AES counters; blocks occupy reg(9-num_initial_blocks) .. reg(8)
+ vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ ; Start AES for %%num_initial_blocks blocks (XOR with round key 0)
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; XOR, store and byte-swap all but the last block (hashing itself happens further down)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks-1
+ ;; Encrypt the message for all but the last block
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; write back the output for this block
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+%assign i (i+1)
+%endrep
+
+ ;; The final block of data may be <16B
+ sub %%LENGTH, 16*(%%num_initial_blocks-1)
+
+%if %%num_initial_blocks < 8
+ ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8.
+ ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128.
+ cmp %%LENGTH, 16
+ jl %%_small_initial_partial_block
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle a full length final block - encrypt and hash all blocks
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ sub %%LENGTH, 16
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH ; NOTE(review): presumably 0 here, since the count covers every block - confirm against caller
+
+ ;; Encrypt the message
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; write back the output for the last block
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+
+ ;; Hash all of the data
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+
+%if(%%num_initial_blocks>last_block_to_hash)
+ ;; Hash in AES state
+ vpxor %%T2, reg(j)
+
+ ;; T2 - incoming AAD hash, now XORed with the first block
+ ;; reg(j) holds the first byte-swapped ciphertext block
+ ;; T5 - hash key
+ ;; T6 - updated xor
+ ;; reg(1)/xmm1 should now be available for tmp use
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%assign rep_count (%%num_initial_blocks-1)
+%rep rep_count
+
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T3, reg(j), %%T5, 0x11
+ vpxor %%T1, %%T1, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x00
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%endrep
+
+ ;; Record that a reduction is needed (r12 is used directly, it is not a macro argument)
+ mov r12, 1
+
+ jmp %%_small_initial_compute_hash
+
+
+%endif ; %if %%num_initial_blocks < 8
+
+%%_small_initial_partial_block:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle ghash for a <16B final block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; In this case if it's a single call to encrypt we can
+ ;; hash all of the data but if it's an init / update / finalize
+ ;; series of call we need to leave the last block if it's
+ ;; less than a full block of data.
+
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i)
+ ;; Handle a partial final block
+ ;; GDATA, KEY, T1, T2
+ ;; r13 - length
+ ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long
+ ;; NOTE: could be replaced with %%LENGTH but at this point
+ ;; %%LENGTH is always less than 16.
+ ;; No PLAIN_CYPH_LEN argument available in this macro.
+ ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET
+ vpshufb reg(i), [rel SHUF_MASK]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks-1)
+%assign last_block_to_hash 1
+%else
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+%endif
+
+%if(%%num_initial_blocks>last_block_to_hash)
+ ;; Record that a reduction is needed
+ mov r12, 1
+ ;; Hash in AES state
+ vpxor %%T2, reg(j)
+
+ ;; T2 - incoming AAD hash, now XORed with the first block
+ ;; reg(j) holds the first byte-swapped ciphertext block
+ ;; T5 - hash key
+ ;; T6 - updated xor
+ ;; reg(1)/xmm1 should now be available for tmp use
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+%else
+ ;; Record that a reduction is not needed -
+ ;; In this case no hashes are computed because there
+ ;; is only one initial block and it is < 16B in length.
+ mov r12, 0
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign rep_count (%%num_initial_blocks-2)
+%%_multi_call_hash: ; NOTE: this label is not referenced inside this macro
+%else
+%assign rep_count (%%num_initial_blocks-1)
+%endif
+
+%if rep_count < 0
+ ;; quick fix for negative rep_count (to be investigated); reached for multi_call with num_initial_blocks = 1
+%assign rep_count 0
+%endif
+
+%rep rep_count
+
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T3, reg(j), %%T5, 0x11
+ vpxor %%T1, %%T1, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x00
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%endrep
+
+%%_small_initial_compute_hash:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Ghash reduction
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if(%%num_initial_blocks=1)
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; We only need to check if a reduction is needed if
+ ;; initial_blocks == 1 and init/update/final is being used.
+ ;; In this case we may just have a partial block, and that
+ ;; gets hashed in finalize.
+ cmp r12, 0
+ je %%_no_reduction_needed
+%endif
+%endif
+
+ vpsrldq %%T3, %%T6, 8 ; %%T3 = %%T6 shifted right by 8 bytes (2 DWs)
+ vpslldq %%T6, %%T6, 8 ; %%T6 = %%T6 shifted left by 8 bytes (2 DWs)
+ vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
+ vpxor %%T4, %%T6, %%T4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; First phase of the reduction
+ vmovdqa %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x01
+ ;; shift-L %%T2 2 DWs
+ vpslldq %%T2, %%T2, 8
+ vpxor %%T4, %%T4, %%T2
+
+ ;; First phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Second phase of the reduction
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x00
+ ;; Shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+ vpsrldq %%T2, %%T2, 4
+
+ vpclmulqdq %%T4, %%T3, %%T4, 0x10
+ ;; Shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+ vpslldq %%T4, %%T4, 4
+
+ vpxor %%T4, %%T4, %%T2
+ ;; Second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T3, %%T1, %%T4
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If using init/update/finalize, we need to xor any partial block data
+ ;; into the hash.
+%if %%num_initial_blocks > 1
+ ;; NOTE: for %%num_initial_blocks <= 1 this xor is not emitted (for 1 block see %%_no_reduction_needed below)
+%if %%num_initial_blocks != 8
+ ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero
+ cmp qword [%%GDATA_CTX + PBlockLen], 0
+ je %%_no_partial_block_xor
+%endif ; %%num_initial_blocks != 8
+ vpxor %%T3, %%T3, reg(8)
+%%_no_partial_block_xor:
+%endif ; %%num_initial_blocks > 1
+%endif ; %%INSTANCE_TYPE, multi_call
+
+%if(%%num_initial_blocks=1)
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%_no_reduction_needed case only valid for
+ ;; multi_call with initial_blocks = 1.
+ ;; Look for comment above around '_no_reduction_needed'
+ ;; NOTE(review): the original note called the jmp below obsolete, but as written it skips the vpxor at %%_no_reduction_needed, which would otherwise overwrite %%T3.
+
+ ;; The result is in %%T3
+ jmp %%_after_reduction
+
+%%_no_reduction_needed:
+ ;; The hash should end up in T3. The only way we should get here is if
+ ;; there is a partial block of data, so xor that into the hash.
+ vpxor %%T3, %%T2, reg(8)
+%endif ; %%INSTANCE_TYPE = multi_call
+%endif ; %%num_initial_blocks=1
+
+%%_after_reduction:
+ ;; Final hash is now in T3
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
+
+
+
+; GHASH_8_ENCRYPT_8_PARALLEL: encrypt 8 blocks at a time, with the AES rounds interleaved with the GHASH multiplies
+; ghash the 8 previously encrypted ciphertext blocks (passed in %%XMM1..%%XMM8; %%XMM2..%%XMM8 are spilled to [rsp + TMP2..TMP8])
+; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value (read only here); on exit the reduced hash has been XORed into %%XMM1
+%macro GHASH_8_ENCRYPT_8_PARALLEL 23
+%define %%GDATA %1 ; base for the AES round keys (16*n) and the HashKey / HashKey_k entries
+%define %%CYPH_PLAIN_OUT %2 ; output buffer base
+%define %%PLAIN_CYPH_IN %3 ; input buffer base
+%define %%DATA_OFFSET %4 ; byte offset of the 8-block group inside both buffers
+%define %%T1 %5 ; scratch: round keys, reloaded spilled blocks, finally the reduced hash
+%define %%T2 %6 ; scratch: copy of the incoming %%XMM1, later (last round key xor input)
+%define %%T3 %7 ; scratch: partial products, decrypt output, POLY2
+%define %%T4 %8 ; accumulates the high (0x11) products
+%define %%T5 %9 ; scratch: counter increment, shuffle mask, hash keys, late round keys
+%define %%T6 %10 ; accumulates the middle (0x01 / 0x10) products
+%define %%CTR %11 ; counter block; left at the value used for the 8th block
+%define %%XMM1 %12 ; in: first block to hash; out: first new block with the hash XORed in
+%define %%XMM2 %13 ; %%XMM2..%%XMM8 - in: remaining blocks to hash; out: new blocks, byte-swapped.
+%define %%XMM3 %14 ; NOTE(review): the last-round loop addresses the blocks as reg(1)..reg(8);
+%define %%XMM4 %15 ; presumably these are the same registers as %%XMM1..%%XMM8 - confirm
+%define %%XMM5 %16 ; against the reg() definition and the callers.
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20 ; accumulates the low (0x00) products
+%define %%loop_idx %21 ; in_order: add ONE/TWO then byte-swap; otherwise add ONEf/TWOf with no swap
+%define %%ENC_DEC %22 ; compared against ENC; any other value takes the decrypt path
+%define %%FULL_PARTIAL %23 ; full: all 8 blocks are read and written; otherwise the 8th block's input is not read and its output is not written
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR CNT
+ vmovdqa %%T5, [rel TWO]
+ vpaddd %%XMM2, %%CTR, %%T5 ; %%XMM2 = %%CTR + TWO; each later block is two blocks back + TWO
+ vpaddd %%XMM3, %%XMM1, %%T5
+ vpaddd %%XMM4, %%XMM2, %%T5
+ vpaddd %%XMM5, %%XMM3, %%T5
+ vpaddd %%XMM6, %%XMM4, %%T5
+ vpaddd %%XMM7, %%XMM5, %%T5
+ vpaddd %%XMM8, %%XMM6, %%T5
+ vmovdqa %%CTR, %%XMM8
+
+ vmovdqa %%T5, [rel SHUF_MASK]
+ vpshufb %%XMM1, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM2, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM3, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM4, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM5, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM6, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM7, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM8, %%T5 ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [rel ONEf] ; INCR CNT
+ vmovdqa %%T5, [rel TWOf]
+ vpaddd %%XMM2, %%CTR, %%T5
+ vpaddd %%XMM3, %%XMM1, %%T5
+ vpaddd %%XMM4, %%XMM2, %%T5
+ vpaddd %%XMM5, %%XMM3, %%T5
+ vpaddd %%XMM6, %%XMM4, %%T5
+ vpaddd %%XMM7, %%XMM5, %%T5
+ vpaddd %%XMM8, %%XMM6, %%T5
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ vpxor %%XMM2, %%XMM2, %%T1
+ vpxor %%XMM3, %%XMM3, %%T1
+ vpxor %%XMM4, %%XMM4, %%T1
+ vpxor %%XMM5, %%XMM5, %%T1
+ vpxor %%XMM6, %%XMM6, %%T1
+ vpxor %%XMM7, %%XMM7, %%T1
+ vpxor %%XMM8, %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T1, %%T4, %%T3 ; the high accumulator moves from %%T4 to %%T1 here
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10] ; for GCM128_MODE this key stays in %%T5 for the last round
+ %ifndef GCM128_MODE ; GCM192 or GCM256
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12] ; stays in %%T5 for the last round unless GCM256_MODE is defined
+%endif
+%ifdef GCM256_MODE
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+%endif ; GCM256
+
+%assign i 0
+%assign j 1
+%rep 8
+
+ ;; SNP TBD: This is pretty ugly - consider whether just XORing the
+ ;; data in after vaesenclast is simpler and performant. Would
+ ;; also have to ripple it through partial block and ghash_mul_8.
+%ifidn %%FULL_PARTIAL, full
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2 ; %%T2 = last round key xor input, so the result is already XORed with the input
+ %else
+ vaesenclast %%T3, reg(j), %%T2 ; %%T3 = decrypted block
+ vpxor reg(j), %%T2, %%T5 ; undo the key xor: reg(j) = input block, kept for hashing
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %endif
+
+%else
+ ; Don't read the final data during partial block processing
+ %ifdef NT_LD
+ %if (i<7)
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ ;; Stage the key directly in T2 rather than XOR it with the input
+ vmovdqu %%T2, %%T5
+ %endif
+ %else
+ %if (i<7)
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %else
+ ;; Stage the key directly in T2 rather than XOR it with the input
+ vmovdqu %%T2, %%T5
+ %endif
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ %if (i<7)
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ ;; Write out the decrypted block; only the 8th block (i = 7) is neither read nor written here
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %else
+ vaesenclast reg(j), reg(j), %%T2
+ %endif
+ %endif
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpslldq %%T3, %%T6, 8 ; %%T3 = %%T6 shifted left by 8 bytes (2 DWs)
+ vpsrldq %%T6, %%T6, 8 ; %%T6 = %%T6 shifted right by 8 bytes (2 DWs)
+ vpxor %%T7, %%T7, %%T3
+ vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ %ifidn %%ENC_DEC, ENC
+ ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7
+ %ifidn %%FULL_PARTIAL, full
+ ;; Avoid writing past the buffer if handling a partial block
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8
+ %endif
+ %endif
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
+
+ vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
+
+
+ vpxor %%XMM1, %%T1 ; fold the reduced hash into the first of the new blocks
+
+
+%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
+
+
+; GHASH_LAST_8: GHASH the last 8 ciphertext blocks (%%XMM1..%%XMM8) against HashKey_8..HashKey and reduce; the result is left in %%T6.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1 ; base for the HashKey / HashKey_k entries
+%define %%T1 %2 ; not referenced in this macro body
+%define %%T2 %3 ; scratch: (high xor low) half of the data block, later the middle term
+%define %%T3 %4 ; scratch: (high xor low) half of the hash key, later POLY2
+%define %%T4 %5 ; scratch: per-block partial products
+%define %%T5 %6 ; scratch: current hash key
+%define %%T6 %7 ; accumulates the high (0x11) products; holds the final result
+%define %%T7 %8 ; accumulates the low (0x00) products
+%define %%XMM1 %9 ; in: first block; overwritten, reused as the middle-product accumulator
+%define %%XMM2 %10 ; %%XMM2..%%XMM8 - in: remaining blocks (only read)
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ;; Karatsuba Method: three multiplies per block - high*high, low*low and (high^low)*(high^low)
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+
+ vpshufd %%T2, %%XMM1, 01001110b ; swap the two 64-bit halves
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1 ; both halves of %%T2 = high ^ low of the block
+ vpxor %%T3, %%T3, %%T5 ; both halves of %%T3 = high ^ low of the key
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 ; %%XMM1 is overwritten here with the first middle product
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM8
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7 ; %%T2 = middle sum xor high sum xor low sum (the Karatsuba middle term)
+
+
+
+
+ vpslldq %%T4, %%T2, 8 ; %%T4 = middle term shifted left by 8 bytes
+ vpsrldq %%T2, %%T2, 8 ; %%T2 = middle term shifted right by 8 bytes
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+
+; GHASH the last 7 ciphertext blocks: %%XMM1..%%XMM7 are multiplied by HashKey_7..HashKey_1, summed and reduced; the result is left in %%T6.
+%macro GHASH_LAST_7 15
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+ ;; %%T1 is accepted but never referenced below; %%T2-%%T7 and %%XMM1 are overwritten.
+ ;; Karatsuba Method: per block, 3 carry-less multiplies (hi*hi -> %%T6, lo*lo -> %%T7, (hi^lo)*(hi^lo) -> %%XMM1)
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+
+ vpshufd %%T2, %%XMM1, 01001110b ; swap the two 64-bit halves of the block
+ vpshufd %%T3, %%T5, 01001110b ; swap the two 64-bit halves of the hash key power
+ vpxor %%T2, %%T2, %%XMM1 ; both halves of %%T2 = hi ^ lo of the block
+ vpxor %%T3, %%T3, %%T5 ; both halves of %%T3 = hi ^ lo of the key power
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 ; high product accumulator starts here
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 ; low product accumulator starts here
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 ; middle product accumulator starts here (block 1 is consumed)
+
+ ;;;;;;;;;;;;;;;;;;;;;; %%XMM2 * HashKey_6
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;; %%XMM3 * HashKey_5
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;; %%XMM4 * HashKey_4
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;; %%XMM5 * HashKey_3
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;; %%XMM6 * HashKey_2
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;; %%XMM7 * HashKey_1
+
+ vmovdqu %%T5, [%%GDATA + HashKey_1]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;; Karatsuba recombination: middle term = %%XMM1 ^ high ^ low
+
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8 ; low 64 bits of the middle term, moved to the upper half
+ vpsrldq %%T2, %%T2, 8 ; high 64 bits of the middle term, moved to the lower half
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqa %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+
+
+;;; Handle encryption (or decryption) of the final partial block
+;;; IN:
+;;; r13 - Number of bytes in the partial block (the output loop below requires r13 > 0)
+;;; MODIFIES:
+;;; KEY - in: key stream block to XOR with; out: masked output block (ENC) or masked input ciphertext (DEC)
+;;; DATA_OFFSET - advanced by the number of bytes written; r13 is counted down to 0
+;;; SMASHES:
+;;; r10, r12, r15, rax
+;;; T1, T2
+;;; Note:
+;;; PLAIN_CYPH_LEN, %6, is passed only to determine
+;;; if buffer is big enough to do a 16 byte read & shift.
+;;; 'LT16' is passed here only if buffer is known to be smaller
+;;; than 16 bytes.
+;;; Any other value passed here will result in 16 byte read
+;;; code path.
+;;; TBD: Remove HASH from the instantiation
+%macro ENCRYPT_FINAL_PARTIAL_BLOCK 8
+%define %%KEY %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%CYPH_PLAIN_OUT %4
+%define %%PLAIN_CYPH_IN %5
+%define %%PLAIN_CYPH_LEN %6
+%define %%ENC_DEC %7
+%define %%DATA_OFFSET %8
+
+ ;; NOTE: type of read tuned based on %%PLAIN_CYPH_LEN setting (assembly-time text compare against 'LT16')
+%ifidn %%PLAIN_CYPH_LEN, LT16
+ ;; Handle the case where the message is < 16 bytes
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+
+ ;; T1 - packed output
+ ;; r10 - input data address
+ ;; r13 - input data length
+ ;; r12, r15, rax - temp registers
+ READ_SMALL_DATA_INPUT %%T1, r10, r13, r12, r15, rax
+
+ lea r12, [SHIFT_MASK + 16] ; r12 is only needed for the ALL_F mask lookup further down
+ sub r12, r13
+%else
+ ;; Handle the case where the message is >= 16 bytes
+ sub %%DATA_OFFSET, 16 ; temporarily point at the 16 bytes that END with the partial block
+ add %%DATA_OFFSET, r13
+ ;; Receive the last <16 Byte block
+ vmovdqu %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]
+ sub %%DATA_OFFSET, r13 ; restore %%DATA_OFFSET to the start of the partial block
+ add %%DATA_OFFSET, 16
+
+ lea r12, [SHIFT_MASK + 16]
+ ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ ;; (r13 is the number of bytes in plaintext mod 16)
+ sub r12, r13
+ ;; Get the appropriate shuffle mask
+ vmovdqu %%T2, [r12]
+ ;; shift right 16-r13 bytes
+ vpshufb %%T1, %%T2
+%endif ; %%PLAIN_CYPH_LEN, LT16
+
+ ;; At this point T1 contains the partial block data
+%ifidn %%ENC_DEC, DEC
+ ;; Plaintext XOR E(K, Yn)
+ ;; Set aside the ciphertext
+ vmovdqa %%T2, %%T1
+ vpxor %%KEY, %%KEY, %%T1
+ ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext
+ vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
+ ;; Mask out top 16-r13 bytes of the decrypted output
+ vpand %%KEY, %%KEY, %%T1
+
+ ;; Prepare the ciphertext for the hash
+ ;; mask out top 16-r13 bytes of the saved ciphertext
+ vpand %%T2, %%T2, %%T1
+%else
+ ;; Plaintext XOR E(K, Yn)
+ vpxor %%KEY, %%KEY, %%T1
+ ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY
+ vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
+ ;; Mask out top 16-r13 bytes of %%KEY
+ vpand %%KEY, %%KEY, %%T1
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Output r13 Bytes
+ vmovq rax, %%KEY
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left ; 8 or fewer bytes: everything goes through the byte loop
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax ; more than 8 bytes: store the low qword in one go
+ add %%DATA_OFFSET, 8
+ vpsrldq %%T1, %%KEY, 8
+ vmovq rax, %%T1
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left ; loops until r13 reaches 0
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn %%ENC_DEC, DEC
+ ;; If decrypt, restore the ciphertext into %%KEY
+ vmovdqu %%KEY, %%T2
+%endif
+%endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK
+
+
+
+; Encryption of a single block: %%XMM0 is encrypted in place with the 16-byte round keys stored at %%GDATA
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+ ;; Round key 0 is XORed in, round keys 1..NROUNDS use vaesenc, round key NROUNDS+1 uses vaesenclast.
+ vpxor %%XMM0, %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC: saves r12-r15, keeps the post-push rsp in r14 and carves out a 64-byte aligned local frame
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp ; FUNC_RESTORE reloads rsp from r14, so r14 must stay intact in between
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; round rsp down to a 64-byte boundary
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+ ;; Mirror of FUNC_SAVE: optional scratch clearing, reload xmm6-xmm15 on win64, then unwind the stack via r14.
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_ymms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14 ; r14 still holds the rsp value recorded by FUNC_SAVE right after its pushes
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+; Additional Authentication data (A_IN), Additional Data length (A_LEN).
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
+; On exit xmm2 holds the byte-swapped counter block that was stored to CurCount.
+; Clobbers rax, r10-r13, xmm0-xmm6 and xmm14 (used as the AAD hash accumulator)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%IV %3
+%define %%A_IN %4
+%define %%A_LEN %5
+%define %%AAD_HASH xmm14
+
+
+ ;; Skip the AAD hashing entirely when there is no AAD; the hash is then all-zero.
+ mov r10, %%A_LEN
+ cmp r10, 0
+ je %%_aad_is_zero
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ jmp %%_after_aad
+
+%%_aad_is_zero:
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+%%_after_aad:
+ mov r10, %%A_LEN ; reload: r10 was handed to CALC_AAD_HASH as a temporary
+ ;; Zero xmm2 so that PBlockEncKey really is initialized to 0. XORing xmm2 with
+ ;; another register (as was done previously) would store leftover register contents.
+ vpxor xmm2, xmm2, xmm2
+
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
+ vpinsrq xmm2, [r10], 0 ; IV bytes 0..7 -> qword 0
+ vpinsrd xmm2, [r10+8], 2 ; IV bytes 8..11 -> dword 2 (dword 3 keeps the ONEf padding)
+ vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [rel SHUF_MASK]
+
+ vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
+
+%macro GCM_ENC_DEC_SMALL 12
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET %7
+%define %%LENGTH %8
+%define %%NUM_BLOCKS %9
+%define %%CTR %10
+%define %%HASH %11
+%define %%INSTANCE_TYPE %12
+ ;; Dispatches on %%NUM_BLOCKS (1..8) to one INITIAL_BLOCKS_PARTIAL expansion. %%PLAIN_CYPH_LEN, %%LENGTH, %%CTR and %%HASH are not referenced: r13 and the xmm registers are hard-coded in the invocations.
+ ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC.
+ ;; cmp %%NUM_BLOCKS, 0
+ ;; je %%_small_initial_blocks_encrypted
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ cmp %%NUM_BLOCKS, 5
+ je %%_small_initial_num_blocks_is_5
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+ ;; Any value not matched above (including 0) is treated as 1 block.
+ jmp %%_small_initial_num_blocks_is_1
+
+
+%%_small_initial_num_blocks_is_8:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_7:
+ ;; Register mapping shared by every INITIAL_BLOCKS_PARTIAL invocation in this macro:
+ ;; r13 - %%LENGTH
+ ;; xmm12 - T1
+ ;; xmm13 - T2
+ ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
+ ;; xmm15 - T4
+ ;; xmm11 - T5
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8 - AAD HASH IN
+ ;; xmm10 - T6
+ ;; xmm0 - T_key
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_6:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_5:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_4:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_3:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_2:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_1:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+
+ ;; Note: zero initial blocks not allowed.
+
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; GCM_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT (a zero length returns immediately).
+; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 7
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%INSTANCE_TYPE %7
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ ;; Update length of data processed
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + InLen], rax
+%else
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
+%endif
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ vmovdqu xmm8, [%%GDATA_CTX + AadHash]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: partial block processing makes only sense for multi_call here.
+ ;; Used for the update flow - if there was a previous partial
+ ;; block fill the remaining bytes here.
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+%endif
+
+ ;; lift CTR set from initial_blocks to here
+%ifidn %%INSTANCE_TYPE, single_call
+ vmovdqu xmm9, xmm2 ; single_call expects the counter block to already be in xmm2 on entry
+%else
+ vmovdqu xmm9, [%%GDATA_CTX + CurCount]
+%endif
+
+ ;; Save the amount of data left to process in r13
+ mov r13, %%PLAIN_CYPH_LEN
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%DATA_OFFSET is zero in single_call case.
+ ;; Consequently PLAIN_CYPH_LEN will never be zero after
+ ;; %%DATA_OFFSET subtraction below.
+ sub r13, %%DATA_OFFSET
+
+ ;; There may be no more data if it was consumed in the partial block.
+ cmp r13, 0
+ je %%_enc_dec_done
+%endif ; %%INSTANCE_TYPE, multi_call
+ mov r10, r13
+
+ ;; Determine how many blocks to process in INITIAL
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ ;; Process one additional block in INITIAL if there is a partial block
+ and r10, 0xf
+ blsmsk r10, r10 ; Set CF if zero
+ cmc ; Flip CF
+ adc r12, 0x0 ; Process an additional INITIAL block if CF set
+
+ ;; Less than 128B will be handled by the small message code, which
+ ;; can process up to 8 blocks (7 full 16B blocks plus a partial one). NOTE: the compare below is signed.
+ cmp r13, 128
+ jge %%_large_message_path
+
+ GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
+ jmp %%_ghash_done
+
+%%_large_message_path:
+ and r12, 0x7 ; Still, don't allow 8 INITIAL blocks since that case
+ ; can be handled by the x8 partial loop.
+
+ cmp r12, 0
+ je %%_initial_num_blocks_is_0
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ ;; r13 - %%LENGTH
+ ;; xmm12 - T1
+ ;; xmm13 - T2
+ ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
+ ;; xmm15 - T4
+ ;; xmm11 - T5
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8 - AAD HASH IN
+ ;; xmm10 - T6
+ ;; xmm0 - T_key
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ ;; The entire message was processed in initial and now needs to be hashed
+ cmp r13, 0
+ je %%_encrypt_done
+
+ ;; Encrypt the final <16 byte (partial) block, then hash
+ cmp r13, 16
+ jl %%_encrypt_final_partial
+
+ ;; Process 7 full blocks plus a partial block
+ cmp r13, 128
+ jl %%_encrypt_by_8_partial
+
+
+%%_encrypt_by_8_parallel:
+ ;; in_order vs. out_order is an optimization to increment the counter without shuffling
+ ;; it back into little endian. r15d keeps track of when we need to increment in order so
+ ;; that the carry is handled correctly.
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [rel SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ ;; xmm0 - T1
+ ;; xmm10 - T2
+ ;; xmm11 - T3
+ ;; xmm12 - T4
+ ;; xmm13 - T5
+ ;; xmm14 - T6
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8
+ ;; xmm15 - T7
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ cmp r13, 128
+ jge %%_encrypt_by_8_new
+
+ vpshufb xmm9, [rel SHUF_MASK]
+ jmp %%_encrypt_by_8_parallel_done
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [rel SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
+ vpshufb xmm9, [rel SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ cmp r13, 128
+ jge %%_encrypt_by_8_new
+ vpshufb xmm9, [rel SHUF_MASK]
+
+
+%%_encrypt_by_8_parallel_done:
+ ;; Test to see if we need a by 8 with partial block. At this point
+ ;; bytes remaining should be either zero or between 113-127.
+ cmp r13, 0
+ je %%_encrypt_done
+
+%%_encrypt_by_8_partial:
+ ;; Shuffle needed to align key for partial block xor. out_order
+ ;; is a little faster because it avoids extra shuffles.
+ ;; TBD: Might need to account for when we don't have room to increment the counter.
+
+
+ ;; Process parallel buffers with a final partial block.
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial
+
+
+ add %%DATA_OFFSET, 128-16
+ sub r13, 128-16
+
+%%_encrypt_final_partial:
+
+ vpshufb xmm8, [rel SHUF_MASK]
+ mov [%%GDATA_CTX + PBlockLen], r13
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8
+
+ ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext
+ ;; Arguments: KEY, T1, T2, output pointer, input pointer, length token, ENC/DEC, data offset
+ ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET
+
+ vpshufb xmm8, [rel SHUF_MASK]
+
+
+%%_encrypt_done:
+
+ ;; Mapping to macro parameters
+ ;; IN:
+ ;; xmm9 contains the counter
+ ;; xmm1-xmm8 contain the xor'd ciphertext
+ ;; OUT:
+ ;; xmm14 contains the final hash
+ ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+%ifidn %%INSTANCE_TYPE, multi_call
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ jz %%_hash_last_8
+ GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ ;; XOR the partial word into the hash
+ vpxor xmm14, xmm14, xmm8
+ jmp %%_ghash_done
+%endif
+%%_hash_last_8:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+%%_ghash_done:
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+
+%%_enc_dec_done:
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
+; INSTANCE_TYPE selects multi_call (hash state is reloaded from GDATA_CTX) or single_call
+; (hash state is expected to still be live in xmm14).
+; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%INSTANCE_TYPE %6
+%define %%PLAIN_CYPH_LEN rax
+
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ ;; Start AES as early as possible
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If the GCM function is called as a single function call rather
+ ;; than invoking the individual parts (init, update, finalize) we
+ ;; can remove a write to read dependency on AadHash.
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+
+ ;; Encrypt the final partial block. If we did this as a single call then
+ ;; the partial block was handled in the main GCM_ENC_DEC macro.
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+%endif
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ ;; len(A) is a 64-bit field of the length block: move all 64 bits.
+ ;; (A 32-bit vmovd would truncate the bit count for AAD of 512 MiB or more.)
+ vmovq xmm15, r12 ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (bytes * 8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
+ vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap
+
+ vpxor xmm9, xmm9, xmm14 ; tag = E(K, Y0) XOR GHASH result
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ ;; Fast paths for the common tag lengths; any other length goes through simd_store_avx.
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ cmp r11, 8
+ je %%_T_8
+
+ simd_store_avx r10, xmm9, r11, r12, rax
+ jmp %%_return_T_done
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+
+%ifdef SAFE_DATA
+ ;; Clear sensitive data from context structure
+ vpxor xmm0, xmm0
+ vmovdqu [%%GDATA_CTX + AadHash], xmm0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
+%endif
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx_gen4 /
+; aes_gcm_precomp_192_avx_gen4 /
+; aes_gcm_precomp_256_avx_gen4
+; (struct gcm_key_data *key_data)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(precomp,_),function,)
+FN_NAME(precomp,_):
+ ;; Derives the GHASH key from the AES round keys at arg1 and stores it (as HashKey<<1 mod poly) plus the values written by PRECOMPUTE back into the same struct.
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_precomp ; nothing has been pushed yet, so a plain ret is correct
+%endif
+
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp ; remember rsp so the aligned frame can be dropped with one mov
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6 ; all-zero input block
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [rel SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1
+ vpsrlq xmm2, xmm2, 63 ; top bit of each qword, i.e. the bits shifted out by vpsllq
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8 ; carry out of the low qword moves into the high qword
+ vpsrldq xmm1, xmm1, 8 ; carry out of the high qword, kept for the reduction test
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [rel TWOONE]
+ vpand xmm2, xmm2, [rel POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_ymms_asm
+%endif
+exit_precomp:
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(init,_),function,)
+FN_NAME(init,_):
+ ;; Entry point wrapping GCM_INIT: hashes the AAD and seeds the context
+ ;; (arg1 = key_data, arg2 = context_data, arg3 = iv, arg4 = aad, arg5 = aad_len).
+ ;; With SAFE_PARAM, a NULL key_data/context_data/iv, or a NULL aad with a
+ ;; non-zero aad_len, returns without touching the context.
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ push r14
+ push r15
+ mov r14, rsp
+ ; xmm6:xmm15 need to be maintained for Windows. GCM_INIT is handed xmm1-xmm6
+ ; as temporaries and accumulates the AAD hash in xmm14, so both xmm6 and
+ ; xmm14 have to be preserved here.
+ sub rsp, 2*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm14
+%endif
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_init
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_init
+
+ ;; Check IV != NULL
+ cmp arg3, 0
+ jz exit_init
+
+ ;; Check if aad_len == 0
+ cmp arg5, 0
+ jz skip_aad_check_init
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg4, 0
+ jz exit_init
+
+skip_aad_check_init:
+%endif
+ GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_ymms_asm
+%endif
+exit_init:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + 0*16]
+ vmovdqu xmm14, [rsp + 1*16]
+ mov rsp, r14
+ pop r15
+ pop r14
+%endif
+ pop r13
+ pop r12
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 /
+; aes_gcm_enc_256_update_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_update_),function,)
+FN_NAME(enc,_update_):
+ ;; Encrypt "update" entry point.
+ ;; Arguments (see the prototype above): arg1 = key_data, arg2 = context_data,
+ ;; arg3 = out, arg4 = in, arg5 = plaintext_len.
+ ;; Expands GCM_ENC_DEC in ENC / multi_call mode between FUNC_SAVE and
+ ;; FUNC_RESTORE. Those macros are defined outside this section
+ ;; (presumably the register save/restore pair - confirm at their definition).
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Every failed check jumps to exit_update_enc, i.e. FUNC_RESTORE still
+ ;; runs but GCM_ENC_DEC is skipped.
+
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_enc
+
+ ;; Check if plaintext_len == 0
+ ;; (a zero length skips the in/out pointer checks, not the processing)
+ cmp arg5, 0
+ jz skip_in_out_check_update_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_enc
+
+skip_in_out_check_update_enc:
+%endif
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
+
+exit_update_enc:
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 /
+; aes_gcm_dec_256_update_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_update_),function,)
+FN_NAME(dec,_update_):
+ ;; Decrypt "update" entry point.
+ ;; Arguments (see the prototype above): arg1 = key_data, arg2 = context_data,
+ ;; arg3 = out, arg4 = in, arg5 = plaintext_len.
+ ;; Expands GCM_ENC_DEC in DEC / multi_call mode between FUNC_SAVE and
+ ;; FUNC_RESTORE. Those macros are defined outside this section
+ ;; (presumably the register save/restore pair - confirm at their definition).
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Every failed check jumps to exit_update_dec, i.e. FUNC_RESTORE still
+ ;; runs but GCM_ENC_DEC is skipped.
+
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_dec
+
+ ;; Check if plaintext_len == 0
+ ;; (a zero length skips the in/out pointer checks, not the processing)
+ cmp arg5, 0
+ jz skip_in_out_check_update_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_dec
+
+skip_in_out_check_update_dec:
+%endif
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
+
+exit_update_dec:
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 /
+; aes_gcm_enc_256_finalize_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_finalize_),function,)
+FN_NAME(enc,_finalize_):
+ ;; Encrypt "finalize" entry point.
+ ;; Arguments (see the prototype above): arg1 = key_data, arg2 = context_data,
+ ;; arg3 = auth_tag, arg4 = auth_tag_len.
+ ;; Expands GCM_COMPLETE (defined outside this section) in ENC / multi_call
+ ;; mode.
+
+%ifdef SAFE_PARAM
+ ;; These checks run before anything is pushed, which is why exit_enc_fin
+ ;; sits after the final pop: the early-exit path has nothing to unwind.
+
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_enc_fin
+
+ ;; unsigned compare: only lengths 1..16 are accepted
+ cmp arg4, 16
+ ja exit_enc_fin
+%endif
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ ;; Only xmm6, xmm9, xmm11, xmm14 and xmm15 are spilled here (presumably
+ ;; the non-volatile XMM registers GCM_COMPLETE touches - confirm in the macro).
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm9
+ vmovdqu [rsp + 2*16], xmm11
+ vmovdqu [rsp + 3*16], xmm14
+ vmovdqu [rsp + 4*16], xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
+
+%ifdef SAFE_DATA
+ ;; Scrub scratch registers before the saved XMM registers are reloaded
+ clear_scratch_gps_asm
+ clear_scratch_ymms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ ;; Reload the spilled XMM registers and release the 5*16-byte save area
+ vmovdqu xmm15, [rsp + 4*16]
+ vmovdqu xmm14, [rsp + 3*16]
+ vmovdqu xmm11, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 5*16
+%endif
+ pop r12
+exit_enc_fin:
+
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4 /
+; aes_gcm_dec_256_finalize_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_finalize_),function,)
+FN_NAME(dec,_finalize_):
+ ;; Decrypt "finalize" entry point.
+ ;; Arguments (see the prototype above): arg1 = key_data, arg2 = context_data,
+ ;; arg3 = auth_tag, arg4 = auth_tag_len.
+ ;; Expands GCM_COMPLETE (defined outside this section) in DEC / multi_call
+ ;; mode.
+
+%ifdef SAFE_PARAM
+ ;; These checks run before anything is pushed, which is why exit_dec_fin
+ ;; sits after the final pop: the early-exit path has nothing to unwind.
+
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_dec_fin
+
+ ;; unsigned compare: only lengths 1..16 are accepted
+ cmp arg4, 16
+ ja exit_dec_fin
+%endif
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ ;; Only xmm6, xmm9, xmm11, xmm14 and xmm15 are spilled here (presumably
+ ;; the non-volatile XMM registers GCM_COMPLETE touches - confirm in the macro).
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm9
+ vmovdqu [rsp + 2*16], xmm11
+ vmovdqu [rsp + 3*16], xmm14
+ vmovdqu [rsp + 4*16], xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
+
+%ifdef SAFE_DATA
+ ;; Scrub scratch registers before the saved XMM registers are reloaded
+ clear_scratch_gps_asm
+ clear_scratch_ymms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ ;; Reload the spilled XMM registers and release the 5*16-byte save area
+ vmovdqu xmm15, [rsp + 4*16]
+ vmovdqu xmm14, [rsp + 3*16]
+ vmovdqu xmm11, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+
+exit_dec_fin:
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_),function,)
+FN_NAME(enc,_):
+ ;; Single-call GCM encrypt entry point.
+ ;; Arguments (see the prototype above): arg1 = key_data, arg2 = context_data,
+ ;; arg3 = out, arg4 = in, arg5 = plaintext_len, arg6 = iv, arg7 = aad,
+ ;; arg8 = aad_len, arg9 = auth_tag, arg10 = auth_tag_len.
+ ;; Runs GCM_INIT, GCM_ENC_DEC and GCM_COMPLETE back to back in single_call
+ ;; mode; FUNC_SAVE / FUNC_RESTORE and the GCM_* macros are defined outside
+ ;; this section.
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Every failed check jumps to exit_enc: FUNC_RESTORE still runs, but no
+ ;; GCM processing takes place and no output is written here.
+
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_enc
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_enc
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_enc
+
+ ;; unsigned compare: only lengths 1..16 are accepted
+ cmp arg10, 16
+ ja exit_enc
+
+ ;; Check if plaintext_len == 0
+ ;; (a zero length only skips the in/out pointer checks)
+ cmp arg5, 0
+ jz skip_in_out_check_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_enc
+
+skip_in_out_check_enc:
+ ;; Check if aad_len == 0
+ ;; (a zero length only skips the aad pointer check)
+ cmp arg8, 0
+ jz skip_aad_check_enc
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_enc
+
+skip_aad_check_enc:
+%endif
+ ;; init takes (key_data, context_data, iv, aad, aad_len)
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
+
+ ;; complete takes (key_data, context_data, auth_tag, auth_tag_len)
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
+
+exit_enc:
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_),function,)
+FN_NAME(dec,_):
+ ;; Single-call GCM decrypt entry point.
+ ;; Arguments (see the prototype above): arg1 = key_data, arg2 = context_data,
+ ;; arg3 = out, arg4 = in, arg5 = plaintext_len, arg6 = iv, arg7 = aad,
+ ;; arg8 = aad_len, arg9 = auth_tag, arg10 = auth_tag_len.
+ ;; Runs GCM_INIT, GCM_ENC_DEC and GCM_COMPLETE back to back in single_call
+ ;; mode; FUNC_SAVE / FUNC_RESTORE and the GCM_* macros are defined outside
+ ;; this section.
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Every failed check jumps to exit_dec: FUNC_RESTORE still runs, but no
+ ;; GCM processing takes place and no output is written here.
+
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_dec
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_dec
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_dec
+
+ ;; unsigned compare: only lengths 1..16 are accepted
+ cmp arg10, 16
+ ja exit_dec
+
+ ;; Check if plaintext_len == 0
+ ;; (a zero length only skips the in/out pointer checks)
+ cmp arg5, 0
+ jz skip_in_out_check_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_dec
+
+skip_in_out_check_dec:
+ ;; Check if aad_len == 0
+ ;; (a zero length only skips the aad pointer check)
+ cmp arg8, 0
+ jz skip_aad_check_dec
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_dec
+
+skip_aad_check_dec:
+%endif
+
+
+ ;; init takes (key_data, context_data, iv, aad, aad_len)
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
+
+ ;; complete takes (key_data, context_data, auth_tag, auth_tag_len)
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
+
+exit_dec:
+ FUNC_RESTORE
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_avx2.c b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_avx2.c
new file mode 100644
index 000000000..7133e64c6
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_avx2.c
@@ -0,0 +1,676 @@
+/*******************************************************************************
+ Copyright (c) 2012-2018, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define AVX2
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_ymms
+
+#include "intel-ipsec-mb.h"
+#include "include/kasumi_internal.h"
+#include "include/zuc_internal.h"
+#include "include/snow3g.h"
+
+#include "save_xmms.h"
+#include "asm.h"
+#include "des.h"
+#include "cpu_feature.h"
+#include "noaesni.h"
+
+JOB_AES_HMAC *submit_job_aes128_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes128_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes192_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes192_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes256_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes256_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cntr_avx(JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *submit_job_aes_cntr_bit_avx(JOB_AES_HMAC *job);
+
+#define SAVE_XMMS save_xmms_avx
+#define RESTORE_XMMS restore_xmms_avx
+
+#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_avx
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx
+#define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_avx
+
+#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_avx
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx
+#define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_avx
+
+#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_avx
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx
+#define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_avx
+
+#define SUBMIT_JOB_AES_ECB_128_ENC submit_job_aes_ecb_128_enc_avx
+#define SUBMIT_JOB_AES_ECB_128_DEC submit_job_aes_ecb_128_dec_avx
+#define SUBMIT_JOB_AES_ECB_192_ENC submit_job_aes_ecb_192_enc_avx
+#define SUBMIT_JOB_AES_ECB_192_DEC submit_job_aes_ecb_192_dec_avx
+#define SUBMIT_JOB_AES_ECB_256_ENC submit_job_aes_ecb_256_enc_avx
+#define SUBMIT_JOB_AES_ECB_256_DEC submit_job_aes_ecb_256_dec_avx
+
+#define SUBMIT_JOB_AES_CNTR submit_job_aes_cntr_avx
+#define SUBMIT_JOB_AES_CNTR_BIT submit_job_aes_cntr_bit_avx
+
+#define AES_CBC_DEC_128 aes_cbc_dec_128_avx
+#define AES_CBC_DEC_192 aes_cbc_dec_192_avx
+#define AES_CBC_DEC_256 aes_cbc_dec_256_avx
+
+#define AES_CNTR_128 aes_cntr_128_avx
+#define AES_CNTR_192 aes_cntr_192_avx
+#define AES_CNTR_256 aes_cntr_256_avx
+
+#define AES_CNTR_CCM_128 aes_cntr_ccm_128_avx
+
+#define AES_ECB_ENC_128 aes_ecb_enc_128_avx
+#define AES_ECB_ENC_192 aes_ecb_enc_192_avx
+#define AES_ECB_ENC_256 aes_ecb_enc_256_avx
+#define AES_ECB_DEC_128 aes_ecb_dec_128_avx
+#define AES_ECB_DEC_192 aes_ecb_dec_192_avx
+#define AES_ECB_DEC_256 aes_ecb_dec_256_avx
+
+#define SUBMIT_JOB_PON_ENC submit_job_pon_enc_avx
+#define SUBMIT_JOB_PON_DEC submit_job_pon_dec_avx
+#define SUBMIT_JOB_PON_ENC_NO_CTR submit_job_pon_enc_no_ctr_avx
+#define SUBMIT_JOB_PON_DEC_NO_CTR submit_job_pon_dec_no_ctr_avx
+
+#ifndef NO_GCM
+#define AES_GCM_DEC_128 aes_gcm_dec_128_avx_gen4
+#define AES_GCM_ENC_128 aes_gcm_enc_128_avx_gen4
+#define AES_GCM_DEC_192 aes_gcm_dec_192_avx_gen4
+#define AES_GCM_ENC_192 aes_gcm_enc_192_avx_gen4
+#define AES_GCM_DEC_256 aes_gcm_dec_256_avx_gen4
+#define AES_GCM_ENC_256 aes_gcm_enc_256_avx_gen4
+
+#define SUBMIT_JOB_AES_GCM_DEC submit_job_aes_gcm_dec_avx2
+#define FLUSH_JOB_AES_GCM_DEC flush_job_aes_gcm_dec_avx2
+#define SUBMIT_JOB_AES_GCM_ENC submit_job_aes_gcm_enc_avx2
+#define FLUSH_JOB_AES_GCM_ENC flush_job_aes_gcm_enc_avx2
+#endif /* NO_GCM */
+
+#define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx
+#define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx
+
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx
+#define QUEUE_SIZE queue_size_avx2
+
+#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_AVX2
+#define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_AVX2
+#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_AVX2
+
+
+
+JOB_AES_HMAC *submit_job_hmac_avx2(MB_MGR_HMAC_SHA_1_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_avx2(MB_MGR_HMAC_SHA_1_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_224_avx2(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_224_avx2(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_256_avx2(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_256_avx2(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_384_avx2(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_384_avx2(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_512_avx2(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_512_avx2(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state);
+
+#define SUBMIT_JOB_HMAC submit_job_hmac_avx2
+#define FLUSH_JOB_HMAC flush_job_hmac_avx2
+#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_avx2
+#define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_avx2
+#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_avx2
+#define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_avx2
+#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_avx2
+#define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_avx2
+#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_avx2
+#define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_avx2
+#define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_avx2
+#define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_avx2
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB submit_job_avx2
+#define FLUSH_JOB flush_job_avx2
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx2
+#define QUEUE_SIZE queue_size_avx2
+#define GET_NEXT_JOB get_next_job_avx2
+#define GET_COMPLETED_JOB get_completed_job_avx2
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AVX2
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AVX2
+
+/* ====================================================================== */
+
+#define AES_CFB_128_ONE aes_cfb_128_one_avx2
+
+void aes128_cbc_mac_x8(AES_ARGS *args, uint64_t len);
+
+#define AES128_CBC_MAC aes128_cbc_mac_x8
+
+#define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_avx
+#define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_avx
+
+#define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_avx
+#define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_avx
+
+/* ====================================================================== */
+
+/*
+ * GCM submit / flush API for AVX2 arch
+ */
+#ifndef NO_GCM
+/*
+ * Run a complete AES-GCM decrypt for one job.
+ *
+ * The key size in job->aes_key_len_in_bytes picks the 128-, 192- or 256-bit
+ * routine; any value other than 16 or 24 is treated as a 32-byte key.
+ * A stack-local GCM context is used, the job is marked STS_COMPLETED and
+ * handed straight back. The manager state is not used.
+ */
+static JOB_AES_HMAC *
+submit_job_aes_gcm_dec_avx2(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+
+        (void) state;
+
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_GCM_DEC_128(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        case 24:
+                AES_GCM_DEC_192(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        default: /* assume 32 bytes */
+                AES_GCM_DEC_256(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        }
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * GCM decrypt flush hook: always reports "no job".
+ * The matching submit routine completes each job before returning, so there
+ * is nothing left to flush; both parameters are unused.
+ */
+static JOB_AES_HMAC *
+flush_job_aes_gcm_dec_avx2(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        (void) job;
+        (void) state;
+
+        return NULL;
+}
+
+/*
+ * Run a complete AES-GCM encrypt for one job.
+ *
+ * The key size in job->aes_key_len_in_bytes picks the 128-, 192- or 256-bit
+ * routine; any value other than 16 or 24 is treated as a 32-byte key.
+ * A stack-local GCM context is used, the job is marked STS_COMPLETED and
+ * handed straight back. The manager state is not used.
+ */
+static JOB_AES_HMAC *
+submit_job_aes_gcm_enc_avx2(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+
+        (void) state;
+
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_GCM_ENC_128(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        case 24:
+                AES_GCM_ENC_192(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        default: /* assume 32 bytes */
+                AES_GCM_ENC_256(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        }
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * GCM encrypt flush hook: always reports "no job".
+ * The matching submit routine completes each job before returning, so there
+ * is nothing left to flush; both parameters are unused.
+ */
+static JOB_AES_HMAC *
+flush_job_aes_gcm_enc_avx2(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        (void) job;
+        (void) state;
+
+        return NULL;
+}
+#endif /* NO_GCM */
+
+/* ====================================================================== */
+
+/*
+ * Initialise a multi-buffer manager for the AVX2 code paths.
+ *
+ * Detects CPU features (adjusted by state->flags). Without AES-NI the whole
+ * job is delegated to init_mb_mgr_sse_no_aesni(). Otherwise every
+ * out-of-order sub-manager is reset, the in-order job ring indices are
+ * cleared and the function-pointer table in *state is filled in.
+ */
+void
+init_mb_mgr_avx2(MB_MGR *state)
+{
+        unsigned int j;
+        uint8_t *p;
+
+        state->features = cpu_feature_adjust(state->flags,
+                                             cpu_feature_detect());
+
+        if ((state->features & IMB_FEATURE_AESNI) == 0) {
+                init_mb_mgr_sse_no_aesni(state);
+                return;
+        }
+
+        /*
+         * AES out-of-order managers. For each one the whole lens[] array is
+         * filled with 0xFF bytes, then the first 8 entries are zeroed.
+         * unused_lanes presumably is a nibble-packed list of free lanes.
+         */
+        memset(state->aes128_ooo.lens, 0xFF,
+               sizeof(state->aes128_ooo.lens));
+        memset(state->aes128_ooo.lens, 0,
+               8 * sizeof(state->aes128_ooo.lens[0]));
+        memset(state->aes128_ooo.job_in_lane, 0,
+               sizeof(state->aes128_ooo.job_in_lane));
+        state->aes128_ooo.unused_lanes = 0xF76543210;
+        state->aes128_ooo.num_lanes_inuse = 0;
+
+        memset(state->aes192_ooo.lens, 0xFF,
+               sizeof(state->aes192_ooo.lens));
+        memset(state->aes192_ooo.lens, 0,
+               8 * sizeof(state->aes192_ooo.lens[0]));
+        memset(state->aes192_ooo.job_in_lane, 0,
+               sizeof(state->aes192_ooo.job_in_lane));
+        state->aes192_ooo.unused_lanes = 0xF76543210;
+        state->aes192_ooo.num_lanes_inuse = 0;
+
+        memset(state->aes256_ooo.lens, 0xFF,
+               sizeof(state->aes256_ooo.lens));
+        memset(state->aes256_ooo.lens, 0,
+               8 * sizeof(state->aes256_ooo.lens[0]));
+        memset(state->aes256_ooo.job_in_lane, 0,
+               sizeof(state->aes256_ooo.job_in_lane));
+        state->aes256_ooo.unused_lanes = 0xF76543210;
+        state->aes256_ooo.num_lanes_inuse = 0;
+
+        /* DOCSIS SEC BPI (AES CBC + AES CFB for the partial block) is set
+         * up exactly like the AES128 CBC manager above.
+         */
+        memset(state->docsis_sec_ooo.lens, 0xFF,
+               sizeof(state->docsis_sec_ooo.lens));
+        memset(state->docsis_sec_ooo.lens, 0,
+               8 * sizeof(state->docsis_sec_ooo.lens[0]));
+        memset(state->docsis_sec_ooo.job_in_lane, 0,
+               sizeof(state->docsis_sec_ooo.job_in_lane));
+        state->docsis_sec_ooo.unused_lanes = 0xF76543210;
+        state->docsis_sec_ooo.num_lanes_inuse = 0;
+
+        /* HMAC-SHA1 */
+        for (j = 0; j < 8; j++)
+                state->hmac_sha_1_ooo.lens[j] = 0;
+        state->hmac_sha_1_ooo.unused_lanes = 0xF76543210;
+        for (j = 0; j < AVX2_NUM_SHA1_LANES; j++) {
+                state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL;
+
+                /* 0x80 marker at offset 64, zeroes for the 64 + 7 bytes
+                 * that follow it
+                 */
+                memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65,
+                       0x00, 64 + 7);
+                state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80;
+
+                /* outer block: 5 digest words stay untouched, then the
+                 * 0x80 marker, zero fill and a fixed 2-byte length
+                 * 0x02A0 == (64 + 20) * 8 bits
+                 */
+                p = state->hmac_sha_1_ooo.ldata[j].outer_block;
+                p[5 * 4] = 0x80;
+                memset(p + 5*4 + 1, 0x00, 64 - 5*4 - 1 - 2);
+                p[64 - 2] = 0x02;
+                p[64 - 1] = 0xA0;
+        }
+
+        /* HMAC-SHA224: shares the SHA256 manager type and lane count,
+         * only the digest is shorter
+         */
+        for (j = 0; j < 8; j++)
+                state->hmac_sha_224_ooo.lens[j] = 0;
+        state->hmac_sha_224_ooo.unused_lanes = 0xF76543210;
+        for (j = 0; j < AVX2_NUM_SHA256_LANES; j++) {
+                state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL;
+
+                p = state->hmac_sha_224_ooo.ldata[j].extra_block;
+                memset(p, 0x00,
+                       sizeof(state->hmac_sha_224_ooo.ldata[j].extra_block));
+                p[64] = 0x80;
+
+                p = state->hmac_sha_224_ooo.ldata[j].outer_block;
+                memset(p, 0x00,
+                       sizeof(state->hmac_sha_224_ooo.ldata[j].outer_block));
+                p[7 * 4] = 0x80;  /* marker right after the 7 digest words */
+                /* fixed length 0x02E0 == (64 + 28) * 8 bits, written with
+                 * the most significant byte first
+                 */
+                p[64 - 2] = 0x02;
+                p[64 - 1] = 0xE0;
+        }
+
+        /* HMAC-SHA256 */
+        for (j = 0; j < 8; j++)
+                state->hmac_sha_256_ooo.lens[j] = 0;
+        state->hmac_sha_256_ooo.unused_lanes = 0xF76543210;
+        for (j = 0; j < AVX2_NUM_SHA256_LANES; j++) {
+                state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL;
+
+                memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65,
+                       0x00, 64 + 7);
+                state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80;
+
+                /* outer block: 8 digest words, marker, zero fill and the
+                 * fixed length 0x0300 == (64 + 32) * 8 bits
+                 */
+                p = state->hmac_sha_256_ooo.ldata[j].outer_block;
+                p[8 * 4] = 0x80;
+                memset(p + 8*4 + 1, 0x00, 64 - 8*4 - 1 - 2);
+                p[64 - 2] = 0x03;
+                p[64 - 1] = 0x00;
+        }
+
+        /* HMAC-SHA384: lens[0..3] start at zero, lens[4..7] at 0xFFFF */
+        for (j = 0; j < 4; j++) {
+                state->hmac_sha_384_ooo.lens[j] = 0;
+                state->hmac_sha_384_ooo.lens[j + 4] = 0xFFFF;
+        }
+        state->hmac_sha_384_ooo.unused_lanes = 0xFF03020100;
+        for (j = 0; j < AVX2_NUM_SHA512_LANES; j++) {
+                MB_MGR_HMAC_SHA_512_OOO *mgr384 = &state->hmac_sha_384_ooo;
+
+                mgr384->ldata[j].job_in_lane = NULL;
+                memset(mgr384->ldata[j].extra_block +
+                       (SHA_384_BLOCK_SIZE + 1),
+                       0x00, SHA_384_BLOCK_SIZE + 7);
+                mgr384->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80;
+
+                p = mgr384->ldata[j].outer_block;
+                /* end-of-data marker directly after the inner digest */
+                p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80;
+                /* zero everything between the marker and the last two
+                 * length bytes
+                 */
+                memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+                       SHA_384_BLOCK_SIZE -
+                       SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2);
+                /* The outer hash always covers one 1024-bit key block plus
+                 * the 384-bit inner digest: 1408 bits == 0x0580. The sha
+                 * implementation converts the block to big endian itself.
+                 */
+                p[SHA_384_BLOCK_SIZE - 2] = 0x05;
+                p[SHA_384_BLOCK_SIZE - 1] = 0x80;
+        }
+
+        /* HMAC-SHA512: lens[0..3] start at zero, lens[4..7] at 0xFFFF */
+        for (j = 0; j < 4; j++) {
+                state->hmac_sha_512_ooo.lens[j] = 0;
+                state->hmac_sha_512_ooo.lens[j + 4] = 0xFFFF;
+        }
+        state->hmac_sha_512_ooo.unused_lanes = 0xFF03020100;
+        for (j = 0; j < AVX2_NUM_SHA512_LANES; j++) {
+                MB_MGR_HMAC_SHA_512_OOO *mgr512 = &state->hmac_sha_512_ooo;
+
+                mgr512->ldata[j].job_in_lane = NULL;
+                memset(mgr512->ldata[j].extra_block +
+                       (SHA_512_BLOCK_SIZE + 1),
+                       0x00, SHA_512_BLOCK_SIZE + 7);
+                mgr512->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80;
+
+                p = mgr512->ldata[j].outer_block;
+                /* end-of-data marker directly after the inner digest */
+                p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80;
+                /* zero everything between the marker and the last two
+                 * length bytes
+                 */
+                memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+                       SHA_512_BLOCK_SIZE -
+                       SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2);
+                /* The outer hash always covers one 1024-bit key block plus
+                 * the 512-bit inner digest: 1536 bits == 0x0600. The sha
+                 * implementation converts the block to big endian itself.
+                 */
+                p[SHA_512_BLOCK_SIZE - 2] = 0x06;
+                p[SHA_512_BLOCK_SIZE - 1] = 0x00;
+        }
+
+        /* HMAC-MD5: 16 length slots */
+        for (j = 0; j < 16; j++)
+                state->hmac_md5_ooo.lens[j] = 0;
+        state->hmac_md5_ooo.unused_lanes = 0xFEDCBA9876543210;
+        state->hmac_md5_ooo.num_lanes_inuse = 0;
+        for (j = 0; j < AVX2_NUM_MD5_LANES; j++) {
+                state->hmac_md5_ooo.ldata[j].job_in_lane = NULL;
+
+                p = state->hmac_md5_ooo.ldata[j].extra_block;
+                memset(p, 0x00,
+                       sizeof(state->hmac_md5_ooo.ldata[j].extra_block));
+                p[64] = 0x80;
+
+                p = state->hmac_md5_ooo.ldata[j].outer_block;
+                memset(p, 0x00,
+                       sizeof(state->hmac_md5_ooo.ldata[j].outer_block));
+                p[4 * 4] = 0x80;  /* marker right after the 4 digest words */
+                /* fixed length 0x0280 == (64 + 16) * 8 bits, written with
+                 * the least significant byte first
+                 */
+                p[64 - 8] = 0x80;
+                p[64 - 7] = 0x02;
+        }
+
+        /* AES-XCBC */
+        for (j = 0; j < 8; j++)
+                state->aes_xcbc_ooo.lens[j] = 0;
+        state->aes_xcbc_ooo.unused_lanes = 0xF76543210;
+        for (j = 0; j < 8; j++) {
+                state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL;
+                memset(state->aes_xcbc_ooo.ldata[j].final_block + 17,
+                       0x00, 15);
+                state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80;
+        }
+
+        /* AES-CCM authentication */
+        state->aes_ccm_ooo.unused_lanes = 0xF76543210;
+        for (j = 0; j < 8; j++) {
+                state->aes_ccm_ooo.job_in_lane[j] = NULL;
+                state->aes_ccm_ooo.lens[j] = 0;
+                state->aes_ccm_ooo.init_done[j] = 0;
+        }
+
+        /* AES-CMAC authentication */
+        state->aes_cmac_ooo.unused_lanes = 0xF76543210;
+        for (j = 0; j < 8; j++) {
+                state->aes_cmac_ooo.job_in_lane[j] = NULL;
+                state->aes_cmac_ooo.lens[j] = 0;
+                state->aes_cmac_ooo.init_done[j] = 0;
+        }
+
+        /* in-order job ring: earliest_job == -1 while nothing is queued */
+        state->next_job = 0;
+        state->earliest_job = -1;
+
+        /* job management handlers */
+        state->get_next_job = get_next_job_avx2;
+        state->submit_job = submit_job_avx2;
+        state->submit_job_nocheck = submit_job_nocheck_avx2;
+        state->get_completed_job = get_completed_job_avx2;
+        state->flush_job = flush_job_avx2;
+        state->queue_size = queue_size_avx2;
+
+        /* key expansion / key schedule helpers */
+        state->keyexp_128 = aes_keyexp_128_avx2;
+        state->keyexp_192 = aes_keyexp_192_avx2;
+        state->keyexp_256 = aes_keyexp_256_avx2;
+        state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_avx2;
+        state->xcbc_keyexp = aes_xcbc_expand_key_avx2;
+        state->des_key_sched = des_key_schedule;
+
+        /* direct hash helpers */
+        state->sha1_one_block = sha1_one_block_avx2;
+        state->sha1 = sha1_avx2;
+        state->sha224_one_block = sha224_one_block_avx2;
+        state->sha224 = sha224_avx2;
+        state->sha256_one_block = sha256_one_block_avx2;
+        state->sha256 = sha256_avx2;
+        state->sha384_one_block = sha384_one_block_avx2;
+        state->sha384 = sha384_avx2;
+        state->sha512_one_block = sha512_one_block_avx2;
+        state->sha512 = sha512_avx2;
+        state->md5_one_block = md5_one_block_avx2;
+        state->aes128_cfb_one = aes_cfb_128_one_avx2;
+
+        /* ZUC (the AVX variants are installed here) */
+        state->eea3_1_buffer = zuc_eea3_1_buffer_avx;
+        state->eea3_4_buffer = zuc_eea3_4_buffer_avx;
+        state->eea3_n_buffer = zuc_eea3_n_buffer_avx;
+        state->eia3_1_buffer = zuc_eia3_1_buffer_avx;
+
+        /* KASUMI (the AVX variants are installed here) */
+        state->f8_1_buffer = kasumi_f8_1_buffer_avx;
+        state->f8_1_buffer_bit = kasumi_f8_1_buffer_bit_avx;
+        state->f8_2_buffer = kasumi_f8_2_buffer_avx;
+        state->f8_3_buffer = kasumi_f8_3_buffer_avx;
+        state->f8_4_buffer = kasumi_f8_4_buffer_avx;
+        state->f8_n_buffer = kasumi_f8_n_buffer_avx;
+        state->f9_1_buffer = kasumi_f9_1_buffer_avx;
+        state->f9_1_buffer_user = kasumi_f9_1_buffer_user_avx;
+        state->kasumi_init_f8_key_sched = kasumi_init_f8_key_sched_avx;
+        state->kasumi_init_f9_key_sched = kasumi_init_f9_key_sched_avx;
+        state->kasumi_key_sched_size = kasumi_key_sched_size_avx;
+
+        /* SNOW3G */
+        state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_avx2;
+        state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_avx2;
+        state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_avx2;
+        state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_avx2;
+        state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_avx2;
+        state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_avx2;
+        state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_avx2;
+        state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_avx2;
+        state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_avx2;
+        state->snow3g_init_key_sched = snow3g_init_key_sched_avx2;
+        state->snow3g_key_sched_size = snow3g_key_sched_size_avx2;
+
+#ifndef NO_GCM
+        /* direct AES-GCM API (avx_gen4 implementations) */
+        state->gcm128_enc = aes_gcm_enc_128_avx_gen4;
+        state->gcm192_enc = aes_gcm_enc_192_avx_gen4;
+        state->gcm256_enc = aes_gcm_enc_256_avx_gen4;
+        state->gcm128_dec = aes_gcm_dec_128_avx_gen4;
+        state->gcm192_dec = aes_gcm_dec_192_avx_gen4;
+        state->gcm256_dec = aes_gcm_dec_256_avx_gen4;
+        state->gcm128_init = aes_gcm_init_128_avx_gen4;
+        state->gcm192_init = aes_gcm_init_192_avx_gen4;
+        state->gcm256_init = aes_gcm_init_256_avx_gen4;
+        state->gcm128_enc_update = aes_gcm_enc_128_update_avx_gen4;
+        state->gcm192_enc_update = aes_gcm_enc_192_update_avx_gen4;
+        state->gcm256_enc_update = aes_gcm_enc_256_update_avx_gen4;
+        state->gcm128_dec_update = aes_gcm_dec_128_update_avx_gen4;
+        state->gcm192_dec_update = aes_gcm_dec_192_update_avx_gen4;
+        state->gcm256_dec_update = aes_gcm_dec_256_update_avx_gen4;
+        state->gcm128_enc_finalize = aes_gcm_enc_128_finalize_avx_gen4;
+        state->gcm192_enc_finalize = aes_gcm_enc_192_finalize_avx_gen4;
+        state->gcm256_enc_finalize = aes_gcm_enc_256_finalize_avx_gen4;
+        state->gcm128_dec_finalize = aes_gcm_dec_128_finalize_avx_gen4;
+        state->gcm192_dec_finalize = aes_gcm_dec_192_finalize_avx_gen4;
+        state->gcm256_dec_finalize = aes_gcm_dec_256_finalize_avx_gen4;
+        state->gcm128_precomp = aes_gcm_precomp_128_avx_gen4;
+        state->gcm192_precomp = aes_gcm_precomp_192_avx_gen4;
+        state->gcm256_precomp = aes_gcm_precomp_256_avx_gen4;
+        state->gcm128_pre = aes_gcm_pre_128_avx_gen4;
+        state->gcm192_pre = aes_gcm_pre_192_avx_gen4;
+        state->gcm256_pre = aes_gcm_pre_256_avx_gen4;
+#endif
+}
+
+#include "mb_mgr_code.h"
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_flush_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_flush_avx2.asm
new file mode 100644
index 000000000..88fac0c64
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_flush_avx2.asm
@@ -0,0 +1,315 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+extern sha1_x8_avx2
+
+section .data
+default rel
+; 16-byte aligned constants used by flush_job_hmac_avx2
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 -- vpshufb mask reversing the bytes inside each dword
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+x80: ;ddq 0x00000000000000000000000000000080 -- not referenced by the code in this file
+ dq 0x0000000000000080, 0x0000000000000000
+x00: ;ddq 0x00000000000000000000000000000000 -- not referenced by the code in this file
+ dq 0x0000000000000000, 0x0000000000000000
+len_masks: ; 8 entries of 16 bytes; entry I has 0xFFFF in word I and is OR-ed into the lengths of empty lanes
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+lane_1: dq 1 ; lane_N holds N as a qword; used as the memory source of the cmovne lane search
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+; state stays in arg1 for the whole routine; arg2 is reused for job, len2, extra_blocks and p
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; NOTE: aliases below that map to the same register (r9, rax, arg2) cannot hold independent values at once
+; idx needs to be in rbx, rdi, rbp (it is read again after the call to sha1_x8_avx2)
+%define idx rbp
+
+%define unused_lanes r9
+%define lane_data r9
+%define tmp2 r9
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%endif
+
+; we clobber rbp (and write r12-r14 directly in end_loop), called routine clobbers r12-r15
+struc STACK
+_gpr_save: resq 5 ; slots for rbp, r12, r13, r14, r15
+_rsp_save: resq 1 ; rsp as it was on entry, before the 32-byte alignment
+endstruc
+; APPEND(a,b) pastes two tokens together, e.g. APPEND(lane_,3) -> lane_3
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_avx2(MB_MGR_HMAC_SHA_1_OOO *state)
+; arg 1 : rdi (LINUX) or rcx (otherwise) : state -- result in rax: the flushed job, or 0 via return_null
+MKGLOBAL(flush_job_hmac_avx2,function,internal)
+flush_job_hmac_avx2:
+; prologue: build a 32-byte aligned STACK frame and save the registers that "return" restores
+ mov rax, rsp ; rax = incoming rsp, kept in _rsp_save
+ sub rsp, STACK_size
+ and rsp, -32 ; align stack to 32 byte boundary
+ mov [rsp + _gpr_save + 8*0], rbp
+ mov [rsp + _gpr_save + 8*1], r12
+ mov [rsp + _gpr_save + 8*2], r13
+ mov [rsp + _gpr_save + 8*3], r14
+ mov [rsp + _gpr_save + 8*4], r15
+ mov [rsp + _rsp_save], rax
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+3 ; CF = bit 35 of the unused-lanes value
+ jc return_null ; bit 35 set: return NULL (presumably all 8 lanes are free -- TODO confirm against the submit code)
+
+ ; find a lane with a non-null job: idx starts at 0 and is replaced by every busy lane 1..7, so the highest busy lane wins
+ xor idx, idx
+%assign I 1
+%rep 7
+ cmp qword [state + _ldata + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ cmovne idx, [rel APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+copy_lane_data:
+ ; copy valid lane (idx) to empty lanes
+ vmovdqa xmm0, [state + _lens] ; xmm0 = the eight per-lane length words
+ mov tmp, [state + _args_data_ptr + PTR_SZ*idx] ; tmp = data pointer of the valid lane
+; for each lane without a job: reuse the valid lane's data pointer and force the length word to 0xFFFF
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr + PTR_SZ*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens], xmm0
+
+ vphminposuw xmm1, xmm0 ; word 0 = smallest length, word 1 = index of that lane
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ DBGPRINTL64 "FLUSH min_length", len2
+ DBGPRINTL64 "FLUSH min_length index ", idx
+ cmp len2, 0
+ je len_is_0 ; min length is 0: skip the sha1_x8_avx2 call
+
+ vpbroadcastw xmm1, xmm1 ; replicate the min length into every word
+ DBGPRINTL_XMM "FLUSH lens after shuffle", xmm1
+
+ vpsubw xmm0, xmm0, xmm1 ; subtract the min length from every lane's length
+ vmovdqa [state + _lens], xmm0
+ DBGPRINTL_XMM "FLUSH lens immediately after min subtraction", xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data] ; lane_data = address of lane idx's entry in _ldata
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra block(s) still pending for this lane
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; _outer_done already set: finish the job
+
+proc_outer: ; presumably the HMAC outer pass: hash outer_block (byte-swapped digest) starting from the key^opad state
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero the 8 bytes at extra_block + size_offset
+ mov word [state + _lens + 2*idx], 1 ; length word of lane idx = 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp ; lane idx now reads its data from outer_block
+ ; gather digest words 0-3 of lane idx (one word per digest row) into xmm0
+ vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap] ; reverse the bytes of each of the four dwords
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp) ; fifth digest word, byte-reversed
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*4], DWORD(tmp) ; outer_block[0..19] = byte-swapped 5-word digest
+ ; load the 20 bytes behind job->_auth_key_xor_opad and scatter them into lane idx of the digest array
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4] ; fifth word; tmp is overwritten by its own load
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+ jmp copy_lane_data ; go round again with the updated lane
+
+ align 16
+proc_extra_blocks: ; schedule the lane's pending extra block(s) as its next input
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens + 2*idx], WORD(extra_blocks) ; length word of lane idx = extra_blocks
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp ; data pointer = extra_block + start_offset
+ mov dword [lane_data + _extra_blocks], 0 ; mark the extra blocks as consumed
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax ; return value = NULL
+ jmp return
+
+ align 16
+end_loop: ; lane idx is finished: release the lane and write the tag
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes] ; r9 changes role here: lane_data is gone from this point
+ shl unused_lanes, 4 ;; a nibble
+ or unused_lanes, idx ; idx becomes the lowest nibble of the unused-lanes value
+ mov [state + _unused_lanes], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(r12), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(r12)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(r12)
+
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret ; NOTE(review): every length other than 12 falls through and writes the full 20 bytes
+
+ ;; copy remaining 8 bytes to return 20 byte digest
+ mov DWORD(r13), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(r14), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(r13)
+ bswap DWORD(r14)
+ mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(r13)
+ mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(r14)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor ymm0, ymm0 ; ymm0 = 0, source for the wipes below
+
+ ;; Clear digest (20B), outer_block (20B) and extra_block (64B)
+ ;; of returned job and NULL jobs
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 0*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 1*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 2*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 3*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+ lea lane_data, [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size)]
+
+ ;; Clear first 64 bytes of extra_block
+ vmovdqa [lane_data + _extra_block], ymm0
+ vmovdqa [lane_data + _extra_block + 32], ymm0
+
+ ;; Clear first 20 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov dword [lane_data + _outer_block + 16], 0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return: ; common epilogue; rax already holds the result
+ vzeroupper
+ mov rbp, [rsp + _gpr_save + 8*0]
+ mov r12, [rsp + _gpr_save + 8*1]
+ mov r13, [rsp + _gpr_save + 8*2]
+ mov r14, [rsp + _gpr_save + 8*3]
+ mov r15, [rsp + _gpr_save + 8*4]
+ mov rsp, [rsp + _rsp_save] ; back to the rsp seen on entry
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_md5_flush_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_md5_flush_avx2.asm
new file mode 100644
index 000000000..f123157b7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_md5_flush_avx2.asm
@@ -0,0 +1,362 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+extern md5_x8x2_avx2
+
+section .data
+default rel
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100 -- not referenced in this file (looks like a pshufb mask replicating word 0; the code uses vpbroadcastw)
+ dq 0x0100010001000100, 0x0100010001000100
+x80: ;ddq 0x00000000000000000000000000000080 -- not referenced by the code in this file
+ dq 0x0000000000000080, 0x0000000000000000
+x00: ;ddq 0x00000000000000000000000000000000 -- not referenced by the code in this file
+ dq 0x0000000000000000, 0x0000000000000000
+len_masks: ; 8 entries of 16 bytes; entry I has 0xFFFF in word I; applied to both the lower and the upper 8 lengths
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+; lane_N holds N as a qword; used as the memory source of the cmovne lane search
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+; state stays in arg1 for the whole routine; arg2 is reused for job, len2, extra_blocks and p
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; NOTE: aliases below that map to the same register (rbx, rax, arg2) cannot hold independent values at once
+; idx needs to be in rbp (it is read again after the call to md5_x8x2_avx2)
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define tmp5 r9
+%define num_lanes_inuse r12
+%define len_upper r13
+%define idx_upper r14
+%endif
+
+; This routine and/or the called routine clobbers all GPRs
+struc STACK
+_gpr_save: resq 8 ; slots for rbx, rbp, r12-r15, plus rsi and rdi when LINUX is not defined
+_rsp_save: resq 1 ; rsp as it was on entry, before the 32-byte alignment
+endstruc
+; APPEND(a,b) pastes two tokens together, e.g. APPEND(lane_,3) -> lane_3
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state)
+; arg 1 : rdi (LINUX) or rcx (otherwise) : state -- result in rax: the flushed job, or 0 via return_null
+MKGLOBAL(flush_job_hmac_md5_avx2,function,internal)
+flush_job_hmac_md5_avx2:
+; prologue: build a 32-byte aligned STACK frame and save the registers that "return" restores
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ DBGPRINTL "---------- enter md5 flush -----------"
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5] ;; empty?
+ cmp num_lanes_inuse, 0
+ jz return_null ; no lane in use: return NULL
+
+ ; find a lane with a non-null job -- flush does not have to be efficient!
+ mov idx, 0
+ %assign I 1
+%rep 15
+ cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [rel APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+; idx = highest-numbered lane (1..15) holding a job, or 0 if none of those lanes does
+copy_lane_data:
+ ; copy good lane (idx) to empty lanes
+ mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx] ; tmp = data pointer of the good lane
+ ;; tackle lower 8 lanes
+ vmovdqa xmm0, [state + _lens_md5 + 0*16] ;; lower 8 lengths
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(lower_skip_,I)
+ mov [state + _args_data_ptr_md5 + PTR_SZ*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(lower_skip_,I):
+%assign I (I+1)
+%endrep
+ ;; tackle upper lanes
+ vmovdqa xmm1, [state + _lens_md5 + 1*16] ;; upper 8 lengths
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_md5 + (8 + I) * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(upper_skip_,I)
+ mov [state + _args_data_ptr_md5 + PTR_SZ*(8+I)], tmp
+ vpor xmm1, xmm1, [rel len_masks + 16*I]
+APPEND(upper_skip_,I):
+%assign I (I+1)
+%endrep
+ jmp start_loop0 ; jump over the alignment padding
+
+ align 32
+start_loop0:
+ ; Find min length
+ vphminposuw xmm2, xmm0 ; lower half: word 0 = min length, word 1 = its index
+ vpextrw DWORD(len2), xmm2, 0 ; min value
+ vpextrw DWORD(idx), xmm2, 1 ; min index (0...7)
+
+ vphminposuw xmm3, xmm1 ; upper half: same layout
+ vpextrw DWORD(len_upper), xmm3, 0 ; min value
+ vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
+
+ cmp len2, len_upper
+ jle use_min ; lower half wins, including ties
+
+min_in_high: ; the overall minimum is in the upper 8 lanes
+ vmovdqa xmm2, xmm3
+ mov len2, len_upper
+ mov idx, idx_upper
+ or idx, 0x8 ; to reflect that index in 8-F
+use_min:
+ and len2, len2 ; to set flags
+ jz len_is_0 ; min length is 0: skip the md5_x8x2_avx2 call (the masked lens in xmm0/xmm1 are then not written back)
+ DBGPRINTL64 "min_length min_index ", len2, idx
+ DBGPRINTL_XMM "FLUSH md5 lens before sub lower", xmm0
+ vpbroadcastw xmm2, xmm2 ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm2 ; lower lengths -= min length
+ DBGPRINTL_XMM "FLUSH md5 lens after sub lower", xmm0
+ vmovdqa [state + _lens_md5 + 0*16], xmm0
+
+ vpsubw xmm1, xmm1, xmm2 ; upper lengths -= min length
+ DBGPRINTL_XMM "FLUSH md5 lens after sub upper", xmm1
+ vmovdqa [state + _lens_md5 + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_x8x2_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data] ; lane_data = address of lane idx's entry in _ldata_md5
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra block(s) still pending for this lane
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; _outer_done already set: finish the job
+
+proc_outer: ; presumably the HMAC outer pass: hash outer_block (current digest) starting from the key^opad state
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero the 8 bytes at extra_block + size_offset
+ mov word [state + _lens_md5 + 2*idx], 1 ; length word of lane idx = 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp ; lane idx now reads its data from outer_block
+ ; gather the 4 digest words of lane idx into xmm0 and store them, unswapped, in outer_block
+ vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+ vmovdqa [lane_data + _outer_block], xmm0
+ ; load the 16 bytes behind job->_auth_key_xor_opad and scatter them into lane idx of the digest array
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+ jmp copy_lane_data ; go round again with the updated lane
+
+ align 16
+proc_extra_blocks: ; schedule the lane's pending extra block(s) as its next input
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) ; length word of lane idx = extra_blocks
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp ; data pointer = extra_block + start_offset
+ mov dword [lane_data + _extra_blocks], 0 ; mark the extra blocks as consumed
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax ; return value = NULL
+ jmp return
+
+ align 16
+end_loop: ; lane idx is finished: release the lane and write the tag
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_md5] ; rbx changes role here: lane_data is gone from this point
+ shl unused_lanes, 4
+ or unused_lanes, idx ; idx becomes the lowest nibble of the unused-lanes value
+ mov [state + _unused_lanes_md5], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5] ;; update lanes inuse
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse_md5], DWORD(num_lanes_inuse)
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+; bswap DWORD(tmp2)
+; bswap DWORD(tmp4)
+; bswap DWORD(tmp3)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(tmp5)
+
+ cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret ; NOTE(review): every length other than 12 falls through and writes the full 16 bytes
+
+ ; copy 16 bytes
+ mov DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
+ mov [p + 3*4], DWORD(tmp5)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor ymm0, ymm0 ; ymm0 = 0, source for the wipes below
+
+ ;; Clear digest (16B), outer_block (16B) and extra_block (64B)
+ ;; of returned job and NULL jobs
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest (16 bytes)
+%assign J 0
+%rep 4
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*I + J*MD5_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+
+ lea lane_data, [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size)]
+ ;; Clear first 64 bytes of extra_block
+ vmovdqa [lane_data + _extra_block], ymm0
+ vmovdqa [lane_data + _extra_block + 32], ymm0
+
+ ;; Clear first 16 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return: ; common epilogue; rax already holds the result
+ DBGPRINTL "---------- exit md5 flush -----------"
+ vzeroupper
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_md5_submit_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_md5_submit_avx2.asm
new file mode 100644
index 000000000..661ae4eba
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_md5_submit_avx2.asm
@@ -0,0 +1,373 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/memcpy.asm"
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+extern md5_x8x2_avx2
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+; reg3/reg4 are the two registers of {rcx, rdx, rdi, rsi} that are not arg1/arg2 on the selected ABI
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; NOTE: aliases below that map to the same register cannot hold independent values at once
+; idx needs to be in rbp (it is read again after the call to md5_x8x2_avx2)
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+; tmp4 shares rbx with unused_lanes: using tmp4 as scratch destroys unused_lanes (see copy_lt64)
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+%define num_lanes_inuse r12
+%define len_upper r13
+%define idx_upper r14
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs
+struc STACK
+_gpr_save: resq 8 ; slots for rbx, rbp, r12-r15, plus rsi and rdi when LINUX is not defined
+_rsp_save: resq 1 ; rsp as it was on entry, before the 32-byte alignment
+endstruc
+
+section .text
+
+; JOB* submit_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rdi (LINUX) or rcx (otherwise) : state
+; arg 2 : rsi (LINUX) or rdx (otherwise) : job -- result in rax: a completed job, or 0 via return_null
+MKGLOBAL(submit_job_hmac_md5_avx2,function,internal)
+submit_job_hmac_md5_avx2:
+; prologue: build a 32-byte aligned STACK frame and save the registers that "return" restores
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ DBGPRINTL "---------- enter md5 submit -----------"
+ mov unused_lanes, [state + _unused_lanes_md5]
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5]
+ mov lane, unused_lanes
+
+ and lane, 0xF ; lane = lowest nibble of the unused-lanes value
+ shr unused_lanes, 4 ; drop that nibble
+ mov [state + _unused_lanes_md5], unused_lanes
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse_md5], DWORD(num_lanes_inuse)
+ DBGPRINTL64 "SUBMIT ********** num_lanes_in_use", num_lanes_inuse
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data] ; lane_data = address of the chosen lane's entry in _ldata_md5
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+ DBGPRINTL64 "SUBMIT job len, num_blks ", len, tmp
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+; VPINSRW_M256 is defined outside this file; presumably it stores the word in tmp (block count) into _lens_md5[lane] -- TODO confirm
+ VPINSRW_M256 state + _lens_md5, xmm0, xmm1, last_len, p, lane, tmp, scale_x16
+
+ mov last_len, len
+ and last_len, 63 ; last_len = len mod 64
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6 ; extra_blocks = ceil((last_len + 9) / 64)
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes] ; p = first byte to hash
+ mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p
+
+ cmp len, 64
+ jb copy_lt64 ; fewer than 64 bytes: use the variable-length copy
+
+fast_copy: ; copy the last 64 bytes of the message into extra_block
+ add p, len ; p = one past the end of the message
+ vmovdqu ymm0, [p - 64 + 0 * 32]
+ vmovdqu ymm1, [p - 64 + 1 * 32]
+ vmovdqu [lane_data + _extra_block + 0*32], ymm0
+ vmovdqu [lane_data + _extra_block + 1*32], ymm1
+end_fast_copy:
+ ; size_offset = 64*extra_blocks - last_len + 56; start_offset = 64 - last_len
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len] ; tmp = 8 * (64 + len), i.e. a bit count covering one extra 64-byte block plus the message
+; bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+ ; load the 16 bytes behind job->_auth_key_xor_ipad and scatter them into this lane of the digest array
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp]
+ vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+
+ test len, ~63
+ jnz ge64_bytes ; len >= 64: keep the block count and source pointer set above
+
+lt64_bytes: ; len < 64: presumably sets _lens_md5[lane] = extra_blocks; the lane then starts directly in extra_block
+ VPINSRW_M256 state + _lens_md5, xmm0, xmm1, tmp, len2, lane, extra_blocks, scale_x16
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0 ; extra blocks are now part of the lane's length
+
+ge64_bytes:
+ DBGPRINTL64 "SUBMIT md5 all lanes loaded? ********** num_lanes_in_use", num_lanes_inuse
+ cmp num_lanes_inuse, 0x10 ; all 16 lanes loaded?
+ jne return_null ; not yet: return NULL without hashing
+ jmp start_loop ; jump over the alignment padding
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_md5]
+ vphminposuw xmm1, xmm0 ; lower half: word 0 = min length, word 1 = its index
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+
+ vmovdqa xmm2, [state + _lens_md5 + 1*16] ;; second 8 lengths
+ vphminposuw xmm3, xmm2 ; upper half: same layout
+ vpextrw DWORD(len_upper), xmm3, 0 ; min value
+ vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
+
+ cmp len2, len_upper
+ jle use_min ; lower half wins, including ties
+
+min_in_high:
+
+ vmovdqa xmm1, xmm3
+ mov len2, len_upper
+ mov idx, idx_upper ;; idx retrieved would be [0-7]
+ or idx, 0x8 ;; to reflect that index in 8-F
+
+use_min:
+
+ cmp len2, 0
+ je len_is_0 ; min length is 0: skip the md5_x8x2_avx2 call
+ DBGPRINTL64 "min_length min_index ", len2, idx
+ vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1 ; lower lengths -= min length
+ vmovdqa [state + _lens_md5 + 0*16], xmm0
+ DBGPRINTL_XMM "SUBMIT lens after sub lower", xmm0
+
+ vpsubw xmm2, xmm2, xmm1 ; upper lengths -= min length
+ vmovdqa [state + _lens_md5 + 1*16], xmm2
+ DBGPRINTL_XMM "SUBMIT lens after sub upper", xmm2
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_x8x2_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data] ; lane_data = address of lane idx's entry in _ldata_md5
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra block(s) still pending for this lane
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; _outer_done already set: finish the job
+
+proc_outer: ; presumably the HMAC outer pass: hash outer_block (current digest) starting from the key^opad state
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero the 8 bytes at extra_block + size_offset
+; presumably sets _lens_md5[idx] = 1; "job" is passed in a register slot and is only loaded after the macro -- TODO confirm it is scratch
+ VPINSRW_M256 state + _lens_md5, xmm0, xmm1, tmp, job, idx, 1, scale_x16
+
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp ; lane idx now reads its data from outer_block
+ ; gather the 4 digest words of lane idx into xmm0 and store them, unswapped, in outer_block
+ vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+ vmovdqa [lane_data + _outer_block], xmm0
+ ; load the 16 bytes behind job->_auth_key_xor_opad and scatter them into lane idx of the digest array
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+ jmp start_loop ; search for the next minimum
+
+ align 16
+proc_extra_blocks: ; schedule the lane's pending extra block(s) as its next input
+ mov DWORD(start_offset), [lane_data + _start_offset]
+; presumably sets _lens_md5[idx] = extra_blocks (macro defined outside this file)
+ VPINSRW_M256 state + _lens_md5, xmm0, xmm1, tmp, len2, idx, extra_blocks, scale_x16
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp ; data pointer = extra_block + start_offset
+ mov dword [lane_data + _extra_blocks], 0 ; mark the extra blocks as consumed
+ jmp start_loop
+
+ align 16
+
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ ;; tmp4 (rbx) is passed to the copy macro and aliases unused_lanes, so unused_lanes is reloaded afterwards
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len ; p2 = extra_block + 64 - len
+ memcpy_avx2_64_1 p2, p, len, tmp4, tmp2, ymm0, ymm1
+ mov unused_lanes, [state + _unused_lanes_md5]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax ; return value = NULL
+ jmp return
+
+ align 16
+end_loop: ; lane idx is finished: release the lane and write the tag
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes_md5]
+ mov qword [lane_data + _job_in_lane], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx ; idx becomes the lowest nibble of the unused-lanes value
+ mov [state + _unused_lanes_md5], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_md5]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse_md5], DWORD(num_lanes_inuse)
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes
+ mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+
+ cmp DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret ; NOTE(review): every length other than 12 falls through and writes the full 16 bytes
+
+ ; copy 16 bytes
+ mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
+ mov [p + 3*4], DWORD(tmp3)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (16B), outer_block (16B) and extra_block (64B) of returned job
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 0
+
+ vpxor ymm0, ymm0 ; ymm0 = 0, source for the wipes below
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_md5 + lane_data]
+ ;; Clear first 64 bytes of extra_block
+ vmovdqa [lane_data + _extra_block], ymm0
+ vmovdqa [lane_data + _extra_block + 32], ymm0
+
+ ;; Clear first 16 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+%endif
+
+return: ; common epilogue; rax already holds the result
+ DBGPRINTL "---------- exit md5 submit -----------"
+
+ vzeroupper
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_224_flush_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_224_flush_avx2.asm
new file mode 100644
index 000000000..b5d0a1b6b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_224_flush_avx2.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; SHA-224 build of the HMAC flush routine: the entry point name is supplied
+;; through FUNC and SHA224 is defined so that the %ifdef SHA224 / %ifndef SHA224
+;; paths of the included SHA-256 source are selected.
+%define FUNC flush_job_hmac_sha_224_avx2
+%define SHA224
+
+%include "avx2/mb_mgr_hmac_sha_256_flush_avx2.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_224_submit_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_224_submit_avx2.asm
new file mode 100644
index 000000000..e4b254b95
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_224_submit_avx2.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; SHA-224 build of the HMAC submit routine: the entry point name is supplied
+;; through FUNC and SHA224 is defined so that the %ifdef SHA224 / %ifndef SHA224
+;; paths of the included SHA-256 source are selected.
+%define FUNC submit_job_hmac_sha_224_avx2
+%define SHA224
+
+%include "avx2/mb_mgr_hmac_sha_256_submit_avx2.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_256_flush_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_256_flush_avx2.asm
new file mode 100644
index 000000000..f41c9329b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_256_flush_avx2.asm
@@ -0,0 +1,379 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha256_oct_avx2
+
+section .data
+default rel
+align 16
+;; vpshufb control mask: reverses the byte order inside each of the four dwords
+;; of an xmm register (used on the digest words before they go to outer_block).
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+;; Eight 16-byte masks; entry I has 0xFFFF in 16-bit word I and zero elsewhere.
+;; OR-ed into the packed lane lengths for lanes that hold no job.
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+;; Lane numbers as qword memory operands; cmovne cannot take an immediate, so
+;; the lane search loads the index from here.
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+section .text
+
+;; Default entry point name; a wrapper file may pre-define FUNC before
+;; including this source.
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_256_avx2
+%endif
+
+;; Register aliases. Several names below share one physical register, so a
+;; write through one alias destroys the value held under the others.
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rsi
+%endif
+
+;; state lives in the first argument register for the whole routine;
+;; job and len2 both reuse the second argument register.
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp, r15
+%define idx rbp
+
+;; r10: unused_lanes / tmp5
+%define unused_lanes r10
+%define tmp5 r10
+
+;; rbx: lane_data / tmp2
+%define lane_data rbx
+%define tmp2 rbx
+
+;; rax: job_rax (return value) / tmp1 / size_offset / start_offset
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define start_offset rax
+
+;; NOTE: tmp3 aliases the register that holds state.
+%define tmp3 arg1
+
+;; second argument register again: extra_blocks / p
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 reg3
+%define tmp r9
+%endif
+
+; we clobber rsi, rbp; called routine also clobbers rbx, rdi, r12, r13, r14
+;; Stack frame layout: seven qword slots for saved general-purpose registers
+;; followed by one qword holding the caller's original rsp.
+struc STACK
+_gpr_save: resq 7
+_rsp_save: resq 1
+endstruc
+
+;; Token-pasting helper: APPEND(skip_,3) expands to skip_3. Used to build
+;; per-lane label and constant names inside %rep loops.
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state)
+; arg 1 : state
+;
+; Flush: drives the lanes that still hold a job until one of them has
+; finished its HMAC (inner hash, extra blocks and outer hash), writes the
+; authentication tag to that job's output buffer and returns the job pointer
+; in rax. Returns NULL (rax = 0) when bit 35 of unused_lanes says that every
+; lane is empty.
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ ; Build a 32-byte aligned frame; rax keeps the caller's rsp until it is
+ ; stored in _rsp_save.
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*5], rsi
+ mov [rsp + _gpr_save + 8*6], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; if bit (32+3) is set, then all lanes are empty
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ ; idx starts at 0; every occupied lane 1..7 overwrites it, so the
+ ; highest-numbered occupied lane wins (lane 0 is the fallback).
+ xor idx, idx
+
+%assign I 1
+%rep 7
+ cmp qword [state + _ldata_sha256 + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ cmovne idx, [rel APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+copy_lane_data:
+ ; copy idx to empty lanes
+ ; Each lane without a job gets lane idx's data pointer and has its
+ ; 16-bit length forced to 0xFFFF through len_masks.
+ vmovdqa xmm0, [state + _lens_sha256]
+ mov tmp, [state + _args_data_ptr_sha256 + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr_sha256 + 8*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_sha256 ], xmm0
+
+ ; Pick the lane with the smallest remaining length (unsigned 16-bit
+ ; minimum across the eight words of xmm0).
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0
+
+ ; Subtract the minimum from every lane length before hashing that many
+ ; blocks on all lanes at once.
+ vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_sha256], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_oct_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ ; Dispatch on the lane's progress: pending extra blocks first, then the
+ ; outer hash, and finally completion once _outer_done is set.
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ ; Queue the outer hash: mark it started, zero the 8 bytes at
+ ; _extra_block + size_offset, give the lane one block to process and
+ ; point its data pointer at outer_block.
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens_sha256 + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+
+ ; Gather this lane's digest words (one dword per digest row), reverse
+ ; the bytes of each dword and store them as the outer block's message.
+ vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ vpshufb xmm1, xmm1, [rel byteswap]
+
+ vmovdqa [lane_data + _outer_block], xmm0
+ vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ ; SHA-224 carries only seven digest words; dword 7 of the outer block
+ ; is set to 0x80 instead of an eighth word.
+ mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+ ; Reload the lane's digest rows with the 32 bytes found at the job's
+ ; _auth_key_xor_opad pointer, then go back and hash the outer block.
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ ; Queue the lane's extra blocks: length = extra_blocks, data pointer =
+ ; _extra_block + start_offset; the counter is cleared so this branch
+ ; is taken only once per job.
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ ; Lane idx is finished: detach the job, flag it STS_COMPLETED_HMAC and
+ ; put idx back into unused_lanes as its new lowest nibble.
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; Only a tag length of exactly 14 (SHA-224) / 16 (SHA-256) selects the
+ ; truncated copy; every other length takes the full-digest copy.
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+ jne copy_full_digest
+%endif
+ ;; copy SHA224 14bytes / SHA256 16bytes
+ ;; (tmp2 and tmp5 alias lane_data and unused_lanes, which are dead here)
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp4)
+%ifdef SHA224
+ ; 12 + 2 bytes: only the low word of the byte-swapped dword is stored
+ mov [p + 3*4], WORD(tmp5)
+%else
+ mov [p + 3*4], DWORD(tmp5)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 28 bytes (SHA-224: seven words) or 32 bytes (SHA-256: eight words),
+ ;; byte-swapping each dword on the way out
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp4)
+ mov [p + 3*4], DWORD(tmp5)
+
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+%ifndef SHA224
+ bswap DWORD(tmp5)
+%endif
+ mov [p + 4*4], DWORD(tmp)
+ mov [p + 5*4], DWORD(tmp2)
+ mov [p + 6*4], DWORD(tmp4)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor ymm0, ymm0
+
+ ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B)
+ ;; of returned job and NULL jobs
+ ;; (the returned lane qualifies because its _job_in_lane was zeroed above)
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest (28 bytes for SHA-224, 32 bytes for SHA-256)
+%assign J 0
+%rep 7
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+ lea lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)]
+ ;; Clear first 64 bytes of extra_block
+ vmovdqa [lane_data + _extra_block], ymm0
+ vmovdqa [lane_data + _extra_block + 32], ymm0
+
+ ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+%ifdef SHA224
+ ;; 16 + 8 + 4 = 28 bytes
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov qword [lane_data + _outer_block + 16], 0
+ mov dword [lane_data + _outer_block + 24], 0
+%else
+ vmovdqu [lane_data + _outer_block], ymm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+ ; Common exit: clear the upper ymm halves, restore the saved registers
+ ; and the caller's stack pointer. rax holds the result.
+ vzeroupper
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*5]
+ mov rdi, [rsp + _gpr_save + 8*6]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_256_submit_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_256_submit_avx2.asm
new file mode 100644
index 000000000..46cea27bb
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_256_submit_avx2.asm
@@ -0,0 +1,426 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha256_oct_avx2
+
+section .data
+default rel
+align 16
+;; vpshufb control mask: reverses the byte order inside each of the four dwords
+;; of an xmm register (used on the digest words before they go to outer_block).
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+;; Default entry point name; a wrapper file may pre-define FUNC before
+;; including this source.
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_256_avx2
+%endif
+
+;; Register aliases. Several names below share one physical register, so a
+;; write through one alias destroys the value held under the others.
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+;; state lives in the first argument register; job and len2 share the second.
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp, r15
+;; rbp: last_len / idx
+%define last_len rbp
+%define idx rbp
+
+;; r11: p / start_offset
+%define p r11
+%define start_offset r11
+
+;; rbx: unused_lanes / p2 / tmp4
+%define unused_lanes rbx
+%define p2 rbx
+%define tmp4 rbx
+
+;; rax: job_rax (return value) / len
+%define job_rax rax
+%define len rax
+
+;; reg3: size_offset / tmp2
+%define size_offset reg3
+%define tmp2 reg3
+
+;; reg4: lane / tmp3
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+
+%define lane_data r10
+
+%endif
+
+
+; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r12, r13, r14
+;; Stack frame layout: seven qword slots for saved general-purpose registers
+;; followed by one qword holding the caller's original rsp.
+struc STACK
+_gpr_save: resq 7
+_rsp_save: resq 1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state (rdi when LINUX is defined, rcx otherwise)
+; arg 2 : job (rsi when LINUX is defined, rdx otherwise)
+;
+; Submit: takes the lane number from the low nibble of unused_lanes, stores
+; the job's data pointer, block count, trailing bytes and the 32 bytes found
+; at _auth_key_xor_ipad into that lane. If unused_lanes is then anything
+; other than 0xf the routine returns NULL without hashing; otherwise it runs
+; the lanes until one job has completed its HMAC, writes that job's tag and
+; returns the job pointer in rax.
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ ; Build a 32-byte aligned frame; rax keeps the caller's rsp until it is
+ ; stored in _rsp_save.
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*5], rsi
+ mov [rsp + _gpr_save + 8*6], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; Pop the lane number from the low nibble of unused_lanes.
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ mov lane, unused_lanes
+ and lane, 0xF ;; just a nibble
+ shr unused_lanes, 4
+
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov [state + _unused_lanes_sha256], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+
+ ; Record the number of whole 64-byte blocks as this lane's length
+ ; (extra_blocks is only a scratch operand for the macro here).
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, extra_blocks, lane, tmp, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ ; extra_blocks = (last_len + 9 + 63) / 64, where last_len = len % 64
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha256 + 8*lane], p
+
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ ; len >= 64: copy the last 64 bytes of the message into extra_block.
+ add p, len
+ vmovdqu ymm0, [p - 64 + 0 * 32]
+ vmovdqu ymm1, [p - 64 + 1 * 32]
+ vmovdqu [lane_data + _extra_block + 0*32], ymm0
+ vmovdqu [lane_data + _extra_block + 1*32], ymm1
+
+end_fast_copy:
+ ; size_offset = 64*extra_blocks - last_len + 56
+ ; start_offset = 64 - last_len
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ ; Store 8*(64 + len), byte-swapped, at _extra_block + size_offset.
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ ; Load the lane's digest rows with the 32 bytes found at the job's
+ ; _auth_key_xor_ipad pointer.
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ ; No whole block in the message: hash the extra blocks straight away
+ ; (length = extra_blocks, data pointer = _extra_block + start_offset).
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ ; Only start hashing once unused_lanes has been reduced to 0xf;
+ ; otherwise report that no job has completed yet.
+ cmp unused_lanes, 0xf
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_sha256]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0
+
+ vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens_sha256], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_oct_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ ; Dispatch on the lane's progress: pending extra blocks first, then the
+ ; outer hash, and finally completion once _outer_done is set.
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ ; Queue the outer hash: mark it started, zero the 8 bytes at
+ ; _extra_block + size_offset, give the lane one block to process and
+ ; point its data pointer at outer_block.
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+
+ ; Gather this lane's digest words (one dword per digest row), reverse
+ ; the bytes of each dword and store them as the outer block's message.
+ vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ vpshufb xmm1, xmm1, [rel byteswap]
+ vmovdqa [lane_data + _outer_block], xmm0
+ vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ ; SHA-224 carries only seven digest words; dword 7 of the outer block
+ ; is set to 0x80 instead of an eighth word.
+ mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+ ; Reload the lane's digest rows with the 32 bytes found at the job's
+ ; _auth_key_xor_opad pointer, then go back and hash the outer block.
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ ; Queue the lane's extra blocks: length = extra_blocks, data pointer =
+ ; _extra_block + start_offset; the counter is cleared so this branch
+ ; is taken only once per job.
+ mov DWORD(start_offset), [lane_data + _start_offset]
+
+ vmovdqa xmm0, [state + _lens_sha256]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha256], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_avx2_64_1 p2, p, len, tmp, tmp2, ymm0, ymm1
+ ;; p2 aliases unused_lanes (rbx), so reload the value stored earlier
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ ; Lane idx is finished: detach the job, flag it STS_COMPLETED_HMAC and
+ ; put idx back into unused_lanes as its new lowest nibble.
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; Only a tag length of exactly 14 (SHA-224) / 16 (SHA-256) selects the
+ ; truncated copy; every other length takes the full-digest copy.
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+ jne copy_full_digest
+%endif
+ ;; copy 14 bytes for SHA224 / 16 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+%ifdef SHA224
+ ; 12 + 2 bytes: only the low word of the byte-swapped dword is stored
+ mov [p + 3*4], WORD(tmp4)
+%else
+ mov [p + 3*4], DWORD(tmp4)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 28 bytes for SHA224 / 32 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+ mov [p + 3*4], DWORD(tmp4)
+
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+%ifndef SHA224
+ bswap DWORD(tmp4)
+%endif
+ mov [p + 4*4], DWORD(tmp)
+ mov [p + 5*4], DWORD(tmp2)
+ mov [p + 6*4], DWORD(tmp3)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B) of returned job
+%assign J 0
+%rep 7
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+ vpxor ymm0, ymm0
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data]
+ ;; Clear first 64 bytes of extra_block
+ vmovdqa [lane_data + _extra_block], ymm0
+ vmovdqa [lane_data + _extra_block + 32], ymm0
+
+ ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+%ifdef SHA224
+ ;; 16 + 8 + 4 = 28 bytes
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov qword [lane_data + _outer_block + 16], 0
+ mov dword [lane_data + _outer_block + 24], 0
+%else
+ vmovdqu [lane_data + _outer_block], ymm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+ ; Common exit for every path. vzeroupper is issued here rather than in
+ ; end_loop so that it also covers the return_null path (ymm0/ymm1 are
+ ; used by fast_copy / copy_lt64) and the SAFE_DATA clearing above,
+ ; which uses ymm0 after the tag has been written. It changes neither
+ ; rax nor the flags.
+ vzeroupper
+
+ mov rbx, [rsp + _gpr_save + 8*0]
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*5]
+ mov rdi, [rsp + _gpr_save + 8*6]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_384_flush_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_384_flush_avx2.asm
new file mode 100644
index 000000000..b354cdff3
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_384_flush_avx2.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; SHA-384 build of the HMAC flush routine: FUNC and SHA_X_DIGEST_SIZE are
+;; defined before the SHA-512 source is included.
+;; NOTE(review): presumably the included file selects its digest handling from
+;; SHA_X_DIGEST_SIZE - confirm in mb_mgr_hmac_sha_512_flush_avx2.asm.
+%define FUNC flush_job_hmac_sha_384_avx2
+%define SHA_X_DIGEST_SIZE 384
+
+%include "avx2/mb_mgr_hmac_sha_512_flush_avx2.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_384_submit_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_384_submit_avx2.asm
new file mode 100644
index 000000000..46cd3c54f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_384_submit_avx2.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define FUNC submit_job_hmac_sha_384_avx2
+%define SHA_X_DIGEST_SIZE 384
+
+%include "avx2/mb_mgr_hmac_sha_512_submit_avx2.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_512_flush_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_512_flush_avx2.asm
new file mode 100644
index 000000000..14a28c43a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_512_flush_avx2.asm
@@ -0,0 +1,353 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha512_x4_avx2
+
+section .data
+default rel
+
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+len_masks:
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+
+section .text
+
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_512_avx2
+%define SHA_X_DIGEST_SIZE 512
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp, r15
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%define tmp5 r9
+
+%define tmp6 r10
+
+%endif
+
+; we clobber rbx, rbp; called routine also clobbers r12
+struc STACK
+_gpr_save: resq 3
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state)
+; arg 1 : rcx (Windows) / rdi (Linux) : state
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp ; remember the caller's rsp; it is restored from _rsp_save at return
+ sub rsp, STACK_size
+ and rsp, -32 ; round the local frame down to a 32-byte boundary
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ bt unused_lanes, 32+7 ; bit 39 set -> return NULL (presumably: every lane is still on the free list)
+ jc return_null
+
+ ; find a lane with a non-null job: idx starts at 0 and is replaced by each of lanes 1..3 that holds a job, so the highest occupied lane wins
+ xor idx, idx
+%assign I 1
+%rep 3
+ cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ cmovne idx, [rel APPEND(lane_, I)]
+%assign I (I+1)
+%endrep
+
+copy_lane_data:
+ ; copy good lane (idx) to empty lanes: every lane without a job gets idx's data pointer and a length of 0xFFFF
+ vmovdqa xmm0, [state + _lens_sha512]
+ mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp
+ vpor xmm0, xmm0, [rel len_masks + 16*I] ; set 16-bit length word I to 0xFFFF so a real job's length is found first
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_sha512], xmm0
+
+ vphminposuw xmm1, xmm0 ; word 0 = smallest length, word 1 = its lane index
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ vpshuflw xmm1, xmm1, 0x00 ; replicate the min length into the four low words
+ vpsubw xmm0, xmm0, xmm1 ; every lane's remaining length -= min length
+ vmovdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2 (the min length extracted above)
+ call sha512_x4_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx": lane idx has no length left, decide which phase comes next
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = &state->ldata[idx]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra (tail/padding) blocks still have to be hashed
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop ; outer hash already done -> the job is complete
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0 ; zero the qword previously written at size_offset in extra_block
+ mov word [state + _lens_sha512 + 2*idx], 1 ; lane idx now has exactly one block to hash
+ lea tmp, [lane_data + _outer_block_sha512] ; note: tmp and size_offset are both rax; size_offset is dead here
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+
+ ; move digest into data location: gather lane idx's digest words two at a time, byte-swap each qword, store into outer_block
+ %assign I 0
+ %rep (SHA_X_DIGEST_SIZE / (8*16))
+ vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE]
+ vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ vpshufb xmm0, [rel byteswap]
+ vmovdqa [lane_data + _outer_block_sha512 + I*2*SHA512_DIGEST_WORD_SIZE], xmm0
+ %assign I (I+1)
+ %endrep
+
+ ; move the opad key into digest: 64 bytes at job->_auth_key_xor_opad become the 8 digest words of lane idx
+ mov tmp, [job + _auth_key_xor_opad]
+
+ %assign I 0
+ %rep 4
+ vmovdqu xmm0, [tmp + I * 16]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+ %assign I (I+1)
+ %endrep
+
+ jmp copy_lane_data
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+ mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) ; lane idx length = number of extra blocks
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset] ; tmp and start_offset are both rax
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; hash from inside extra_block next
+ mov dword [lane_data + _extra_blocks_sha512], 0 ; mark the extra blocks as scheduled
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax ; return value (rax) = NULL
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512] ; rax = finished job, also the return value
+ mov qword [lane_data + _job_in_lane_sha512], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha512] ; unused_lanes and lane_data are both rbx; lane_data is dead from here
+ shl unused_lanes, 8
+ or unused_lanes, idx ; put lane idx into the low byte of the unused-lane list
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest
+%endif
+
+ ;; copy 32 bytes for SHA512 / 24 bytes for SHA384 (truncated tag; each digest word is byte-swapped before the store)
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 0*8], QWORD(tmp2)
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp5)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes for SHA384 (taken for any tag length other than the truncated one tested in end_loop)
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+ bswap QWORD(tmp5)
+ mov [p + 0*8], QWORD(tmp2) ; digest words 0..3 -> output bytes 0..31
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+ mov [p + 3*8], QWORD(tmp5)
+
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp6)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 4*8], QWORD(tmp2) ; digest words 4..5 -> output bytes 32..47
+ mov [p + 5*8], QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp6)
+ mov [p + 7*8], QWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxor ymm0, ymm0
+
+ ;; For every lane that holds no job: clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B)
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size) + _job_in_lane_sha512], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest (48 bytes for SHA-384, 64 bytes for SHA-512)
+%assign J 0
+%rep 6
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 6*SHA512_DIGEST_ROW_SIZE], 0
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+ lea lane_data, [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size)]
+ ;; Clear first 128 bytes of extra_block (SHA-512 lane field, same name the rest of this routine uses)
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block_sha512 + offset], ymm0
+%assign offset (offset + 32)
+%endrep
+
+ ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block (SHA-512 lane field, as written by proc_outer)
+ vmovdqu [lane_data + _outer_block_sha512], ymm0
+%if (SHA_X_DIGEST_SIZE == 384)
+ vmovdqa [lane_data + _outer_block_sha512 + 32], xmm0
+%else
+ vmovdqu [lane_data + _outer_block_sha512 + 32], ymm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+ vzeroupper
+
+ mov rbx, [rsp + _gpr_save + 8*0] ; restore the registers saved in the prologue
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_512_submit_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_512_submit_avx2.asm
new file mode 100644
index 000000000..a7c3e249b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_sha_512_submit_avx2.asm
@@ -0,0 +1,416 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha512_x4_avx2
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+section .text
+
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_512_avx2
+%define SHA_X_DIGEST_SIZE 512
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbp, r13, r14, r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; Define stack usage
+
+; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r12
+struc STACK
+_gpr_save: resq 5
+_rsp_save: resq 1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx (Windows) / rdi (Linux) : state
+; arg 2 : rdx (Windows) / rsi (Linux) : job
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+ mov rax, rsp ; remember the caller's rsp; it is restored from _rsp_save at return
+ sub rsp, STACK_size
+ and rsp, -32 ; round the local frame down to a 32-byte boundary
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*3], rsi
+ mov [rsp + _gpr_save + 8*4], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ movzx lane, BYTE(unused_lanes) ; take the lane number from the low byte of the unused-lane list
+ shr unused_lanes, 8 ; and drop that byte from the list
+ imul lane_data, lane, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = &state->ldata[lane]
+ mov [state + _unused_lanes_sha512], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 7 ; divide by 128, len in terms of blocks
+
+ mov [lane_data + _job_in_lane_sha512], job
+ mov dword [lane_data + _outer_done_sha512], 0
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, extra_blocks, lane, tmp, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
+
+ mov last_len, len
+ and last_len, 127 ; last_len = len mod 128
+ lea extra_blocks, [last_len + 17 + 127]
+ shr extra_blocks, 7 ; extra_blocks = ceil((last_len + 17) / 128); presumably 17 = 1 pad byte + 16-byte length field
+ mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p ; lane hashes straight from the source buffer first
+
+ cmp len, 128
+ jb copy_lt128 ; shorter than one block: bytewise copy instead of the 128-byte tail copy
+
+fast_copy:
+ add p, len ; p = one past the last message byte
+ vmovdqu ymm0, [p - 128 + 0*32] ; copy the final 128 message bytes into the start of extra_block
+ vmovdqu ymm1, [p - 128 + 1*32]
+ vmovdqu ymm2, [p - 128 + 2*32]
+ vmovdqu ymm3, [p - 128 + 3*32]
+ vmovdqu [lane_data + _extra_block_sha512 + 0*32], ymm0
+ vmovdqu [lane_data + _extra_block_sha512 + 1*32], ymm1
+ vmovdqu [lane_data + _extra_block_sha512 + 2*32], ymm2
+ vmovdqu [lane_data + _extra_block_sha512 + 3*32], ymm3
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 7
+ sub size_offset, last_len
+ add size_offset, 128-8 ; size_offset = 128*extra_blocks - last_len + 120
+ mov [lane_data + _size_offset_sha512], DWORD(size_offset)
+ mov start_offset, 128
+ sub start_offset, last_len ; start_offset = 128 - last_len
+ mov [lane_data + _start_offset_sha512], DWORD(start_offset)
+
+ lea tmp, [8*128 + 8*len] ; bit count of (128 + len) bytes
+ bswap tmp ; stored byte-reversed
+ mov [lane_data + _extra_block_sha512 + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+
+%assign I 0
+%rep 4
+ vmovdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+ test len, ~127 ; any bit above the low 7 set <=> len >= 128
+ jnz ge128_bytes
+
+lt128_bytes:
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; lane hashes from inside extra_block instead of the source buffer
+ mov dword [lane_data + _extra_blocks_sha512], 0 ; extra blocks are now the lane's only data
+
+ge128_bytes:
+ cmp unused_lanes, 0xff ; anything other than 0xff left in the list -> return NULL (presumably: free lanes remain)
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_sha512]
+ vphminposuw xmm1, xmm0 ; word 0 = smallest length, word 1 = its lane index
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ cmp len2, 0
+ je len_is_0
+
+ vpshuflw xmm1, xmm1, 0x00 ; replicate the min length into the four low words
+ vpsubw xmm0, xmm0, xmm1 ; every lane's remaining length -= min length
+ vmovdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2 (the min length extracted above)
+ call sha512_x4_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx": lane idx has no length left, decide which phase comes next
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = &state->ldata[idx]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra (tail/padding) blocks still have to be hashed
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop ; outer hash already done -> the job is complete
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0 ; zero the qword previously written at size_offset in extra_block
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
+ lea tmp, [lane_data + _outer_block_sha512]
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; lane idx hashes outer_block next
+
+%assign I 0
+%rep (SHA_X_DIGEST_SIZE / (8 * 16))
+ vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE]
+ vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ vpshufb xmm0, [rel byteswap]
+ vmovdqa [lane_data + _outer_block_sha512 + I * 2 * SHA512_DIGEST_WORD_SIZE], xmm0
+%assign I (I+1)
+%endrep
+
+ mov tmp, [job + _auth_key_xor_opad] ; 64 bytes at this pointer become the 8 digest words of lane idx
+%assign I 0
+%rep 4
+ vmovdqu xmm0, [tmp + I * 16]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I+0)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+
+ vmovdqa xmm0, [state + _lens_sha512]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+ vmovdqa [state + _lens_sha512], xmm0
+
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message
+ mov dword [lane_data + _extra_blocks_sha512], 0 ; mark the extra blocks as scheduled
+ jmp start_loop
+
+ align 16
+copy_lt128:
+ ;; less than one message block of data
+ ;; copy the len message bytes so that they end at offset 128 of the SHA-512 extra block (where the 0x80 is pre-populated)
+ lea p2, [lane_data + _extra_block_sha512 + 128]
+ sub p2, len
+ memcpy_avx2_128_1 p2, p, len, tmp4, tmp2, ymm0, ymm1, ymm2, ymm3
+ mov unused_lanes, [state + _unused_lanes_sha512] ; tmp4 shares rbx with unused_lanes and was passed to the copy macro, so reload it
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax ; return value (rax) = NULL
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512] ; rax = finished job, also the return value
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ mov qword [lane_data + _job_in_lane_sha512], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 8
+ or unused_lanes, idx ; put lane idx into the low byte of the unused-lane list
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ vzeroupper
+
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest
+%endif
+ ;; copy 32 bytes for SHA512 / 24 bytes for SHA384 (truncated tag; each digest word is byte-swapped before the store)
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 0*8], QWORD(tmp)
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp4)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes for SHA384 (taken for any tag length other than the truncated one tested in end_loop)
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+ mov [p + 0*8], QWORD(tmp) ; digest words 0..3 -> output bytes 0..31
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+ mov [p + 3*8], QWORD(tmp4)
+
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 4*8], QWORD(tmp) ; digest words 4..5 -> output bytes 32..47
+ mov [p + 5*8], QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp3)
+ mov [p + 7*8], QWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job (lane idx)
+%assign J 0
+%rep 6
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE], 0
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+ vpxor ymm0, ymm0
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data]
+ ;; Clear first 128 bytes of extra_block (SHA-512 lane field, same name the rest of this routine uses)
+%assign offset 0
+%rep 4
+ vmovdqa [lane_data + _extra_block_sha512 + offset], ymm0
+%assign offset (offset + 32)
+%endrep
+
+ ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block (SHA-512 lane field, as written by proc_outer)
+ vmovdqu [lane_data + _outer_block_sha512], ymm0
+%if (SHA_X_DIGEST_SIZE == 384)
+ vmovdqa [lane_data + _outer_block_sha512 + 32], xmm0
+%else
+ vmovdqu [lane_data + _outer_block_sha512 + 32], ymm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+ mov rbx, [rsp + _gpr_save + 8*0] ; restore the registers saved in the prologue
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*3]
+ mov rdi, [rsp + _gpr_save + 8*4]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret ; NOTE(review): no vzeroupper on this path; return_null gets here after the copy code used ymm0-ymm3 - confirm this is intended
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_submit_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_submit_avx2.asm
new file mode 100644
index 000000000..92b129f74
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/mb_mgr_hmac_submit_avx2.asm
@@ -0,0 +1,369 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+%include "include/const.inc"
+
+extern sha1_x8_avx2
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rdi, rbp
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes r12
+%define tmp4 r12
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; we clobber rsi, rdi, rbp, r12; called routine clobbers also r13-r15
+struc STACK
+_gpr_save: resq 7
+_rsp_save: resq 1
+endstruc
+
+; JOB* submit_job_hmac_avx2(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx (Windows) / rdi (Linux) : state
+; arg 2 : rdx (Windows) / rsi (Linux) : job
+MKGLOBAL(submit_job_hmac_avx2,function,internal)
+submit_job_hmac_avx2:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32 ; align to 32 byte boundary
+ mov [rsp + _gpr_save + 8*0], rbp
+ mov [rsp + _gpr_save + 8*1], r12
+ mov [rsp + _gpr_save + 8*2], r13
+ mov [rsp + _gpr_save + 8*3], r14
+ mov [rsp + _gpr_save + 8*4], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*5], rsi
+ mov [rsp + _gpr_save + 8*6], rdi
+%endif
+ mov [rsp + _rsp_save], rax
+ DBGPRINTL "---------- enter sha1 submit -----------"
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF ;; just a nibble
+ shr unused_lanes, 4
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, extra_blocks, lane, tmp, scale_x16
+ vmovdqa [state + _lens], xmm0
+
+ mov last_len, len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr + PTR_SZ*lane], p
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ add p, len
+ vmovdqu ymm0, [p - 64 + 0 * 32]
+ vmovdqu ymm1, [p - 64 + 1 * 32]
+ vmovdqu [lane_data + _extra_block + 0*32], ymm0
+ vmovdqu [lane_data + _extra_block + 1*32], ymm1
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+ vmovdqa [state + _lens], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp unused_lanes, 0xf
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+ DBGPRINTL64 "min_length", len2
+ DBGPRINTL64 "min_length index ", idx
+ cmp len2, 0
+ je len_is_0
+
+ vpbroadcastw xmm1, xmm1
+ DBGPRINTL_XMM "SUBMIT lens after shuffle", xmm1
+
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens], xmm0
+ DBGPRINTL_XMM "lengths after subtraction", xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+ vmovdqa [state + _lens], xmm0
+
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+ vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+
+ vmovdqa xmm0, [state + _lens]
+ XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+ vmovdqa [state + _lens], xmm0
+
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_avx2_64_1 p2, p, len, tmp4, tmp2, ymm0, ymm1
+ mov unused_lanes, [state + _unused_lanes]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output]
+
+ vzeroupper
+
+ ; copy 12 bytes
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+ mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3)
+
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret
+
+ ;; copy remaining 8 bytes to return 20 byte digest
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (20B), outer_block (20B) and extra_block (64B) of returned job
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+ vpxor ymm0, ymm0
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ ;; Clear first 64 bytes of extra_block
+ vmovdqa [lane_data + _extra_block], ymm0
+ vmovdqa [lane_data + _extra_block + 32], ymm0
+
+ ;; Clear first 20 bytes of outer_block
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov dword [lane_data + _outer_block + 16], 0
+%endif
+
+return:
+ DBGPRINTL "---------- exit sha1 submit -----------"
+ mov rbp, [rsp + _gpr_save + 8*0]
+ mov r12, [rsp + _gpr_save + 8*1]
+ mov r13, [rsp + _gpr_save + 8*2]
+ mov r14, [rsp + _gpr_save + 8*3]
+ mov r15, [rsp + _gpr_save + 8*4]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*5]
+ mov rdi, [rsp + _gpr_save + 8*6]
+%endif
+ mov rsp, [rsp + _rsp_save]
+
+ ret
+
+;; LINUX builds only: emit an empty .note.GNU-stack section. This is the
+;; conventional ELF marker telling the linker that this object does not
+;; need an executable stack.
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/md5_x8x2_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/md5_x8x2_avx2.asm
new file mode 100644
index 000000000..6d6830a99
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/md5_x8x2_avx2.asm
@@ -0,0 +1,820 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute double octal MD5 using AVX2
+
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers ymm0-15
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/transpose_avx2.asm"
+
+section .data
+default rel
+;; MD5_TABLE: the 64 per-step additive constants of the MD5 rounds.
+;; Every 32-bit constant is stored 8 times in a row (32 bytes, i.e. one full
+;; YMM register), so step i can add its constant to all 8 dword lanes with a
+;; single memory operand at [MD5_TABLE + i*32].
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+;; ONES: 256-bit all-ones mask. MAGIC_I XORs a register with it to obtain
+;; the bitwise complement of that register.
+ONES: dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+ dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+
+section .text
+
+;; Platform register mapping.
+;;   arg1/arg2 : first and second integer argument registers
+;;               (rdi/rsi when LINUX is defined, rcx/rdx otherwise)
+;;   reg3/reg4 : two further general-purpose registers, chosen per platform
+;;               so that they never collide with arg1/arg2
+%ifdef LINUX
+        %define arg1    rdi
+        %define arg2    rsi
+        %define reg3    rcx
+        %define reg4    rdx
+%else
+        %define arg1    rcx
+        %define arg2    rdx
+        %define reg3    rdi
+        %define reg4    rsi
+%endif
+
+;; rbp is not clobbered
+
+;; state    - arg 1: pointer to the MD5_ARGS structure
+;; num_blks - arg 2: number of 64-byte blocks to process (>= 1)
+%define state arg1
+%define num_blks arg2
+
+;; Input data pointers for the group of 8 lanes currently being fetched.
+;; They are loaded from state + _data_ptr_md5, first for lanes 0-7 and then
+;; reloaded for lanes 8-15.
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+;; These are pointers to data block1 and block2 in the stack
+;; which will ping pong back and forth.
+;; On entry DPTR1 = rsp and DPTR2 = rsp + 32*32 (one transposed block of
+;; 16 lanes occupies 32 YMM-sized slots).
+%define DPTR1 rbx
+%define DPTR2 reg3
+
+;; TBL holds the address of MD5_TABLE.
+;; IDX is the byte offset into every lane's input stream; it starts at 0
+;; and advances by 64 per block.
+%define TBL rax
+%define IDX reg4
+
+;; Transposed Digest Storage
+;; Y_A..Y_D hold the digest words of lanes 0-7, Y_A2..Y_D2 those of
+;; lanes 8-15 (loaded from the digest rows at offsets +0 and +32).
+%define Y_A ymm0
+%define Y_B ymm1
+%define Y_C ymm2
+%define Y_D ymm3
+%define Y_A2 ymm4
+%define Y_B2 ymm5
+%define Y_C2 ymm6
+%define Y_D2 ymm7
+
+;; Temp YMM registers corresponding to the Temp XMM registers
+;; used during the transposition of the digests
+;; NOTE(review): not referenced in the visible part of this file - confirm
+;; whether these two names are still needed.
+%define Y_KTMP1 ymm12
+%define Y_KTMP2 ymm13
+;; Temporary registers used during MD5 round operations
+%define Y_FUN ymm8
+%define Y_TMP ymm9
+%define Y_FUN2 ymm10
+%define Y_TMP2 ymm11
+
+;; YMM registers used during data fetching.
+;; Data are stored into the stack after transposition
+;; Y_DAT0-Y_DAT3 alias the round temporaries above (ymm8-ymm11), so a data
+;; fetch/transpose may only be placed between MD5 steps, never inside one.
+%define Y_DAT0 ymm8
+%define Y_DAT1 ymm9
+%define Y_DAT2 ymm10
+%define Y_DAT3 ymm11
+%define Y_DAT4 ymm12
+%define Y_DAT5 ymm13
+%define Y_DAT6 ymm14
+%define Y_DAT7 ymm15
+
+;; Temporary registers used during data transposition
+;; These alias Y_A and Y_B (ymm0/ymm1): the round code spills Y_A and Y_B
+;; to _TMPDIGEST before each in-loop transpose and reloads them afterwards.
+%define Y_DTMP1 ymm0
+%define Y_DTMP2 ymm1
+
+
+;; RESY n : reserve n YMM-sized (32-byte) slots inside a struc
+%define RESY resb 32*
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESIZE mod 32 must be 32-8 = 24
+;; (the call pushes an 8-byte return address, so rsp mod 32 == 24 on entry;
+;; subtracting a frame whose size mod 32 is also 24 makes rsp 32-byte
+;; aligned again, which the aligned vmovdqa accesses to this frame need).
+;; Frame size: (64 + 8 + 2)*32 + 24 = 2392 bytes, and 2392 mod 32 = 24.
+struc STACK
+_DATA: RESY 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: RESY 8 ; stores Y_AA-Y_DD, Y_AA2-Y_DD2
+_TMPDIGEST: RESY 2 ; stores Y_A, Y_B temporarily (across a transpose)
+ resb 24 ; align
+endstruc
+
+
+;; Stack slots holding the digest values saved at the top of every block
+;; iteration. These expand to address expressions and are meant to be used
+;; inside brackets, e.g. [Y_AA].
+%define Y_AA rsp + _DIGEST + 32*0
+%define Y_BB rsp + _DIGEST + 32*1
+%define Y_CC rsp + _DIGEST + 32*2
+%define Y_DD rsp + _DIGEST + 32*3
+%define Y_AA2 rsp + _DIGEST + 32*4
+%define Y_BB2 rsp + _DIGEST + 32*5
+%define Y_CC2 rsp + _DIGEST + 32*6
+%define Y_DD2 rsp + _DIGEST + 32*7
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+;; Naming: rotRS is the rotate count used in round R (1 = MAGIC_F,
+;; 2 = MAGIC_G, 3 = MAGIC_H, 4 = MAGIC_I) at position S (1..4) of that
+;; round's repeating four-step cycle.
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+;; MAGIC_F dst, X, Y, Z
+;;      dst = Z ^ (X & (Y ^ Z))
+;; dst is written by the first instruction while X and Z are still read
+;; afterwards, so dst must not be the same register as X or Z.
+%macro MAGIC_F 4
+        vpxor   %1, %4, %3              ; dst = Z ^ Y
+        vpand   %1, %1, %2              ; dst = X & (Y ^ Z)
+        vpxor   %1, %1, %4              ; dst = Z ^ (X & (Y ^ Z))
+%endmacro
+
+;; MAGIC_G dst, X, Y, Z
+;;      dst = MAGIC_F(Z, X, Y) = Y ^ (Z & (X ^ Y))
+;; Implemented by forwarding to MAGIC_F with the inputs rotated.
+%macro MAGIC_G 4
+        MAGIC_F %1, %4, %2, %3
+%endmacro
+
+;; MAGIC_H dst, X, Y, Z
+;;      dst = X ^ Y ^ Z
+;; dst is written before X is read, so dst must not be the same register
+;; as X.
+%macro MAGIC_H 4
+        vpxor   %1, %4, %3              ; dst = Z ^ Y
+        vpxor   %1, %1, %2              ; dst = X ^ Y ^ Z
+%endmacro
+
+;; MAGIC_I dst, X, Y, Z
+;;      dst = Y ^ (X | ~Z)
+;; The complement of Z is formed by XOR-ing it with the all-ones constant
+;; ONES. dst is written first, so it must not be the same register as
+;; X or Y.
+%macro MAGIC_I 4
+        vpxor   %1, %4, [rel ONES]      ; dst = ~Z
+        vpor    %1, %1, %2              ; dst = X | ~Z
+        vpxor   %1, %1, %3              ; dst = Y ^ (X | ~Z)
+%endmacro
+
+;; PROLD reg, imm, tmp
+;;      Rotate every 32-bit element of reg left by imm bits, in place.
+;;      tmp is overwritten with the bits that wrap around.
+%macro PROLD 3
+        vpsrld  %3, %1, (32-%2)         ; tmp = bits shifted out at the top
+        vpslld  %1, %1, %2              ; reg = remaining bits, moved up
+        vpor    %1, %1, %3              ; reg = both halves merged
+%endmacro
+
+;;
+;; single MD5 step, applied to both groups of 8 lanes at once
+;;
+;;      A  = B  + ROL32((A  + MAGIC(B, C, D)    + data  + const), nrot)
+;;      A2 = B2 + ROL32((A2 + MAGIC(B2, C2, D2) + data2 + const), nrot)
+;;
+;; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2,
+;;                data, MD5const, nrot
+;;
+;;   MAGIC_FUN     : one of MAGIC_F / MAGIC_G / MAGIC_H / MAGIC_I
+;;   A..D, A2..D2  : working digest registers of the two lane groups
+;;   FUN/TMP,
+;;   FUN2/TMP2     : scratch registers (overwritten)
+;;   data          : address expression WITHOUT brackets; the second lane
+;;                   group reads its word from data + 16*32
+;;   MD5const      : bracketed memory operand holding the step constant
+;;   nrot          : rotate count
+;;
+;; The instructions of the two lane groups are deliberately interleaved.
+%macro MD5_STEP 16
+%define %%round_fn %1
+%define %%a1 %2
+%define %%b1 %3
+%define %%c1 %4
+%define %%d1 %5
+%define %%a2 %6
+%define %%b2 %7
+%define %%c2 %8
+%define %%d2 %9
+%define %%f1 %10
+%define %%t1 %11
+%define %%f2 %12
+%define %%t2 %13
+%define %%msg %14
+%define %%konst %15
+%define %%rol %16
+
+        ; a += const
+        vpaddd  %%a1, %%a1, %%konst
+        vpaddd  %%a2, %%a2, %%konst
+        ; a += message word (second lane group lives 16 slots further on)
+        vpaddd  %%a1, %%a1, [%%msg]
+        vpaddd  %%a2, %%a2, [%%msg + 16*32]
+        ; f = MAGIC(b, c, d)
+        %%round_fn %%f1, %%b1,%%c1,%%d1
+        %%round_fn %%f2, %%b2,%%c2,%%d2
+        ; a += f
+        vpaddd  %%a1, %%a1, %%f1
+        vpaddd  %%a2, %%a2, %%f2
+        ; a = rol32(a, nrot)
+        PROLD   %%a1,%%rol, %%t1
+        PROLD   %%a2,%%rol, %%t2
+        ; a += b
+        vpaddd  %%a1, %%a1, %%b1
+        vpaddd  %%a2, %%a2, %%b2
+%endmacro
+
+align 32
+
+; void md5_x8x2_avx2(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+
+MKGLOBAL(md5_x8x2_avx2,function,internal)
+md5_x8x2_avx2:
+ sub rsp, STACK_size
+
+ mov DPTR1, rsp
+ lea DPTR2, [rsp + 32*32]
+
+ ;; Load MD5 constant pointer to register
+ lea TBL, [rel MD5_TABLE]
+
+ ; Initialize index for data retrieval
+ xor IDX, IDX
+
+ ;; Fetch Pointers to Data Stream 1 to 8
+ mov inp0,[state + _data_ptr_md5+0*PTR_SZ]
+ mov inp1,[state + _data_ptr_md5+1*PTR_SZ]
+ mov inp2,[state + _data_ptr_md5+2*PTR_SZ]
+ mov inp3,[state + _data_ptr_md5+3*PTR_SZ]
+ mov inp4,[state + _data_ptr_md5+4*PTR_SZ]
+ mov inp5,[state + _data_ptr_md5+5*PTR_SZ]
+ mov inp6,[state + _data_ptr_md5+6*PTR_SZ]
+ mov inp7,[state + _data_ptr_md5+7*PTR_SZ]
+
+%assign I 0
+%rep 2
+ TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
+
+ TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR1+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR1+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR1+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR1+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR1+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR1+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR1+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR1+_DATA+(I*8+7)*32],Y_DAT7
+
+%assign I (I+1)
+%endrep
+
+ ;; Fetch Pointers to Data Stream 9 to 16
+ mov inp0,[state + _data_ptr_md5 + 8*8]
+ mov inp1,[state + _data_ptr_md5 + 9*8]
+ mov inp2,[state + _data_ptr_md5 + 10*8]
+ mov inp3,[state + _data_ptr_md5 + 11*8]
+ mov inp4,[state + _data_ptr_md5 + 12*8]
+ mov inp5,[state + _data_ptr_md5 + 13*8]
+ mov inp6,[state + _data_ptr_md5 + 14*8]
+ mov inp7,[state + _data_ptr_md5 + 15*8]
+
+%assign I 0
+%rep 2
+ TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
+
+ TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+%assign I (I+1)
+%endrep
+ ;; digests are already transposed
+ vmovdqu Y_A,[state + 0 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_B,[state + 1 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_C,[state + 2 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_D,[state + 3 * MD5_DIGEST_ROW_SIZE ]
+
+ ; Load the digest for each stream (9-16)
+ vmovdqu Y_A2,[state + 0 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_B2,[state + 1 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_C2,[state + 2 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_D2,[state + 3 * MD5_DIGEST_ROW_SIZE + 32]
+
+lloop:
+
+ ; save old digests to stack
+ vmovdqa [Y_AA], Y_A
+ vmovdqa [Y_BB], Y_B
+ vmovdqa [Y_CC], Y_C
+ vmovdqa [Y_DD], Y_D
+
+ vmovdqa [Y_AA2], Y_A2
+ vmovdqa [Y_BB2], Y_B2
+ vmovdqa [Y_CC2], Y_C2
+ vmovdqa [Y_DD2], Y_D2
+
+ ;; Increment IDX to point to next data block (64 bytes per block)
+ add IDX, 64
+
+ ;; Update size of remaining blocks to process
+ sub num_blks, 1
+ je lastblock
+
+ ; Perform the 64 rounds of processing ...
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
+
+
+ ;; Fetch Pointers to Data Stream 1 to 8 ??
+ mov inp0,[state + _data_ptr_md5 + 0*8]
+ mov inp1,[state + _data_ptr_md5 + 1*8]
+ mov inp2,[state + _data_ptr_md5 + 2*8]
+ mov inp3,[state + _data_ptr_md5 + 3*8]
+ mov inp4,[state + _data_ptr_md5 + 4*8]
+ mov inp5,[state + _data_ptr_md5 + 5*8]
+ mov inp6,[state + _data_ptr_md5 + 6*8]
+ mov inp7,[state + _data_ptr_md5 + 7*8]
+
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
+
+%assign I 0
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
+
+ TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
+
+%assign I (I+1)
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
+
+ TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
+
+ ;; Fetch Pointers to Data Stream 9 to 16
+ mov inp0,[state + _data_ptr_md5 + 8*8]
+ mov inp1,[state + _data_ptr_md5 + 9*8]
+ mov inp2,[state + _data_ptr_md5 + 10*8]
+ mov inp3,[state + _data_ptr_md5 + 11*8]
+ mov inp4,[state + _data_ptr_md5 + 12*8]
+ mov inp5,[state + _data_ptr_md5 + 13*8]
+ mov inp6,[state + _data_ptr_md5 + 14*8]
+ mov inp7,[state + _data_ptr_md5 + 15*8]
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
+
+%assign I 0
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
+
+ TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
+
+%assign I (I+1)
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ TRANSPOSE8_U32_LOAD8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, IDX+I*32
+
+ TRANSPOSE8_U32 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ ; Add results to old digest values
+
+ vpaddd Y_A,Y_A,[Y_AA]
+ vpaddd Y_B,Y_B,[Y_BB]
+ vpaddd Y_C,Y_C,[Y_CC]
+ vpaddd Y_D,Y_D,[Y_DD]
+
+ vpaddd Y_A2,Y_A2,[Y_AA2]
+ vpaddd Y_B2,Y_B2,[Y_BB2]
+ vpaddd Y_C2,Y_C2,[Y_CC2]
+ vpaddd Y_D2,Y_D2,[Y_DD2]
+
+ ; Swap DPTR1 and DPTR2
+ xchg DPTR1, DPTR2
+
+ ;; Proceed to processing of next block
+ jmp lloop
+
+lastblock:
+
+ ; Perform the 64 rounds of processing ...
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
+
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
+
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
+
+ ;; advance all 16 lane data pointers by IDX and write them back to state
+%assign I 0
+%rep 8
+ mov inp0, [state + _data_ptr_md5 + (2*I)*8]
+ mov inp1, [state + _data_ptr_md5 + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [state + _data_ptr_md5 + (2*I)*8], inp0
+ mov [state + _data_ptr_md5 + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ vpaddd Y_A,Y_A,[Y_AA]
+ vpaddd Y_B,Y_B,[Y_BB]
+ vpaddd Y_C,Y_C,[Y_CC]
+ vpaddd Y_D,Y_D,[Y_DD]
+
+ vpaddd Y_A2,Y_A2,[Y_AA2]
+ vpaddd Y_B2,Y_B2,[Y_BB2]
+ vpaddd Y_C2,Y_C2,[Y_CC2]
+ vpaddd Y_D2,Y_D2,[Y_DD2]
+
+ vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE ],Y_A
+ vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE ],Y_B
+ vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE ],Y_C
+ vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE ],Y_D
+
+
+ vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32 ],Y_A2 ;; 32 is YMM width
+ vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32 ],Y_B2
+ vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32 ],Y_C2
+ vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32 ],Y_D2
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ ;; Clear stack frame ((64+8+2)*32 bytes)
+%ifdef SAFE_DATA
+ vpxor ymm0, ymm0
+%assign i 0
+%rep (2*2*16+8+2)
+ vmovdqa [rsp + i*32], ymm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, STACK_size
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm
new file mode 100644
index 000000000..d614e1b0e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/sha1_x8_avx2.asm
@@ -0,0 +1,466 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rbx rcx rsi rdi rbp
+;;
+;; Linux clobbers: rax rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rbx rcx rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/transpose_avx2.asm"
+
+section .data
+default rel
+align 32 ; every table below is exactly 32 bytes and is read with vmovdqa (aligned 32-byte load)
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 (same value in both 128-bit lanes)
+ ; vpshufb control mask: reverses the byte order inside every 32-bit word
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 (same value in both 128-bit lanes)
+ ; round constant loaded into K for steps 0-19, replicated into all eight 32-bit lanes
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 (same value in both 128-bit lanes)
+ ; round constant loaded into K for steps 20-39, replicated into all eight 32-bit lanes
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC (same value in both 128-bit lanes)
+ ; round constant loaded into K for steps 40-59, replicated into all eight 32-bit lanes
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 (same value in both 128-bit lanes)
+ ; round constant loaded into K for steps 60-79, replicated into all eight 32-bit lanes
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+section .text
+; arg1/arg2 are the first two integer argument registers of the target OS; reg3 is used below as inp7
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 r8
+%endif
+; state: base address for the digest rows and the _data_ptr_sha1 pointer array; num_blks: loop counter
+%define state arg1
+%define num_blks arg2
+; inp0..inp7: one input data pointer per lane, loaded from state + _data_ptr_sha1 + i*PTR_SZ
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 reg3
+; IDX: running byte offset applied to all eight input pointers
+%define IDX rax
+; Register map: first name column is used while loading input, second column during the rounds
+; ymm0 A
+; ymm1 B
+; ymm2 C
+; ymm3 D
+; ymm4 E
+; ymm5 F AA
+; ymm6 T0 BB
+; ymm7 T1 CC
+; ymm8 T2 DD
+; ymm9 T3 EE
+; ymm10 T4 TMP
+; ymm11 T5 FUN
+; ymm12 T6 K
+; ymm13 T7 W14
+; ymm14 T8 W15
+; ymm15 T9 W16
+; A..E: working digest values
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+; F holds the byte-flip mask and T0-T9 are transpose operands/scratch while the input is loaded
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+; The same registers renamed for the rounds: saved digest (AA-EE), temporaries, round constant, message words
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+;; Frame layout: 16 slots of 32 bytes (addressed as [rsp + i*32]) plus 24 bytes of alignment padding
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+%define FRAMESZ 32*16 + 24
+
+%define VMOVPS vmovups
+
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ ; per bit: regF = B ? C : D; regB/regC/regD are left unchanged, regT is accepted but not used
+ vpxor %%regF, %%regC,%%regD ; regF = C ^ D
+ vpand %%regF, %%regF,%%regB ; regF = B & (C ^ D)
+ vpxor %%regF, %%regF,%%regD ; regF = D ^ (B & (C ^ D))
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ ; regF = B ^ C ^ D; regB/regC/regD are left unchanged, regT is accepted but not used
+ vpxor %%regF,%%regD,%%regC ; regF = D ^ C
+ vpxor %%regF,%%regF,%%regB ; regF = B ^ C ^ D
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ ; bitwise majority of B, C, D, computed in the equivalent form ((B | C) & D) | (B & C)
+ ; regT is overwritten as scratch; regB/regC/regD are left unchanged
+ vpor %%regF,%%regB,%%regC ; regF = B | C
+ vpand %%regT,%%regB,%%regC ; regT = B & C
+ vpand %%regF,%%regF,%%regD ; regF = (B | C) & D
+ vpor %%regF,%%regF,%%regT ; regF = ((B | C) & D) | (B & C)
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D), forwards unchanged to MAGIC_F1
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp -- rotate every 32-bit lane of reg left by imm bits, in place
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ ; tmp is overwritten (it receives the bits that wrap around)
+ vpsrld %%tmp, %%reg, (32-%%imm) ; tmp = reg >> (32 - imm)
+ vpslld %%reg, %%reg, %%imm ; reg = reg << imm
+ vpor %%reg, %%reg, %%tmp ; reg = rotate-left(reg, imm)
+%endmacro
+
+; PROLD_nd reg, imm, tmp, src -- non-destructive rotate: reg = src rotated left by imm bits per 32-bit lane
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ ; src is only read; tmp is overwritten (it receives the bits that wrap around)
+ vpsrld %%tmp, %%src, (32-%%imm) ; tmp = src >> (32 - imm)
+ vpslld %%reg, %%src, %%imm ; reg = src << imm
+ vpor %%reg, %%reg, %%tmp ; reg = rotate-left(src, imm)
+%endmacro
+; SHA1_STEP_00_15 a,b,c,d,e, t,f, w_index, k, MAGIC -- one step using message word w_index already on the stack
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT ; e += round constant (callers pass the K register here)
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 32)] ; e += W[memW], read from its 32-byte stack slot
+ ; regT = rotate-left(a, 5); regF is only scratch for the rotate at this point
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT ; e += rotate-left(a, 5)
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT ; b = rotate-left(b, 30), in place
+ vpaddd %%regE, %%regE,%%regF ; e += FUN
+%endmacro
+; SHA1_STEP_16_79: as SHA1_STEP_00_15, but first derives W[memW]; expects W16 = W[memW-16], W15 = W[memW-15] on entry
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT ; e += round constant (callers pass the K register here)
+ ; message schedule: W[memW] = rotate-left(W[memW-16] ^ W[memW-14] ^ W[memW-8] ^ W[memW-3], 1); slots wrap modulo 16
+ vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32]
+
+ ; rotate the xor result left by 1 into regF; the register named W16 is left holding only the shifted half
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+ ; after the rename W16/W15 name W[memW-15]/W[memW-14], as the next step expects; W14 names the stale register
+ vmovdqa [rsp + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF ; e += W[memW]
+
+ ; regT = rotate-left(a, 5); regF (already stored and added) is reused as rotate scratch
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT ; e += rotate-left(a, 5)
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT ; b = rotate-left(b, 30), in place
+ vpaddd %%regE,%%regE,%%regF ; e += FUN
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; ROTATE_ARGS: assemble-time renaming, emits no instructions; new A = old E, B = old A, C = old B, D = old C, E = old D
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+; ROTATE_W: assemble-time renaming, emits no instructions; new W16 = old W15, W15 = old W14, W14 = old W16
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+align 32
+
+; void sha1_x8_avx2(void *state, int num_blks)
+; arg 1 : state (rdi on Linux, rcx on Windows) : digest rows at state + n*SHA1_DIGEST_ROW_SIZE, 8 data pointers at state + _data_ptr_sha1
+; arg 2 : num_blks (rsi on Linux, rdx on Windows) : size (in blocks of 64 bytes per lane) ;; assumed to be >= 1
+MKGLOBAL(sha1_x8_avx2,function,internal)
+sha1_x8_avx2:
+ sub rsp, FRAMESZ ; reserve the frame; [rsp + i*32], i = 0..15, holds the message words
+
+ ;; Load the five transposed digest rows A..E (32 bytes each) from state
+ vmovdqu A, [state + 0*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu B, [state + 1*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu C, [state + 2*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu D, [state + 3*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu E, [state + 4*SHA1_DIGEST_ROW_SIZE]
+ DBGPRINTL_YMM "Sha1-AVX2 incoming transposed digest", A, B, C, D, E
+ ;; NOTE: inp3..inp6 are r12..r15 and are overwritten without being saved (listed as clobbers in the file header)
+ ;; fetch the 8 per-lane input pointers; the data itself is transposed onto the stack inside the loop
+ mov inp0,[state+_data_ptr_sha1+0*PTR_SZ]
+ mov inp1,[state+_data_ptr_sha1+1*PTR_SZ]
+ mov inp2,[state+_data_ptr_sha1+2*PTR_SZ]
+ mov inp3,[state+_data_ptr_sha1+3*PTR_SZ]
+ mov inp4,[state+_data_ptr_sha1+4*PTR_SZ]
+ mov inp5,[state+_data_ptr_sha1+5*PTR_SZ]
+ mov inp6,[state+_data_ptr_sha1+6*PTR_SZ]
+ mov inp7,[state+_data_ptr_sha1+7*PTR_SZ]
+
+ xor IDX, IDX ; IDX = byte offset into every input buffer, starts at 0
+lloop:
+ vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK] ; reloaded for every block because F shares ymm5 with AA
+%assign I 0
+%rep 2
+ TRANSPOSE8_U32_LOAD8 T0, T1, T2, T3, T4, T5, T6, T7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+
+ TRANSPOSE8_U32 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ DBGPRINTL_YMM "Sha1-AVX2 incoming transposed input", T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ vpshufb T0, T0, F ; byte-swap each 32-bit word, then park the row in stack slot I*8+0
+ vmovdqa [rsp+(I*8+0)*32],T0
+ vpshufb T1, T1, F
+ vmovdqa [rsp+(I*8+1)*32],T1
+ vpshufb T2, T2, F
+ vmovdqa [rsp+(I*8+2)*32],T2
+ vpshufb T3, T3, F
+ vmovdqa [rsp+(I*8+3)*32],T3
+ vpshufb T4, T4, F
+ vmovdqa [rsp+(I*8+4)*32],T4
+ vpshufb T5, T5, F
+ vmovdqa [rsp+(I*8+5)*32],T5
+ vpshufb T6, T6, F
+ vmovdqa [rsp+(I*8+6)*32],T6
+ vpshufb T7, T7, F
+ vmovdqa [rsp+(I*8+7)*32],T7
+ add IDX, 32 ; each pass advances the offset by 32 bytes, so one block is 64 bytes per lane
+%assign I (I+1)
+%endrep
+
+
+ ; save old digests (AA-EE reuse the registers of F and T0-T3, which are no longer needed)
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [rel K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [rsp + ((16 - 16) & 15) * 32] ; prime the schedule window: W16 = W[0]
+ vmovdqa W15, [rsp + ((16 - 15) & 15) * 32] ; W15 = W[1]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqa K, [rel K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqa K, [rel K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqa K, [rel K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+ ; ROTATE_ARGS ran 80 times (a multiple of 5), so A..E name their original registers again; add the saved digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+ ; NOTE(review): the full 64-bit num_blks register is decremented although the prototype says int - confirm callers pass a clean 64-bit count
+ sub num_blks, 1
+ jne lloop ; process the next block while num_blks != 0
+
+ ; write out digests
+ vmovdqu [state + 0*SHA1_DIGEST_ROW_SIZE], A
+ vmovdqu [state + 1*SHA1_DIGEST_ROW_SIZE], B
+ vmovdqu [state + 2*SHA1_DIGEST_ROW_SIZE], C
+ vmovdqu [state + 3*SHA1_DIGEST_ROW_SIZE], D
+ vmovdqu [state + 4*SHA1_DIGEST_ROW_SIZE], E
+ DBGPRINTL_YMM "Sha1-AVX2 outgoing transposed digest", A, B, C, D, E
+ ;; update input pointers: IDX now holds the total byte offset accumulated over all blocks
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [state+_data_ptr_sha1+0*PTR_SZ], inp0
+ mov [state+_data_ptr_sha1+1*PTR_SZ], inp1
+ mov [state+_data_ptr_sha1+2*PTR_SZ], inp2
+ mov [state+_data_ptr_sha1+3*PTR_SZ], inp3
+ mov [state+_data_ptr_sha1+4*PTR_SZ], inp4
+ mov [state+_data_ptr_sha1+5*PTR_SZ], inp5
+ mov [state+_data_ptr_sha1+6*PTR_SZ], inp6
+ mov [state+_data_ptr_sha1+7*PTR_SZ], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ ;; Clear stack frame (16*32 bytes) so no message words are left behind; only done in SAFE_DATA builds
+%ifdef SAFE_DATA
+ vpxor ymm0, ymm0 ; ymm0 can be zeroed here: the digests were already written out above
+%assign i 0
+%rep 16
+ vmovdqa [rsp + i*32], ymm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, FRAMESZ ; release the frame reserved on entry
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/sha256_oct_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/sha256_oct_avx2.asm
new file mode 100644
index 000000000..08361609d
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/sha256_oct_avx2.asm
@@ -0,0 +1,587 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute oct (8-lane) SHA256 using AVX2 (256-bit YMM registers)
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14
+;; Windows preserves: rcx rbp r15
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14
+;; Linux preserves: rdi rbp r15
+;;
+;; clobbers ymm0-15
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%include "mb_mgr_datastruct.asm"
+%include "include/transpose_avx2.asm"
+
+section .data
+default rel
+align 64
+;global K256_8
+K256_8:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+align 64
+MKGLOBAL(K256,data,internal)
+K256:
+ dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+section .text
+
+%ifdef LINUX
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rsi
+ %define reg4 rdi
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL reg3
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r8
+%define inp7 reg4
+
+; ymm0 a
+; ymm1 b
+; ymm2 c
+; ymm3 d
+; ymm4 e
+; ymm5 f
+; ymm6 g TMP0
+; ymm7 h TMP1
+; ymm8 T1 TT0
+; ymm9 TT1
+; ymm10 TT2
+; ymm11 TT3
+; ymm12 a0 TT4
+; ymm13 a1 TT5
+; ymm14 a2 TT6
+; ymm15 TMP TT7
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define T1 ymm8
+
+%define a0 ymm12
+%define a1 ymm13
+%define a2 ymm14
+%define TMP ymm15
+
+%define TMP0 ymm6
+%define TMP1 ymm7
+
+%define TT0 ymm8
+%define TT1 ymm9
+%define TT2 ymm10
+%define TT3 ymm11
+%define TT4 ymm12
+%define TT5 ymm13
+%define TT6 ymm14
+%define TT7 ymm15
+
+%define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 64*SZ8
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call; the call itself then pushes an 8-byte return address
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24, so that rsp is 32-byte aligned again after "sub rsp, FRAMESZ"
+struc stack_frame
+ .data resb 16*SZ8 ; 16 slots of SZ8 bytes at rsp+0: the round macros keep the message words here, indexed by (round & 0xf)
+ .digest resb 8*SZ8 ; copy of a..h saved at the top of each block and added back after the last round
+ .ytmp resb 4*SZ8 ; four spill slots, addressed through YTMP0..YTMP3
+ .align resb 24 ; padding that gives the frame its "mod 32 == 24" size
+endstruc
+%define FRAMESZ stack_frame_size
+%define _DIGEST stack_frame.digest
+%define _YTMP stack_frame.ytmp
+
+%define YTMP0 rsp + _YTMP + 0*SZ8
+%define YTMP1 rsp + _YTMP + 1*SZ8
+%define YTMP2 rsp + _YTMP + 2*SZ8
+%define YTMP3 rsp + _YTMP + 3*SZ8
+
+%define VMOVPS vmovups
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h ; remember which register is currently named h
+%xdefine h g ; each name takes over the register of its predecessor: h<-g, g<-f, ... b<-a
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_ ; a gets the register that was h; only preprocessor renames, no instruction is emitted
+%endm
+
+; PRORD val, amt, scratch -- rotate each 32-bit element of val right by amt, in place
+%macro PRORD 3
+ ; %1 = val (source and destination), %2 = amt (rotate count), %3 = scratch (clobbered)
+ ; ror(x, n) = (x << (32-n)) | (x >> n)
+ ; the left-shifted half is parked in scratch because val is overwritten by the right shift
+ vpslld %3, %1, (32-(%2))
+ vpsrld %1, %1, %2
+ vpor %1, %1, %3
+%endmacro
+
+; PRORD_nd dst, amt, scratch, src -- non-destructive rotate right
+; dst = each 32-bit element of src rotated right by amt; src is only read
+%macro PRORD_nd 4
+ ; %1 = dst, %2 = amt, %3 = scratch (clobbered), %4 = src
+ ; same shift/shift/or sequence as PRORD, except that both shifts read src
+ ; no register copy is made first: the 3-operand shifts write dst/scratch directly
+ ; scratch is written before dst, in the same order as PRORD
+ ; dst and scratch must be different registers
+ vpslld %3, %4, (32-(%2))
+ vpsrld %1, %4, %2
+ vpor %1, %1, %3
+%endmacro
+
+; PRORD val, amt -- short form, the file-wide TMP register is the scratch
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt -- short form with TMP as scratch; note src comes BEFORE amt here
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
+
+;; ROUND_00_15 T1, i : one SHA-256 round; a...h are passed implicitly in preprocessor symbols ('>>' in the notes below means rotate right)
+%macro ROUND_00_15 2
+%define %%T1 %1 ; in: message word W for this round; reused as scratch afterwards
+%define %%i %2 ; round number; only (i & 0xf) is used, to pick the stack slot that receives W
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1 ; save W into slot (i & 0xf) of the stack area for later schedule rounds
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND += SZ8: byte offset of the next round's constants in the table
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0 ; h = h + ch + W + K + sigma1
+
+ vpaddd d, d, h ; d = d + (h + ch + W + K + sigma1); d is renamed to e by ROTATE_ARGS
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + sigma1 + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + sigma1 + maj + sigma0; h is renamed to a by ROTATE_ARGS
+
+ ROTATE_ARGS
+%endm
+
+
+;; ROUND_16_XX T1, i : compute the next message-schedule word W[i], then run ROUND_00_15 on it (a...h passed implicitly)
+%macro ROUND_16_XX 2
+%define %%T1 %1 ; scratch register that ends up holding W[i]
+%define %%i %2 ; round number; the 16 stack slots are addressed modulo 16
+ vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp] ; T1 = W[i-15]
+ vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp] ; a1 = W[i-2]
+ vmovdqa a0, %%T1 ; a0 = W[i-15], kept for the xor and the plain shift
+ PRORD %%T1, 18-7 ; T1 = W[i-15] ror 11
+ vmovdqa a2, a1 ; a2 = W[i-2], kept for the xor and the plain shift
+ PRORD a1, 19-17 ; a1 = W[i-2] ror 2
+ vpxor %%T1, %%T1, a0 ; T1 = W[i-15] ^ (W[i-15] ror 11)
+ PRORD %%T1, 7 ; T1 = (W[i-15] ror 7) ^ (W[i-15] ror 18)
+ vpxor a1, a1, a2 ; a1 = W[i-2] ^ (W[i-2] ror 2)
+ PRORD a1, 17 ; a1 = (W[i-2] ror 17) ^ (W[i-2] ror 19)
+ vpsrld a0, a0, 3 ; a0 = W[i-15] >> 3 (logical shift)
+ vpxor %%T1, %%T1, a0 ; T1 = (ror 7) ^ (ror 18) ^ (>> 3) of W[i-15]
+ vpsrld a2, a2, 10 ; a2 = W[i-2] >> 10 (logical shift)
+ vpxor a1, a1, a2 ; a1 = (ror 17) ^ (ror 19) ^ (>> 10) of W[i-2]
+ vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp] ; T1 += W[i-16]
+ vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp] ; a1 += W[i-7]
+ vpaddd %%T1, %%T1, a1 ; T1 = W[i]
+
+ ROUND_00_15 %%T1, %%i ; stores W[i] into slot (i & 0xf), i.e. over W[i-16], and performs the round
+
+%endm
+
+
+;; SHA256_ARGS, as accessed by this routine:
+;; digest : 8 transposed rows (a..h), SHA256_DIGEST_ROW_SIZE bytes apart, each moved as one 256-bit YMM
+;; data_ptr : 8 input pointers (one per lane) at offset _data_ptr_sha256, PTR_SZ bytes apart
+;;
+
+;; void sha256_oct_avx2(SHA256_ARGS *args, UINT64 size_in_blocks);
+;; arg 1 : STATE : pointer to SHA256_ARGS; the digest rows and the 8 data pointers are updated in place
+;; arg 2 : INP_SIZE : size of input in blocks, per lane (must be >= 1: it is only tested after a block has been hashed)
+MKGLOBAL(sha256_oct_avx2,function,internal)
+align 16
+sha256_oct_avx2:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+ sub rsp, FRAMESZ ; reserve the frame (schedule slots, digest copy, spill slots)
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqu a,[STATE + 0*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu b,[STATE + 1*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu c,[STATE + 2*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu d,[STATE + 3*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu e,[STATE + 4*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu f,[STATE + 5*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu g,[STATE + 6*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu h,[STATE + 7*SHA256_DIGEST_ROW_SIZE]
+
+ lea TBL,[rel K256_8] ; TBL = table of round constants, one SZ8-byte row per round
+
+ ;; load the address of each of the 8 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ]
+ mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ]
+ mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ]
+ mov inp4,[STATE + _data_ptr_sha256 + 4*PTR_SZ]
+ mov inp5,[STATE + _data_ptr_sha256 + 5*PTR_SZ]
+ mov inp6,[STATE + _data_ptr_sha256 + 6*PTR_SZ]
+ mov inp7,[STATE + _data_ptr_sha256 + 7*PTR_SZ]
+
+ xor IDX, IDX ; IDX = byte offset of the current block, common to all lanes
+lloop:
+ xor ROUND, ROUND ; ROUND = byte offset into the constant table, restarted for every block
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ8], a
+ vmovdqa [rsp + _DIGEST + 1*SZ8], b
+ vmovdqa [rsp + _DIGEST + 2*SZ8], c
+ vmovdqa [rsp + _DIGEST + 3*SZ8], d
+ vmovdqa [rsp + _DIGEST + 4*SZ8], e
+ vmovdqa [rsp + _DIGEST + 5*SZ8], f
+ vmovdqa [rsp + _DIGEST + 6*SZ8], g
+ vmovdqa [rsp + _DIGEST + 7*SZ8], h
+ DBGPRINTL_YMM "transposed digest ", a,b,c,d,e,f,g,h
+%assign i 0
+%rep 2
+ TRANSPOSE8_U32_LOAD8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX+i*32
+ ;; TMP0/TMP1 are fixed to ymm6/ymm7, the registers named g/h at this point: park g/h while they serve as scratch
+ vmovdqa [YTMP0], g
+ vmovdqa [YTMP1], h
+ TRANSPOSE8_U32 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
+ DBGPRINTL_YMM "transposed input ", TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7
+ vmovdqa TMP1, [rel PSHUFFLE_BYTE_FLIP_MASK] ; byte-swap mask for each 32-bit word
+ vmovdqa g, [YTMP0] ; TMP0 is no longer needed, g can come back
+ vpshufb TT0, TT0, TMP1
+ vpshufb TT1, TT1, TMP1
+ vpshufb TT2, TT2, TMP1
+ vpshufb TT3, TT3, TMP1
+ vpshufb TT4, TT4, TMP1
+ vpshufb TT5, TT5, TMP1
+ vpshufb TT6, TT6, TMP1
+ vpshufb TT7, TT7, TMP1
+ vmovdqa h, [YTMP1] ; mask no longer needed, h can come back
+ vmovdqa [YTMP0], TT4 ; TT4..TT7 share registers with a0, a1, a2 and TMP,
+ vmovdqa [YTMP1], TT5 ; which the round macro uses as scratch, so spill
+ vmovdqa [YTMP2], TT6 ; these four message words and reload them into
+ vmovdqa [YTMP3], TT7 ; TT0..TT3 once those have been consumed
+ ROUND_00_15 TT0,(i*8+0)
+ vmovdqa TT0, [YTMP0]
+ ROUND_00_15 TT1,(i*8+1)
+ vmovdqa TT1, [YTMP1]
+ ROUND_00_15 TT2,(i*8+2)
+ vmovdqa TT2, [YTMP2]
+ ROUND_00_15 TT3,(i*8+3)
+ vmovdqa TT3, [YTMP3]
+ ROUND_00_15 TT0,(i*8+4)
+ ROUND_00_15 TT1,(i*8+5)
+ ROUND_00_15 TT2,(i*8+6)
+ ROUND_00_15 TT3,(i*8+7)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4 ; 64 bytes consumed per lane (2 iterations x 32 bytes)
+ ;; i is 2 after the %rep above; rescale it so that the schedule rounds start at round number 16
+%assign i (i*8)
+
+ jmp Lrounds_16_xx ; jump over any padding emitted by the following align
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS ; ROUND advances SZ8 per round; ROUNDS is the offset after 64 rounds
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ8]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ8]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ8]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ8]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ8]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ8]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ8]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ8]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqu [STATE + 0*SHA256_DIGEST_ROW_SIZE],a
+ vmovdqu [STATE + 1*SHA256_DIGEST_ROW_SIZE],b
+ vmovdqu [STATE + 2*SHA256_DIGEST_ROW_SIZE],c
+ vmovdqu [STATE + 3*SHA256_DIGEST_ROW_SIZE],d
+ vmovdqu [STATE + 4*SHA256_DIGEST_ROW_SIZE],e
+ vmovdqu [STATE + 5*SHA256_DIGEST_ROW_SIZE],f
+ vmovdqu [STATE + 6*SHA256_DIGEST_ROW_SIZE],g
+ vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h
+ DBGPRINTL_YMM "sha256 digest on exit ", a,b,c,d,e,f,g,h
+
+ ; update input pointers: advance each lane by the IDX bytes consumed (slots indexed with PTR_SZ, like the loads above)
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha256 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha256 + 1*PTR_SZ], inp1
+ add inp2, IDX
+ mov [STATE + _data_ptr_sha256 + 2*PTR_SZ], inp2
+ add inp3, IDX
+ mov [STATE + _data_ptr_sha256 + 3*PTR_SZ], inp3
+ add inp4, IDX
+ mov [STATE + _data_ptr_sha256 + 4*PTR_SZ], inp4
+ add inp5, IDX
+ mov [STATE + _data_ptr_sha256 + 5*PTR_SZ], inp5
+ add inp6, IDX
+ mov [STATE + _data_ptr_sha256 + 6*PTR_SZ], inp6
+ add inp7, IDX
+ mov [STATE + _data_ptr_sha256 + 7*PTR_SZ], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+%ifdef SAFE_DATA
+ ;; Clear stack frame ((16+8+4)*32 bytes): message words, saved digest and spill slots
+ vpxor ymm0, ymm0
+%assign i 0
+%rep (16+8+4)
+ vmovdqa [rsp + i*SZ8], ymm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, FRAMESZ
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/sha512_x4_avx2.asm b/src/spdk/intel-ipsec-mb/avx2/sha512_x4_avx2.asm
new file mode 100644
index 000000000..80e8c8c57
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/sha512_x4_avx2.asm
@@ -0,0 +1,452 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; code to compute quad (4-lane) SHA512 using AVX2
+;; use YMMs to tackle the larger digest size
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12
+;; Windows preserves: rcx rsi rdi rbp r13 r14 r15
+;;
+;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12
+;; Linux preserves: rcx rdx rdi rbp r13 r14 r15
+;;
+;; clobbers ymm0-15
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+%include "include/transpose_avx2.asm"
+%include "include/dbgprint.asm"
+%include "mb_mgr_datastruct.asm"
+
+section .data
+default rel
+align 64
+K512_4:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ ;ddq 0x18191a1b1c1d1e1f1011121314151617
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL r8
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define a0 ymm8
+%define a1 ymm9
+%define a2 ymm10
+
+%define TT0 ymm14
+%define TT1 ymm13
+%define TT2 ymm12
+%define TT3 ymm11
+%define TT4 ymm10
+%define TT5 ymm9
+
+%define T1 ymm14
+%define TMP ymm15
+
+
+
+%define SZ4 4*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ4
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call; the call itself then pushes an 8-byte return address
+;; Therefore stack_frame_size mod 32 must be 32-8 = 24, so that rsp is 32-byte aligned again after the frame is reserved
+struc stack_frame
+ .data resb 16*SZ4 ; 16 slots of SZ4 bytes at rsp+0: the round macros keep the message words here, indexed by (round & 0xf)
+ .digest resb NUM_SHA512_DIGEST_WORDS*SZ4 ; copy of a..h saved at the top of each block and added back after the last round
+ .align resb 24 ; padding that gives the frame its "mod 32 == 24" size
+endstruc
+
+%define _DIGEST stack_frame.digest
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h ; remember which register is currently named h
+%xdefine h g ; each name takes over the register of its predecessor: h<-g, g<-f, ... b<-a
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_ ; a gets the register that was h; only preprocessor renames, no instruction is emitted
+%endm
+
+; PRORQ val, amt, scratch
+; packed rotate right of each 64-bit element (quadword), in place
+; built from two shifts and an OR: ror(x, n) = (x << (64-n)) | (x >> n)
+%macro PRORQ 3
+ ; %1 = val (source and destination), %2 = amt (rotate count), %3 = scratch (clobbered)
+ ; the left-shifted half is parked in scratch because val is overwritten
+ ; by the right shift on the following line
+ vpsllq %3, %1, (64-(%2))
+ vpsrlq %1, %1, %2
+ vpor %1, %1, %3
+%endmacro
+
+; PRORQ_nd dst, amt, scratch, src -- non-destructive variant
+; dst = each 64-bit element of src rotated right by amt; src is only read
+%macro PRORQ_nd 4
+ ; %1 = dst, %2 = amt, %3 = scratch (clobbered), %4 = src
+ ; both shifts read src, so no register copy is needed beforehand
+ ; scratch is written before dst, in the same order as PRORQ
+ ; dst and scratch must be different registers
+ vpsllq %3, %4, (64-(%2))
+ vpsrlq %1, %4, %2
+ vpor %1, %1, %3
+%endmacro
+
+; PRORQ val, amt -- short form, the file-wide TMP register is the scratch
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt -- short form with TMP as scratch; note src comes BEFORE amt here
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
+
+
+
+;; ROUND_00_15 T1, i : one SHA-512 round; a...h are passed implicitly in preprocessor symbols ('>>' in the notes below means rotate right)
+%macro ROUND_00_15 2
+%define %%T1 %1 ; in: message word W for this round; reused as scratch afterwards
+%define %%i %2 ; round number; only (i & 0xf) is used, to pick the stack slot that receives W
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ4*(%%i&0xf) + rsp],%%T1 ; save W into slot (i & 0xf) of the stack area for later schedule rounds
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 4)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND += SZ4: byte offset of the next round's constants in the table
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0 ; h = h + ch + W + K + sigma1
+
+ vpaddq d, d, h ; d = d + (h + ch + W + K + sigma1); d is renamed to e by ROTATE_ARGS
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 6)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + sigma1 + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + sigma1 + maj + sigma0; h is renamed to a by ROTATE_ARGS
+ ROTATE_ARGS
+%endm
+
+
+;; ROUND_16_XX T1, i : compute the next message-schedule word W[i], then run ROUND_00_15 on it (a...h passed implicitly)
+%macro ROUND_16_XX 2
+%define %%T1 %1 ; scratch register that ends up holding W[i]
+%define %%i %2 ; round number; the 16 stack slots are addressed modulo 16
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] ; T1 = W[i-15]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp] ; a1 = W[i-2]
+ vmovdqa a0, %%T1 ; a0 = W[i-15], kept for the xor and the plain shift
+ PRORQ %%T1, 8-1 ; T1 = W[i-15] ror 7
+ vmovdqa a2, a1 ; a2 = W[i-2], kept for the xor and the plain shift
+ PRORQ a1, 61-19 ; a1 = W[i-2] ror 42
+ vpxor %%T1, %%T1, a0 ; T1 = W[i-15] ^ (W[i-15] ror 7)
+ PRORQ %%T1, 1 ; T1 = (W[i-15] ror 1) ^ (W[i-15] ror 8)
+ vpxor a1, a1, a2 ; a1 = W[i-2] ^ (W[i-2] ror 42)
+ PRORQ a1, 19 ; a1 = (W[i-2] ror 19) ^ (W[i-2] ror 61)
+ vpsrlq a0, a0, 7 ; a0 = W[i-15] >> 7 (logical shift)
+ vpxor %%T1, %%T1, a0 ; T1 = (ror 1) ^ (ror 8) ^ (>> 7) of W[i-15]
+ vpsrlq a2, a2, 6 ; a2 = W[i-2] >> 6 (logical shift)
+ vpxor a1, a1, a2 ; a1 = (ror 19) ^ (ror 61) ^ (>> 6) of W[i-2]
+ vpaddq %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp] ; T1 += W[i-16]
+ vpaddq a1, a1, [SZ4*((%%i-7)&0xf) + rsp] ; a1 += W[i-7]
+ vpaddq %%T1, %%T1, a1 ; T1 = W[i]
+
+ ROUND_00_15 %%T1, %%i ; stores W[i] into slot (i & 0xf), i.e. over W[i-16], and performs the round
+
+%endm
+
+
+;; void sha512_x4_avx2(void *STATE, const int INP_SIZE)
+;; arg 1 : STATE : pointer to the SHA-512 args: 8 transposed digest rows plus 4 lane data pointers (both updated in place)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+MKGLOBAL(sha512_x4_avx2,function,internal)
+align 32
+sha512_x4_avx2:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, stack_frame_size ; reserve the frame (schedule slots and digest copy)
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqu a, [STATE+ 0*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu b, [STATE+ 1*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu c, [STATE+ 2*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu d, [STATE+ 3*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu e, [STATE+ 4*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu f, [STATE+ 5*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu g, [STATE+ 6*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu h, [STATE+ 7*SHA512_DIGEST_ROW_SIZE]
+
+ DBGPRINTL_YMM "sha512-avx2 Incoming digest", a, b, c, d, e, f, g, h
+ lea TBL,[K512_4] ; TBL = table of round constants; RIP-relative through the file-wide "default rel"
+
+ ;; load the address of each of the MAX_LANES (4) message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 + 0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 + 1*PTR_SZ]
+ mov inp2,[STATE + _data_ptr_sha512 + 2*PTR_SZ]
+ mov inp3,[STATE + _data_ptr_sha512 + 3*PTR_SZ]
+
+ xor IDX, IDX ; IDX = byte offset of the current block, common to all lanes
+lloop:
+ xor ROUND, ROUND ; ROUND = byte offset into the constant table, restarted for every block
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ ;; load up the shuffler for little-endian to big-endian format (reloaded each pass: the rounds use TMP as scratch)
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ TRANSPOSE4_U64_LOAD4 TT4, TT1, TT5, TT3, inp0, inp1, inp2, inp3, IDX+i*32
+
+ TRANSPOSE4_U64 TT4, TT1, TT5, TT3, TT0, TT2
+ DBGPRINTL_YMM "sha512-avx2 Incoming data", TT0, TT1, TT2, TT3
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+;; Increment IDX by message block size == 4 (loop iterations) * 32 (bytes read per lane per iteration) = 128 bytes
+ add IDX, 4 * 32
+ ;; i is 4 after the %rep above; rescale it so that the schedule rounds start at round number 16
+%assign i (i*4)
+
+ jmp Lrounds_16_xx ; jump over any padding emitted by the following align
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS ; ROUND advances SZ4 per round; ROUNDS is the offset after 80 rounds
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ4]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqu [STATE+ 0*SHA512_DIGEST_ROW_SIZE ],a
+ vmovdqu [STATE+ 1*SHA512_DIGEST_ROW_SIZE ],b
+ vmovdqu [STATE+ 2*SHA512_DIGEST_ROW_SIZE ],c
+ vmovdqu [STATE+ 3*SHA512_DIGEST_ROW_SIZE ],d
+ vmovdqu [STATE+ 4*SHA512_DIGEST_ROW_SIZE ],e
+ vmovdqu [STATE+ 5*SHA512_DIGEST_ROW_SIZE ],f
+ vmovdqu [STATE+ 6*SHA512_DIGEST_ROW_SIZE ],g
+ vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h
+ DBGPRINTL_YMM "sha512-avx2 Outgoing digest", a, b, c, d, e, f, g, h
+
+ ;; update input data pointers: advance each lane by the IDX bytes consumed
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+ add inp2, IDX
+ mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2
+ add inp3, IDX
+ mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ ;; Clear stack frame ((16 + 8)*32 bytes): message words and saved digest (only when built with SAFE_DATA)
+%ifdef SAFE_DATA
+ vpxor ymm0, ymm0
+%assign i 0
+%rep (16+NUM_SHA512_DIGEST_WORDS)
+ vmovdqa [rsp + i*SZ4], ymm0
+%assign i (i+1)
+%endrep
+%endif
+
+ add rsp, stack_frame_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx2/snow3g_avx2.c b/src/spdk/intel-ipsec-mb/avx2/snow3g_avx2.c
new file mode 100644
index 000000000..7945d026a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx2/snow3g_avx2.c
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ Copyright (c) 2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+
+/*
+ * Build-time configuration for the shared SNOW3G code pulled in from
+ * include/snow3g_common.h at the bottom of this file.
+ *
+ * Exactly one of AVX / AVX2 is defined, together with the matching
+ * CLEAR_SCRATCH_SIMD_REGS routine name. The AVX flavour is selected on
+ * Windows builds (_WIN32) and whenever SAFE_LOOKUP is defined; every other
+ * build gets the AVX2 flavour.
+ */
+#if defined (_WIN32) || defined (SAFE_LOOKUP)
+/* use AVX implementation on Windows for now or when SAFE_LOOKUP flag is set */
+#define AVX
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
+#else
+#define AVX2
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_ymms
+#endif
+/*
+ * Map the generic SNOW3G_* names onto "_avx2"-suffixed symbols. The suffix
+ * is "_avx2" regardless of which branch was taken above, i.e. also when the
+ * AVX flavour is selected.
+ * NOTE(review): presumably snow3g_common.h uses these macros as the names of
+ * the functions it defines - confirm against that header.
+ */
+#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_avx2
+#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_avx2
+#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_avx2
+#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_avx2
+#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_avx2
+#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_avx2
+#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_avx2
+#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_avx2
+#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_avx2
+#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_avx2
+#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_avx2
+
+/* must come last: the header is configured by the macros defined above */
+#include "include/snow3g_common.h"
diff --git a/src/spdk/intel-ipsec-mb/avx512/aes_cbc_dec_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/aes_cbc_dec_vaes_avx512.asm
new file mode 100644
index 000000000..ce33caa92
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/aes_cbc_dec_vaes_avx512.asm
@@ -0,0 +1,477 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "include/aes_common.asm"
+
+;; ZMM working registers for the CBC decrypt macros below.
+;; zIV carries the IV / previous cipher block in its top 128-bit lane
+;; (lane index 3); zBLK_* hold 16 cipher blocks (4 per register) and
+;; zTMP* hold the "previous cipher block" vectors XOR'ed in after AESDEC.
+%define zIV zmm0
+%define zBLK_0_3 zmm1
+%define zBLK_4_7 zmm2
+%define zBLK_8_11 zmm3
+%define zBLK_12_15 zmm4
+%define zTMP0 zmm5
+%define zTMP1 zmm6
+%define zTMP2 zmm7
+%define zTMP3 zmm8
+
+;; Round keys, one per register, filled by LOAD_KEYS.
+;; AES-128/192/256 use ZKEY0..ZKEY10 / ZKEY12 / ZKEY14 respectively.
+%define ZKEY0 zmm17
+%define ZKEY1 zmm18
+%define ZKEY2 zmm19
+%define ZKEY3 zmm20
+%define ZKEY4 zmm21
+%define ZKEY5 zmm22
+%define ZKEY6 zmm23
+%define ZKEY7 zmm24
+%define ZKEY8 zmm25
+%define ZKEY9 zmm26
+%define ZKEY10 zmm27
+%define ZKEY11 zmm28
+%define ZKEY12 zmm29
+%define ZKEY13 zmm30
+%define ZKEY14 zmm31
+
+;; Function arguments: (in, IV, keys, out, num_bytes).
+;; With LINUX defined all five arrive in registers. Otherwise only the first
+;; four do; num_bytes is aliased to rax and each entry point loads it from
+;; the stack before using it.
+%ifdef LINUX
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax
+%endif
+
+;; GP scratch register handed to the macros as their temporary
+%define tmp r10
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; macro to preload keys
+;;; - uses ZKEY[0-14] registers (ZMM)
+%macro LOAD_KEYS 2
+%define %%KEYS %1 ; [in] key pointer
+%define %%NROUNDS %2 ; [in] numerical value, number of AES rounds
+ ; excluding 1st and last rounds.
+ ; Example: AES-128 -> value 9
+
+;; Emits (%%NROUNDS + 2) loads: round key i is read from [%%KEYS + 16*i]
+;; and replicated into all four 128-bit lanes of ZKEY<i>.
+;; Side effect: the assembler variable `i` is left at (%%NROUNDS + 2).
+%assign i 0
+%rep (%%NROUNDS + 2)
+ vbroadcastf64x2 ZKEY %+ i, [%%KEYS + 16*i]
+%assign i (i + 1)
+%endrep
+
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro "cools down" the pipeline after the DECRYPT_16_PARALLEL code:
+;;; it handles the tail of the message, whose number of blocks is variable.
+;;; Processes the last %%num_final_blocks blocks (1 to 15, can't be 0)
+
+%macro FINAL_BLOCKS 14
+%define %%PLAIN_OUT %1 ; [in] output buffer
+%define %%CIPH_IN %2 ; [in] input buffer
+%define %%LAST_CIPH_BLK %3 ; [in/out] ZMM with IV/last cipher blk (in idx 3)
+%define %%num_final_blocks %4 ; [in] numerical value (1 - 15)
+%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks
+%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks
+%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks
+%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks
+%define %%ZT1 %9 ; [clobbered] ZMM temporary
+%define %%ZT2 %10 ; [clobbered] ZMM temporary
+%define %%ZT3 %11 ; [clobbered] ZMM temporary
+%define %%ZT4 %12 ; [clobbered] ZMM temporary
+%define %%IA0 %13 ; [clobbered] GP temporary
+%define %%NROUNDS %14 ; [in] number of rounds; numerical value
+
+ ;; NOTE: %%IA0 is accepted but not referenced anywhere in this macro body.
+ ;; NOTE(review): ZMM_LOAD_BLOCKS_0_16, ZMM_AESDEC_ROUND_BLOCKS_0_16 and
+ ;; ZMM_STORE_BLOCKS_0_16 are not defined in this file; presumably they come
+ ;; from include/aes_common.asm - confirm.
+
+ ;; load plain/cipher text
+ ZMM_LOAD_BLOCKS_0_16 %%num_final_blocks, %%CIPH_IN, 0, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+
+ ;; Prepare final cipher text blocks to
+ ;; be XOR'd later after AESDEC
+ ;; valignq with a shift of 6 qwords yields
+ ;; { lane 3 of the 2nd source, lanes 0-2 of the 1st source },
+ ;; i.e. for every block the cipher block that precedes it.
+ valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6
+%if %%num_final_blocks > 4
+ valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6
+%endif
+%if %%num_final_blocks > 8
+ valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6
+%endif
+%if %%num_final_blocks > 12
+ valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6
+%endif
+
+ ;; Update IV with last cipher block
+ ;; to be used later in DECRYPT_16_PARALLEL
+ ;; This must happen after %%ZT1 was derived from the old value and before
+ ;; the AES rounds overwrite the cipher text registers.
+ ;; Rotating a register by 2/4/6 qwords moves its block 0/1/2 into lane 3;
+ ;; when the count is a multiple of 4 the last block already sits in lane 3,
+ ;; so a plain copy is enough.
+%if %%num_final_blocks == 1
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 2
+%elif %%num_final_blocks == 2
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 4
+%elif %%num_final_blocks == 3
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 6
+%elif %%num_final_blocks == 4
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3
+%elif %%num_final_blocks == 5
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 2
+%elif %%num_final_blocks == 6
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 4
+%elif %%num_final_blocks == 7
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 6
+%elif %%num_final_blocks == 8
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7
+%elif %%num_final_blocks == 9
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 2
+%elif %%num_final_blocks == 10
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 4
+%elif %%num_final_blocks == 11
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 6
+%elif %%num_final_blocks == 12
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11
+%elif %%num_final_blocks == 13
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 2
+%elif %%num_final_blocks == 14
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 4
+%elif %%num_final_blocks == 15
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 6
+%endif
+
+ ;; AES rounds
+ ;; one invocation per round key ZKEY0 .. ZKEY(%%NROUNDS + 1);
+ ;; the assembler variable `j` is left at (%%NROUNDS + 2) afterwards
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \
+ ZKEY %+ j, j, no_data, no_data, no_data, no_data, \
+ %%num_final_blocks, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; XOR with decrypted blocks to get plain text
+ vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1
+%if %%num_final_blocks > 4
+ vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2
+%endif
+%if %%num_final_blocks > 8
+ vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3
+%endif
+%if %%num_final_blocks > 12
+ vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4
+%endif
+
+ ;; write plain text back to output
+ ZMM_STORE_BLOCKS_0_16 %%num_final_blocks, %%PLAIN_OUT, 0, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+
+%endmacro ; FINAL_BLOCKS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main AES-CBC decrypt macro
+;;; - operates on single stream
+;;; - decrypts 16 blocks at a time
+%macro DECRYPT_16_PARALLEL 14
+%define %%PLAIN_OUT %1 ; [in] output buffer
+%define %%CIPH_IN %2 ; [in] input buffer
+%define %%LENGTH %3 ; [in/out] number of bytes to process
+%define %%LAST_CIPH_BLK %4 ; [in/out] ZMM with IV (first block) or last cipher block (idx 3)
+%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks
+%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks
+%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks
+%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks
+%define %%ZT1 %9 ; [clobbered] ZMM temporary
+%define %%ZT2 %10 ; [clobbered] ZMM temporary
+%define %%ZT3 %11 ; [clobbered] ZMM temporary
+%define %%ZT4 %12 ; [clobbered] ZMM temporary
+%define %%NROUNDS %13 ; [in] number of rounds; numerical value
+%define %%IA0 %14 ; [clobbered] GP temporary
+
+ ;; NOTE: %%IA0 is accepted but not referenced anywhere in this macro body.
+ ;; %%PLAIN_OUT and %%CIPH_IN are advanced by 256 bytes and %%LENGTH is
+ ;; reduced by 256 at the end, so the macro can be invoked in a loop.
+
+ ;; load 16 cipher blocks (256 bytes); all loads are issued before
+ ;; anything is written to the output buffer
+ vmovdqu8 %%CIPHER_PLAIN_0_3, [%%CIPH_IN]
+ vmovdqu8 %%CIPHER_PLAIN_4_7, [%%CIPH_IN + 64]
+ vmovdqu8 %%CIPHER_PLAIN_8_11, [%%CIPH_IN + 128]
+ vmovdqu8 %%CIPHER_PLAIN_12_15, [%%CIPH_IN + 192]
+
+ ;; prepare first set of cipher blocks for later XOR'ing
+ ;; valignq with a shift of 6 qwords yields
+ ;; { lane 3 of the 2nd source, lanes 0-2 of the 1st source },
+ ;; i.e. for every block the cipher block that precedes it
+ valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6
+ valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6
+ valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6
+ valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6
+
+ ;; store last cipher text block to be used for next 16 blocks
+ ;; (block 15 is in lane 3; must be captured before the AES rounds
+ ;; overwrite %%CIPHER_PLAIN_12_15)
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15
+
+ ;; AES rounds
+ ;; one invocation per round key ZKEY0 .. ZKEY(%%NROUNDS + 1);
+ ;; the assembler variable `j` is left at (%%NROUNDS + 2) afterwards
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \
+ ZKEY %+ j, j, no_data, no_data, no_data, no_data, \
+ 16, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; XOR with decrypted blocks to get plain text
+ vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1
+ vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2
+ vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3
+ vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4
+
+ ;; write plain text back to output
+ vmovdqu8 [%%PLAIN_OUT], %%CIPHER_PLAIN_0_3
+ vmovdqu8 [%%PLAIN_OUT + 64], %%CIPHER_PLAIN_4_7
+ vmovdqu8 [%%PLAIN_OUT + 128], %%CIPHER_PLAIN_8_11
+ vmovdqu8 [%%PLAIN_OUT + 192], %%CIPHER_PLAIN_12_15
+
+ ;; adjust input pointer and length
+ ;; (16 blocks * 16 bytes consumed from input and produced to output)
+ sub %%LENGTH, (16 * 16)
+ add %%CIPH_IN, (16 * 16)
+ add %%PLAIN_OUT, (16 * 16)
+
+%endmacro ; DECRYPT_16_PARALLEL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; AES_CBC_DEC macro decrypts given data.
+;;; Flow:
+;;; - Decrypt all blocks (multiple of 16) up to final 1-15 blocks
+;;; - Decrypt final blocks (1-15 blocks)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro AES_CBC_DEC 7
+%define %%CIPH_IN %1 ;; [in] pointer to input buffer
+%define %%PLAIN_OUT %2 ;; [in] pointer to output buffer
+%define %%KEYS %3 ;; [in] pointer to expanded keys
+%define %%IV %4 ;; [in] pointer to IV
+%define %%LENGTH %5 ;; [in/out] GP register with length in bytes
+%define %%NROUNDS %6 ;; [in] Number of AES rounds; numerical value
+%define %%TMP %7 ;; [clobbered] GP register
+
+ ;; Clobbers: %%CIPH_IN, %%PLAIN_OUT and %%LENGTH (advanced / consumed),
+ ;; zIV, zBLK_*, zTMP0-3 and the ZKEY registers loaded by LOAD_KEYS.
+ ;; Only whole 16-byte blocks are processed: the low 4 bits of %%LENGTH
+ ;; are shifted out below, so a trailing partial block is ignored.
+
+ ;; nothing to do for an empty buffer
+ cmp %%LENGTH, 0
+ je %%cbc_dec_done
+
+ ;; place the 16-byte IV in lane 3 of zIV; the other lanes keep their
+ ;; previous content, which is never consumed (only lane 3 is used)
+ vinserti64x2 zIV, zIV, [%%IV], 3
+
+ ;; preload keys
+ LOAD_KEYS %%KEYS, %%NROUNDS
+
+ ;; main loop: 16 blocks (256 bytes) per iteration while enough data remains
+%%decrypt_16_parallel:
+ cmp %%LENGTH, 256
+ jb %%final_blocks
+
+ DECRYPT_16_PARALLEL %%PLAIN_OUT, %%CIPH_IN, %%LENGTH, zIV, \
+ zBLK_0_3, zBLK_4_7, zBLK_8_11, zBLK_12_15, \
+ zTMP0, zTMP1, zTMP2, zTMP3, %%NROUNDS, %%TMP
+ jmp %%decrypt_16_parallel
+
+%%final_blocks:
+ ;; get num final blocks
+ ;; %%LENGTH becomes a block count in the range 0-15; zero means done
+ shr %%LENGTH, 4
+ and %%LENGTH, 0xf
+ je %%cbc_dec_done
+
+ ;; Dispatch on the block count with a small compare tree.
+ ;; The count is 1-15 here, so the signed `jl` branches below behave the
+ ;; same as unsigned ones would.
+ cmp %%LENGTH, 8
+ je %%final_num_blocks_is_8
+ jl %%final_blocks_is_1_7
+
+ ; Final blocks 9-15
+ cmp %%LENGTH, 12
+ je %%final_num_blocks_is_12
+ jl %%final_blocks_is_9_11
+
+ ; Final blocks 13-15
+ ; (one of the three compares always matches; the fall-through into the
+ ; next group is never taken)
+ cmp %%LENGTH, 15
+ je %%final_num_blocks_is_15
+ cmp %%LENGTH, 14
+ je %%final_num_blocks_is_14
+ cmp %%LENGTH, 13
+ je %%final_num_blocks_is_13
+
+%%final_blocks_is_9_11:
+ cmp %%LENGTH, 11
+ je %%final_num_blocks_is_11
+ cmp %%LENGTH, 10
+ je %%final_num_blocks_is_10
+ cmp %%LENGTH, 9
+ je %%final_num_blocks_is_9
+
+%%final_blocks_is_1_7:
+ cmp %%LENGTH, 4
+ je %%final_num_blocks_is_4
+ jl %%final_blocks_is_1_3
+
+ ; Final blocks 5-7
+ cmp %%LENGTH, 7
+ je %%final_num_blocks_is_7
+ cmp %%LENGTH, 6
+ je %%final_num_blocks_is_6
+ cmp %%LENGTH, 5
+ je %%final_num_blocks_is_5
+
+%%final_blocks_is_1_3:
+ cmp %%LENGTH, 3
+ je %%final_num_blocks_is_3
+ cmp %%LENGTH, 2
+ je %%final_num_blocks_is_2
+ jmp %%final_num_blocks_is_1
+
+
+ ;; One FINAL_BLOCKS expansion per possible tail size; the block count has
+ ;; to be an assemble-time constant, hence the 15 separate copies.
+%%final_num_blocks_is_15:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 15, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_14:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 14, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_13:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 13, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_12:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 12, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_11:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 11, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_10:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 10, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_9:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 9, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_8:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 8, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_7:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 7, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_6:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 6, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_5:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 5, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_4:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 4, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_3:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 3, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_2:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 2, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+ ;; last case falls through to the exit label, no jump needed
+%%final_num_blocks_is_1:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 1, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+
+%%cbc_dec_done:
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_128_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; AES-128 CBC decrypt (9 middle rounds plus first and last).
+;; In:    p_in / p_IV / p_keys / p_out / num_bytes as mapped at the top of
+;;        this file (register choice depends on LINUX being defined)
+;; Clobb: p_in, p_out, num_bytes, zmm0-zmm8, zmm17-zmm27, flags
+MKGLOBAL(aes_cbc_dec_128_vaes_avx512,function,internal)
+aes_cbc_dec_128_vaes_avx512:
+%ifndef LINUX
+ ;; 5th argument lives on the stack: return address (8 bytes) plus four
+ ;; 8-byte slots precede it
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 9, tmp
+
+ ;; zmm0-zmm8 were written above; clear the upper vector state so that
+ ;; SSE code in the caller does not pay AVX/SSE transition penalties
+ vzeroupper
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_192_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; AES-192 CBC decrypt (11 middle rounds plus first and last).
+;; In:    p_in / p_IV / p_keys / p_out / num_bytes as mapped at the top of
+;;        this file (register choice depends on LINUX being defined)
+;; Clobb: p_in, p_out, num_bytes, zmm0-zmm8, zmm17-zmm29, flags
+MKGLOBAL(aes_cbc_dec_192_vaes_avx512,function,internal)
+aes_cbc_dec_192_vaes_avx512:
+%ifndef LINUX
+ ;; 5th argument lives on the stack: return address (8 bytes) plus four
+ ;; 8-byte slots precede it
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 11, tmp
+
+ ;; zmm0-zmm8 were written above; clear the upper vector state so that
+ ;; SSE code in the caller does not pay AVX/SSE transition penalties
+ vzeroupper
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_256_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; AES-256 CBC decrypt (13 middle rounds plus first and last).
+;; In:    p_in / p_IV / p_keys / p_out / num_bytes as mapped at the top of
+;;        this file (register choice depends on LINUX being defined)
+;; Clobb: p_in, p_out, num_bytes, zmm0-zmm8, zmm17-zmm31, flags
+MKGLOBAL(aes_cbc_dec_256_vaes_avx512,function,internal)
+aes_cbc_dec_256_vaes_avx512:
+%ifndef LINUX
+ ;; 5th argument lives on the stack: return address (8 bytes) plus four
+ ;; 8-byte slots precede it
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 13, tmp
+
+ ;; zmm0-zmm8 were written above; clear the upper vector state so that
+ ;; SSE code in the caller does not pay AVX/SSE transition penalties
+ vzeroupper
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
diff --git a/src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm
new file mode 100644
index 000000000..c4b1dd561
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/aes_cbc_enc_vaes_avx512.asm
@@ -0,0 +1,727 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routines to do 128/192/256 bit CBC AES encrypt
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+;; Stack frame: three 8-byte slots used by FUNC_SAVE / FUNC_RESTORE
+;; (slot 0 = rbp, slot 1 = rsi, slot 2 = rdi; the last two only when LINUX
+;; is not defined).
+struc STACK
+_gpr_save: resq 3
+endstruc
+
+%define GPR_SAVE_AREA rsp + _gpr_save
+
+;; arg1/arg2 are the first two ABI argument registers of the respective
+;; platform. NOTE(review): arg3/arg4 do not follow the ABI argument order
+;; (rcx/rdx with LINUX, rdi/rsi otherwise); in the visible code they are only
+;; aliased as scratch registers IA2/IA3 - confirm nothing treats them as
+;; incoming arguments.
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
+
+;; ARG is the base register for the _aesarg_* fields addressed by the macros
+;; below; LEN is the second argument.
+%define ARG arg1
+%define LEN arg2
+
+;; General purpose scratch registers.
+;; NOTE(review): rbx and r12-r15 are aliased here but FUNC_SAVE stores only
+;; rbp (plus rsi/rdi when LINUX is not defined); presumably the remaining
+;; callee-saved registers are preserved by the outer calling routine -
+;; confirm.
+%define IA0 rax
+%define IA1 rbx
+%define IA2 arg3
+%define IA3 arg4
+%define IA4 rbp
+%define IA5 r8
+%define IA6 r9
+%define IA7 r10
+%define IA8 r11
+%define IA9 r13
+%define IA10 r14
+%define IA11 r15
+%define IA12 r12
+
+;; Per-lane IVs / last cipher blocks, four lanes per ZMM register
+%define ZIV00_03 zmm8
+%define ZIV04_07 zmm9
+%define ZIV08_11 zmm10
+%define ZIV12_15 zmm11
+
+;; ZMM temporaries
+%define ZT0 zmm16
+%define ZT1 zmm17
+%define ZT2 zmm18
+%define ZT3 zmm19
+%define ZT4 zmm20
+%define ZT5 zmm21
+%define ZT6 zmm22
+%define ZT7 zmm23
+%define ZT8 zmm24
+%define ZT9 zmm25
+%define ZT10 zmm26
+%define ZT11 zmm27
+%define ZT12 zmm28
+%define ZT13 zmm29
+%define ZT14 zmm30
+%define ZT15 zmm31
+
+%define ZT16 zmm12
+%define ZT17 zmm13
+%define ZT18 zmm14
+%define ZT19 zmm15
+
+;; Qword permutation index vectors consumed by TRANSPOSE_4x4 (vpermi2q).
+;; They are not loaded in this part of the file.
+%define TAB_A0B0A1B1 zmm6
+%define TAB_A2B2A3B3 zmm7
+
+;; Function prologue: reserve the STACK frame and stash the GP registers
+;; that this file reuses as scratch - rbp always, rsi and rdi additionally
+;; when LINUX is not defined.
+%macro FUNC_SAVE 0
+ sub rsp, STACK_size
+%ifndef LINUX
+ mov [GPR_SAVE_AREA + 8*2], rdi
+ mov [GPR_SAVE_AREA + 8*1], rsi
+%endif
+ mov [GPR_SAVE_AREA + 8*0], rbp
+%endmacro
+
+;; Function epilogue: mirror image of FUNC_SAVE. Clears the upper vector
+;; state, reloads the stashed GP registers and releases the STACK frame.
+;; XMM registers are not handled here; they are saved at a higher level.
+%macro FUNC_RESTORE 0
+ vzeroupper
+%ifndef LINUX
+ mov rdi, [GPR_SAVE_AREA + 8*2]
+ mov rsi, [GPR_SAVE_AREA + 8*1]
+%endif
+ mov rbp, [GPR_SAVE_AREA + 8*0]
+ add rsp, STACK_size
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Transpose macro - executes 4x4 transpose of 4 ZMM registers
+; in: L0B0-3 out: B0L0-3
+; L1B0-3 B1L0-3
+; L2B0-3 B2L0-3
+; L3B0-3 B3L0-3
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro TRANSPOSE_4x4 8
+%define %%ROW_0 %1 ; [in/out] ZMM row 0
+%define %%ROW_1 %2 ; [in/out] ZMM row 1
+%define %%ROW_2 %3 ; [in/out] ZMM row 2
+%define %%ROW_3 %4 ; [in/out] ZMM row 3
+%define %%LO_01 %5 ; [clobbered] ZMM temporary
+%define %%LO_23 %6 ; [clobbered] ZMM temporary
+%define %%HI_01 %7 ; [clobbered] ZMM temporary
+%define %%HI_23 %8 ; [clobbered] ZMM temporary
+
+ ;; vpermi2q overwrites its index operand, so each permutation works on
+ ;; its own copy of the index table.
+
+ ;; permutations driven by TAB_A0B0A1B1, one per pair of rows
+ vmovdqa64 %%LO_01, TAB_A0B0A1B1
+ vmovdqa64 %%LO_23, %%LO_01
+ vpermi2q %%LO_01, %%ROW_0, %%ROW_1
+ vpermi2q %%LO_23, %%ROW_2, %%ROW_3
+
+ ;; permutations driven by TAB_A2B2A3B3, one per pair of rows
+ vmovdqa64 %%HI_01, TAB_A2B2A3B3
+ vmovdqa64 %%HI_23, %%HI_01
+ vpermi2q %%HI_01, %%ROW_0, %%ROW_1
+ vpermi2q %%HI_23, %%ROW_2, %%ROW_3
+
+ ;; stitch 128-bit lanes together:
+ ;; 0x44 takes lanes 0,1 of each source, 0xee takes lanes 2,3
+ vshufi64x2 %%ROW_0, %%LO_01, %%LO_23, 0x44
+ vshufi64x2 %%ROW_1, %%LO_01, %%LO_23, 0xee
+ vshufi64x2 %%ROW_2, %%HI_01, %%HI_23, 0x44
+ vshufi64x2 %%ROW_3, %%HI_01, %%HI_23, 0xee
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LOAD_STORE_x4 - loads/stores 1-4 blocks (16 bytes each) for 4 lanes to/from ZMM registers
+; - Loads 4 blocks by default
+; - Pass %%MASK_REG argument to load/store 1-3 blocks (optional)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro LOAD_STORE_x4 15-16
+%define %%LANE_A %1 ; [in] lane index (numerical), paired with %%ZMM_A
+%define %%LANE_B %2 ; [in] lane index (numerical), paired with %%ZMM_B
+%define %%LANE_C %3 ; [in] lane index (numerical), paired with %%ZMM_C
+%define %%LANE_D %4 ; [in] lane index (numerical), paired with %%ZMM_D
+%define %%PTR_TAB %5 ; [in] base address of the per-lane buffer pointer table
+%define %%BUF_OFF %6 ; [in] GP reg with byte offset into each lane buffer
+%define %%ZMM_A %7 ; [in/out] ZMM reg holding data of lane A
+%define %%ZMM_B %8 ; [in/out] ZMM reg holding data of lane B
+%define %%ZMM_C %9 ; [in/out] ZMM reg holding data of lane C
+%define %%ZMM_D %10 ; [in/out] ZMM reg holding data of lane D
+%define %%PTR_A %11 ; [clobbered] GP reg, buffer pointer of lane A
+%define %%PTR_B %12 ; [clobbered] GP reg, buffer pointer of lane B
+%define %%PTR_C %13 ; [clobbered] GP reg, buffer pointer of lane C
+%define %%PTR_D %14 ; [clobbered] GP reg, buffer pointer of lane D
+%define %%DIRECTION %15 ; [in] LOAD selects loads, any other value selects stores
+%define %%KMASK %16 ; [in] optional mask reg for partial load/store
+%define %%ARG_COUNT %0
+
+ ;; fetch the four lane buffer pointers (8 bytes per table entry)
+ mov %%PTR_A, [%%PTR_TAB + 8*(%%LANE_A)]
+ mov %%PTR_B, [%%PTR_TAB + 8*(%%LANE_B)]
+ mov %%PTR_C, [%%PTR_TAB + 8*(%%LANE_C)]
+ mov %%PTR_D, [%%PTR_TAB + 8*(%%LANE_D)]
+
+%ifidn %%DIRECTION, LOAD
+%if %%ARG_COUNT > 15
+ ;; mask supplied - load only the selected bytes, zero the rest
+ vmovdqu8 %%ZMM_A{%%KMASK}{z}, [%%PTR_A + %%BUF_OFF]
+ vmovdqu8 %%ZMM_B{%%KMASK}{z}, [%%PTR_B + %%BUF_OFF]
+ vmovdqu8 %%ZMM_C{%%KMASK}{z}, [%%PTR_C + %%BUF_OFF]
+ vmovdqu8 %%ZMM_D{%%KMASK}{z}, [%%PTR_D + %%BUF_OFF]
+%else
+ ;; no mask - load 4 full blocks (64 bytes) per lane
+ vmovdqu8 %%ZMM_A, [%%PTR_A + %%BUF_OFF]
+ vmovdqu8 %%ZMM_B, [%%PTR_B + %%BUF_OFF]
+ vmovdqu8 %%ZMM_C, [%%PTR_C + %%BUF_OFF]
+ vmovdqu8 %%ZMM_D, [%%PTR_D + %%BUF_OFF]
+%endif
+%else ;; STORE
+%if %%ARG_COUNT > 15
+ ;; mask supplied - store only the selected bytes
+ vmovdqu8 [%%PTR_A + %%BUF_OFF]{%%KMASK}, %%ZMM_A
+ vmovdqu8 [%%PTR_B + %%BUF_OFF]{%%KMASK}, %%ZMM_B
+ vmovdqu8 [%%PTR_C + %%BUF_OFF]{%%KMASK}, %%ZMM_C
+ vmovdqu8 [%%PTR_D + %%BUF_OFF]{%%KMASK}, %%ZMM_D
+%else
+ ;; no mask - store 4 full blocks (64 bytes) per lane
+ vmovdqu8 [%%PTR_A + %%BUF_OFF], %%ZMM_A
+ vmovdqu8 [%%PTR_B + %%BUF_OFF], %%ZMM_B
+ vmovdqu8 [%%PTR_C + %%BUF_OFF], %%ZMM_C
+ vmovdqu8 [%%PTR_D + %%BUF_OFF], %%ZMM_D
+%endif
+%endif ;; %%DIRECTION
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; AESENC_ROUNDS_x16 macro
+; - 16 lanes, 1 block per lane
+; - it handles special cases: the last and zero rounds
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro AESENC_ROUNDS_x16 5
+%define %%L00_03 %1 ; [in/out] ZMM with one block of lanes 0-3
+%define %%L04_07 %2 ; [in/out] ZMM with one block of lanes 4-7
+%define %%L08_11 %3 ; [in/out] ZMM with one block of lanes 8-11
+%define %%L12_15 %4 ; [in/out] ZMM with one block of lanes 12-15
+%define %%NROUNDS %5 ; [in] number of aes rounds (middle rounds only)
+
+;; Round keys are read from ARG + _aesarg_key_tab: 16*16 bytes per round,
+;; with the four lane groups at byte offsets 0/64/128/192 inside a round.
+%define %%KP ARG + _aesarg_key_tab
+%define %%ROUND_STRIDE (16*16)
+%define K00_03_OFFSET 0
+%define K04_07_OFFSET 64
+%define K08_11_OFFSET 128
+%define K12_15_OFFSET 192
+
+;; ROUND runs from 0 to (%%NROUNDS + 1) and is left at (%%NROUNDS + 2)
+%assign ROUND 0
+%rep (%%NROUNDS + 2)
+
+%if ROUND == 0
+ ;; round 0 - whitening, plain XOR with the first round key
+ vpxorq %%L00_03, %%L00_03, [%%KP + K00_03_OFFSET + ROUND * %%ROUND_STRIDE]
+ vpxorq %%L04_07, %%L04_07, [%%KP + K04_07_OFFSET + ROUND * %%ROUND_STRIDE]
+ vpxorq %%L08_11, %%L08_11, [%%KP + K08_11_OFFSET + ROUND * %%ROUND_STRIDE]
+ vpxorq %%L12_15, %%L12_15, [%%KP + K12_15_OFFSET + ROUND * %%ROUND_STRIDE]
+%elif ROUND <= %%NROUNDS
+ ;; middle rounds 1 .. %%NROUNDS
+ vaesenc %%L00_03, %%L00_03, [%%KP + K00_03_OFFSET + ROUND * %%ROUND_STRIDE]
+ vaesenc %%L04_07, %%L04_07, [%%KP + K04_07_OFFSET + ROUND * %%ROUND_STRIDE]
+ vaesenc %%L08_11, %%L08_11, [%%KP + K08_11_OFFSET + ROUND * %%ROUND_STRIDE]
+ vaesenc %%L12_15, %%L12_15, [%%KP + K12_15_OFFSET + ROUND * %%ROUND_STRIDE]
+%else
+ ;; final round (%%NROUNDS + 1)
+ vaesenclast %%L00_03, %%L00_03, [%%KP + K00_03_OFFSET + ROUND * %%ROUND_STRIDE]
+ vaesenclast %%L04_07, %%L04_07, [%%KP + K04_07_OFFSET + ROUND * %%ROUND_STRIDE]
+ vaesenclast %%L08_11, %%L08_11, [%%KP + K08_11_OFFSET + ROUND * %%ROUND_STRIDE]
+ vaesenclast %%L12_15, %%L12_15, [%%KP + K12_15_OFFSET + ROUND * %%ROUND_STRIDE]
+%endif
+
+%assign ROUND (ROUND + 1)
+%endrep
+
+%endmacro ; AESENC_ROUNDS_x16
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; ENCRYPT_16_PARALLEL - Encode all blocks up to multiple of 4
+; - Operation
+; - loop encrypting %%LENGTH bytes of input data
+; - each loop encrypts 4 blocks across 16 lanes
+; - stop when %%LENGTH is less than 64 bytes (4 blocks)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ENCRYPT_16_PARALLEL 31
+%define %%ZIV00_03 %1 ;; [in] lane 0-3 IVs
+%define %%ZIV04_07 %2 ;; [in] lane 4-7 IVs
+%define %%ZIV08_11 %3 ;; [in] lane 8-11 IVs
+%define %%ZIV12_15 %4 ;; [in] lane 12-15 IVs
+%define %%LENGTH %5 ;; [in/out] GP register with length in bytes
+%define %%NROUNDS %6 ;; [in] Number of AES rounds; numerical value
+%define %%IDX %7 ;; [clobbered] GP reg to maintain idx
+%define %%B0L00_03 %8 ;; [clobbered] tmp ZMM register
+%define %%B0L04_07 %9 ;; [clobbered] tmp ZMM register
+%define %%B0L08_11 %10 ;; [clobbered] tmp ZMM register
+%define %%B0L12_15 %11 ;; [clobbered] tmp ZMM register
+%define %%B1L00_03 %12 ;; [clobbered] tmp ZMM register
+%define %%B1L04_07 %13 ;; [clobbered] tmp ZMM register
+%define %%B1L08_11 %14 ;; [clobbered] tmp ZMM register
+%define %%B1L12_15 %15 ;; [clobbered] tmp ZMM register
+%define %%B2L00_03 %16 ;; [clobbered] tmp ZMM register
+%define %%B2L04_07 %17 ;; [clobbered] tmp ZMM register
+%define %%B2L08_11 %18 ;; [clobbered] tmp ZMM register
+%define %%B2L12_15 %19 ;; [clobbered] tmp ZMM register
+%define %%B3L00_03 %20 ;; [clobbered] tmp ZMM register
+%define %%B3L04_07 %21 ;; [clobbered] tmp ZMM register
+%define %%B3L08_11 %22 ;; [clobbered] tmp ZMM register
+%define %%B3L12_15 %23 ;; [clobbered] tmp ZMM register
+%define %%ZTMP0 %24 ;; [clobbered] tmp ZMM register
+%define %%ZTMP1 %25 ;; [clobbered] tmp ZMM register
+%define %%ZTMP2 %26 ;; [clobbered] tmp ZMM register
+%define %%ZTMP3 %27 ;; [clobbered] tmp ZMM register
+%define %%TMP0 %28 ;; [clobbered] tmp GP register
+%define %%TMP1 %29 ;; [clobbered] tmp GP register
+%define %%TMP2 %30 ;; [clobbered] tmp GP register
+%define %%TMP3 %31 ;; [clobbered] tmp GP register
+
+%define %%IN ARG + _aesarg_in
+%define %%OUT ARG + _aesarg_out
+
+ ;; check for at least 4 blocks
+ cmp %%LENGTH, 64
+ jl %%encrypt_16_done
+
+ xor %%IDX, %%IDX
+ ;; skip length check on first loop
+ jmp %%encrypt_16_first
+
+%%encrypt_16_start:
+ cmp %%LENGTH, 64
+ jl %%encrypt_16_end
+
+%%encrypt_16_first:
+ ;; load 4 plaintext blocks for lanes 0-3
+ LOAD_STORE_x4 0, 1, 2, 3, %%IN, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 4-7
+ LOAD_STORE_x4 4, 5, 6, 7, %%IN, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 8-11
+ LOAD_STORE_x4 8, 9, 10, 11, %%IN, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; load 4 plaintext blocks for lanes 12-15
+ LOAD_STORE_x4 12, 13, 14, 15, %%IN, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, %%TMP3, LOAD
+
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; xor first plaintext block with IV
+ vpxorq %%B0L00_03, %%ZIV00_03
+ vpxorq %%B0L04_07, %%ZIV04_07
+ vpxorq %%B0L08_11, %%ZIV08_11
+ vpxorq %%B0L12_15, %%ZIV12_15
+
+ ;; encrypt block 0 lanes
+ AESENC_ROUNDS_x16 %%B0L00_03, %%B0L04_07, %%B0L08_11, %%B0L12_15, %%NROUNDS
+
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B1L00_03, %%B0L00_03
+ vpxorq %%B1L04_07, %%B0L04_07
+ vpxorq %%B1L08_11, %%B0L08_11
+ vpxorq %%B1L12_15, %%B0L12_15
+
+ ;; encrypt block 1 lanes
+ AESENC_ROUNDS_x16 %%B1L00_03, %%B1L04_07, %%B1L08_11, %%B1L12_15, %%NROUNDS
+
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B2L00_03, %%B1L00_03
+ vpxorq %%B2L04_07, %%B1L04_07
+ vpxorq %%B2L08_11, %%B1L08_11
+ vpxorq %%B2L12_15, %%B1L12_15
+
+ ;; encrypt block 2 lanes
+ AESENC_ROUNDS_x16 %%B2L00_03, %%B2L04_07, %%B2L08_11, %%B2L12_15, %%NROUNDS
+
+ ;; xor plaintext block with last cipher block
+ vpxorq %%B3L00_03, %%B2L00_03
+ vpxorq %%B3L04_07, %%B2L04_07
+ vpxorq %%B3L08_11, %%B2L08_11
+ vpxorq %%B3L12_15, %%B2L12_15
+
+ ;; encrypt block 3 lanes
+ AESENC_ROUNDS_x16 %%B3L00_03, %%B3L04_07, %%B3L08_11, %%B3L12_15, %%NROUNDS
+
+ ;; store last cipher block
+ vmovdqa64 %%ZIV00_03, %%B3L00_03
+ vmovdqa64 %%ZIV04_07, %%B3L04_07
+ vmovdqa64 %%ZIV08_11, %%B3L08_11
+ vmovdqa64 %%ZIV12_15, %%B3L12_15
+
+ ;; write back cipher text for lanes 0-3
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 0, 1, 2, 3, %%OUT, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ ;; write back cipher text for lanes 4-7
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 4, 5, 6, 7, %%OUT, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ ;; write back cipher text for lanes 8-11
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 8, 9, 10, 11, %%OUT, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ ;; write back cipher text for lanes 12-15
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 12, 13, 14, 15, %%OUT, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, %%TMP3, STORE
+
+ sub %%LENGTH, 64
+ add %%IDX, 64
+ jmp %%encrypt_16_start
+
+%%encrypt_16_end:
+ ;; update in/out pointers
+ vpbroadcastq %%ZTMP2, %%IDX
+ vpaddq %%ZTMP0, %%ZTMP2, [%%IN]
+ vpaddq %%ZTMP1, %%ZTMP2, [%%IN + 64]
+ vmovdqa64 [%%IN], %%ZTMP0
+ vmovdqa64 [%%IN + 64], %%ZTMP1
+
+ vpaddq %%ZTMP0, %%ZTMP2, [%%OUT]
+ vpaddq %%ZTMP1, %%ZTMP2, [%%OUT + 64]
+ vmovdqa64 [%%OUT], %%ZTMP0
+ vmovdqa64 [%%OUT + 64], %%ZTMP1
+
+%%encrypt_16_done:
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; ENCRYPT_16_FINAL Encrypts the final 1 to 3 blocks of each of the 16 lanes
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ENCRYPT_16_FINAL 31
+%define %%ZIV00_03 %1 ;; [in/out] lane 0-3 IVs; overwritten with the last cipher block of each lane
+%define %%ZIV04_07 %2 ;; [in/out] lane 4-7 IVs; overwritten with the last cipher block of each lane
+%define %%ZIV08_11 %3 ;; [in/out] lane 8-11 IVs; overwritten with the last cipher block of each lane
+%define %%ZIV12_15 %4 ;; [in/out] lane 12-15 IVs; overwritten with the last cipher block of each lane
+%define %%NROUNDS %5 ;; [in] Number of AES rounds; numerical value
+%define %%IDX %6 ;; [clobbered] GP reg: data offset (0) for load/store, then byte count for the pointer update
+%define %%B0L00_03 %7 ;; [clobbered] ZMM: block 0, lanes 0-3 (once transposed)
+%define %%B0L04_07 %8 ;; [clobbered] ZMM: block 0, lanes 4-7 (once transposed)
+%define %%B0L08_11 %9 ;; [clobbered] ZMM: block 0, lanes 8-11 (once transposed)
+%define %%B0L12_15 %10 ;; [clobbered] ZMM: block 0, lanes 12-15 (once transposed)
+%define %%B1L00_03 %11 ;; [clobbered] ZMM: block 1, lanes 0-3 (once transposed)
+%define %%B1L04_07 %12 ;; [clobbered] ZMM: block 1, lanes 4-7 (once transposed)
+%define %%B1L08_11 %13 ;; [clobbered] ZMM: block 1, lanes 8-11 (once transposed)
+%define %%B1L12_15 %14 ;; [clobbered] ZMM: block 1, lanes 12-15 (once transposed)
+%define %%B2L00_03 %15 ;; [clobbered] ZMM: block 2, lanes 0-3 (once transposed)
+%define %%B2L04_07 %16 ;; [clobbered] ZMM: block 2, lanes 4-7 (once transposed)
+%define %%B2L08_11 %17 ;; [clobbered] ZMM: block 2, lanes 8-11 (once transposed)
+%define %%B2L12_15 %18 ;; [clobbered] ZMM: block 2, lanes 12-15 (once transposed)
+%define %%B3L00_03 %19 ;; [clobbered] ZMM: 4th transpose operand, lanes 0-3 (never encrypted in this macro)
+%define %%B3L04_07 %20 ;; [clobbered] ZMM: 4th transpose operand, lanes 4-7 (never encrypted in this macro)
+%define %%B3L08_11 %21 ;; [clobbered] ZMM: 4th transpose operand, lanes 8-11 (never encrypted in this macro)
+%define %%B3L12_15 %22 ;; [clobbered] ZMM: 4th transpose operand, lanes 12-15 (never encrypted in this macro)
+%define %%ZTMP0 %23 ;; [clobbered] tmp ZMM register
+%define %%ZTMP1 %24 ;; [clobbered] tmp ZMM register
+%define %%ZTMP2 %25 ;; [clobbered] tmp ZMM register
+%define %%ZTMP3 %26 ;; [clobbered] tmp ZMM register
+%define %%TMP0 %27 ;; [clobbered] tmp GP register (also used to build the k1 mask)
+%define %%TMP1 %28 ;; [clobbered] tmp GP register
+%define %%TMP2 %29 ;; [clobbered] tmp GP register
+%define %%TMP3 %30 ;; [clobbered] tmp GP register
+%define %%NUM_BLKS %31 ;; [in] number of blocks (numerical value); only 1, 2 and 3 set up the k1 mask
+
+%define %%IN ARG + _aesarg_in
+%define %%OUT ARG + _aesarg_out
+
+%if %%NUM_BLKS == 1
+ mov %%TMP0, 0x0000_0000_0000_ffff ;; low 16 bits set; presumably 1 bit per byte = 1 block (LOAD_STORE_x4 not shown here)
+ kmovq k1, %%TMP0
+%elif %%NUM_BLKS == 2
+ mov %%TMP0, 0x0000_0000_ffff_ffff ;; low 32 bits set (2 blocks under the same assumption)
+ kmovq k1, %%TMP0
+%elif %%NUM_BLKS == 3
+ mov %%TMP0, 0x0000_ffff_ffff_ffff ;; low 48 bits set (3 blocks under the same assumption)
+ kmovq k1, %%TMP0
+%endif
+ xor %%IDX, %%IDX ;; IDX = 0, passed to LOAD_STORE_x4 for every load and store below
+
+ ;; masked (k1) load of the final plaintext blocks for lanes 0-3
+ LOAD_STORE_x4 0, 1, 2, 3, %%IN, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; masked (k1) load of the final plaintext blocks for lanes 4-7
+ LOAD_STORE_x4 4, 5, 6, 7, %%IN, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; masked (k1) load of the final plaintext blocks for lanes 8-11
+ LOAD_STORE_x4 8, 9, 10, 11, %%IN, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; masked (k1) load of the final plaintext blocks for lanes 12-15
+ LOAD_STORE_x4 12, 13, 14, 15, %%IN, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, LOAD, k1
+
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ ;; CBC chaining: xor block 0 of every lane with that lane's IV
+ vpxorq %%B0L00_03, %%ZIV00_03
+ vpxorq %%B0L04_07, %%ZIV04_07
+ vpxorq %%B0L08_11, %%ZIV08_11
+ vpxorq %%B0L12_15, %%ZIV12_15
+
+ ;; encrypt block 0 lanes
+ AESENC_ROUNDS_x16 %%B0L00_03, %%B0L04_07, %%B0L08_11, %%B0L12_15, %%NROUNDS
+
+%if %%NUM_BLKS == 1
+ ;; block 0 is the last block: keep its cipher text as the next IV
+ vmovdqa64 %%ZIV00_03, %%B0L00_03
+ vmovdqa64 %%ZIV04_07, %%B0L04_07
+ vmovdqa64 %%ZIV08_11, %%B0L08_11
+ vmovdqa64 %%ZIV12_15, %%B0L12_15
+%endif
+
+%if %%NUM_BLKS >= 2
+ ;; CBC chaining: xor plaintext block 1 with cipher block 0
+ vpxorq %%B1L00_03, %%B0L00_03
+ vpxorq %%B1L04_07, %%B0L04_07
+ vpxorq %%B1L08_11, %%B0L08_11
+ vpxorq %%B1L12_15, %%B0L12_15
+
+ ;; encrypt block 1 lanes
+ AESENC_ROUNDS_x16 %%B1L00_03, %%B1L04_07, %%B1L08_11, %%B1L12_15, %%NROUNDS
+%endif
+%if %%NUM_BLKS == 2
+ ;; block 1 is the last block: keep its cipher text as the next IV
+ vmovdqa64 %%ZIV00_03, %%B1L00_03
+ vmovdqa64 %%ZIV04_07, %%B1L04_07
+ vmovdqa64 %%ZIV08_11, %%B1L08_11
+ vmovdqa64 %%ZIV12_15, %%B1L12_15
+%endif
+
+%if %%NUM_BLKS >= 3
+ ;; CBC chaining: xor plaintext block 2 with cipher block 1
+ vpxorq %%B2L00_03, %%B1L00_03
+ vpxorq %%B2L04_07, %%B1L04_07
+ vpxorq %%B2L08_11, %%B1L08_11
+ vpxorq %%B2L12_15, %%B1L12_15
+
+ ;; encrypt block 2 lanes
+ AESENC_ROUNDS_x16 %%B2L00_03, %%B2L04_07, %%B2L08_11, %%B2L12_15, %%NROUNDS
+%endif
+%if %%NUM_BLKS == 3
+ ;; block 2 is the last block: keep its cipher text as the next IV
+ vmovdqa64 %%ZIV00_03, %%B2L00_03
+ vmovdqa64 %%ZIV04_07, %%B2L04_07
+ vmovdqa64 %%ZIV08_11, %%B2L08_11
+ vmovdqa64 %%ZIV12_15, %%B2L12_15
+%endif
+ ;; write back cipher text for lanes 0-3 (transpose back, then store masked with k1)
+ TRANSPOSE_4x4 %%B0L00_03, %%B1L00_03, %%B2L00_03, %%B3L00_03, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 0, 1, 2, 3, %%OUT, %%IDX, %%B0L00_03, %%B1L00_03, \
+ %%B2L00_03, %%B3L00_03, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; write back cipher text for lanes 4-7
+ TRANSPOSE_4x4 %%B0L04_07, %%B1L04_07, %%B2L04_07, %%B3L04_07, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 4, 5, 6, 7, %%OUT, %%IDX, %%B0L04_07, %%B1L04_07, \
+ %%B2L04_07, %%B3L04_07, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; write back cipher text for lanes 8-11
+ TRANSPOSE_4x4 %%B0L08_11, %%B1L08_11, %%B2L08_11, %%B3L08_11, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 8, 9, 10, 11, %%OUT, %%IDX, %%B0L08_11, %%B1L08_11, \
+ %%B2L08_11, %%B3L08_11, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; write back cipher text for lanes 12-15
+ TRANSPOSE_4x4 %%B0L12_15, %%B1L12_15, %%B2L12_15, %%B3L12_15, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3
+
+ LOAD_STORE_x4 12, 13, 14, 15, %%OUT, %%IDX, %%B0L12_15, %%B1L12_15, \
+ %%B2L12_15, %%B3L12_15, %%TMP0, %%TMP1, %%TMP2, \
+ %%TMP3, STORE, k1
+
+ ;; update in/out pointers: add NUM_BLKS * 16 to each of the 16 qwords at %%IN and at %%OUT
+ mov %%IDX, %%NUM_BLKS
+ shl %%IDX, 4 ;; IDX = NUM_BLKS * 16 (bytes processed per lane)
+ vpbroadcastq %%ZTMP2, %%IDX
+ vpaddq %%ZTMP0, %%ZTMP2, [%%IN] ;; first 8 qwords of the input pointer array
+ vpaddq %%ZTMP1, %%ZTMP2, [%%IN + 64] ;; next 8 qwords of the input pointer array
+ vmovdqa64 [%%IN], %%ZTMP0 ;; aligned store: ARG + _aesarg_in must be 64-byte aligned
+ vmovdqa64 [%%IN + 64], %%ZTMP1
+
+ vpaddq %%ZTMP0, %%ZTMP2, [%%OUT] ;; same update for the output pointer array
+ vpaddq %%ZTMP1, %%ZTMP2, [%%OUT + 64]
+ vmovdqa64 [%%OUT], %%ZTMP0 ;; aligned store: ARG + _aesarg_out must be 64-byte aligned
+ vmovdqa64 [%%OUT + 64], %%ZTMP1
+
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CBC_ENC Encodes given data.
+; Requires the input data be at least 1 block (16 bytes) long
+; Input: Number of AES rounds
+;
+; First encrypts blocks 4 at a time per lane, for as many whole groups of 4 as fit
+; Then encrypts the final blocks (less than 4), if any
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CBC_ENC 1
+%define %%ROUNDS %1 ;; [in] number of AES rounds; numerical value forwarded to the ENCRYPT_16_* macros
+
+ ;; load transpose tables
+ vmovdqa64 TAB_A0B0A1B1, [rel A0B0A1B1]
+ vmovdqa64 TAB_A2B2A3B3, [rel A2B2A3B3]
+
+ ;; load IV's per lane (4 x 16-byte IVs per ZMM register)
+ vmovdqa64 ZIV00_03, [ARG + _aesarg_IV + 16*0]
+ vmovdqa64 ZIV04_07, [ARG + _aesarg_IV + 16*4]
+ vmovdqa64 ZIV08_11, [ARG + _aesarg_IV + 16*8]
+ vmovdqa64 ZIV12_15, [ARG + _aesarg_IV + 16*12]
+
+ ;; main loop: consumes 64 bytes (4 blocks) per lane per iteration and decrements LEN
+ ENCRYPT_16_PARALLEL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ LEN, %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, \
+ ZT6, ZT7, ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, \
+ ZT15, ZT16, ZT17, ZT18, ZT19, IA2, IA3, IA4, IA5
+
+ ;; get num remaining blocks
+ shr LEN, 4 ;; remaining bytes -> remaining 16-byte blocks
+ and LEN, 3 ;; keep 0..3; ZF is set when no block is left
+ je %%_cbc_enc_done
+ cmp LEN, 1
+ je %%_final_blocks_1
+ cmp LEN, 2
+ je %%_final_blocks_2
+ ;; otherwise LEN == 3: fall through
+%%_final_blocks_3:
+ ENCRYPT_16_FINAL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, ZT6, ZT7, \
+ ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, ZT15, ZT16, ZT17, \
+ ZT18, ZT19, IA2, IA3, IA4, IA5, 3
+ jmp %%_cbc_enc_done
+%%_final_blocks_1:
+ ENCRYPT_16_FINAL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, ZT6, ZT7, \
+ ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, ZT15, ZT16, ZT17, \
+ ZT18, ZT19, IA2, IA3, IA4, IA5, 1
+ jmp %%_cbc_enc_done
+%%_final_blocks_2:
+ ENCRYPT_16_FINAL ZIV00_03, ZIV04_07, ZIV08_11, ZIV12_15, \
+ %%ROUNDS, IA12, ZT0, ZT1, ZT2, ZT3, ZT4, ZT5, ZT6, ZT7, \
+ ZT8, ZT9, ZT10, ZT11, ZT12, ZT13, ZT14, ZT15, ZT16, ZT17, \
+ ZT18, ZT19, IA2, IA3, IA4, IA5, 2
+%%_cbc_enc_done:
+ ;; store IV's per lane (the last cipher block of each lane, for chaining into the next call)
+ vmovdqa64 [ARG + _aesarg_IV + 16*0], ZIV00_03
+ vmovdqa64 [ARG + _aesarg_IV + 16*4], ZIV04_07
+ vmovdqa64 [ARG + _aesarg_IV + 16*8], ZIV08_11
+ vmovdqa64 [ARG + _aesarg_IV + 16*12], ZIV12_15
+%endmacro
+
+
+section .data
+;;;;;;;;;;;;;;;;;;
+; Transpose tables (loaded into TAB_A0B0A1B1 / TAB_A2B2A3B3 by CBC_ENC)
+;;;;;;;;;;;;;;;;;;
+default rel
+;; Each table holds 8 qword indices; presumably 0x0-0x7 select from operand A and 0x8-0xf from operand B of a two-source qword permute in TRANSPOSE_4x4 (not shown here)
+align 64
+A0B0A1B1:
+ dq 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb ;; qword pairs: A[0..1], B[0..1], A[2..3], B[2..3]
+
+align 64
+A2B2A3B3:
+ dq 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, 0xf ;; qword pairs: A[4..5], B[4..5], A[6..7], B[6..7]
+
+
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_128_vaes_avx512(AES_ARGS *args, uint64_t len_in_bytes);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_enc_128_vaes_avx512,function,internal)
+aes_cbc_enc_128_vaes_avx512:
+ FUNC_SAVE ;; FUNC_SAVE / FUNC_RESTORE are defined outside this chunk
+ CBC_ENC 9 ;; AES-128 variant: 9 is handed to CBC_ENC as its AES round count
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_192_vaes_avx512(AES_ARGS *args, uint64_t len_in_bytes);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_enc_192_vaes_avx512,function,internal)
+aes_cbc_enc_192_vaes_avx512:
+ FUNC_SAVE ;; FUNC_SAVE / FUNC_RESTORE are defined outside this chunk
+ CBC_ENC 11 ;; AES-192 variant: 11 is handed to CBC_ENC as its AES round count
+ FUNC_RESTORE
+ ret
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_cbc_enc_256_vaes_avx512(AES_ARGS *args, uint64_t len_in_bytes);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cbc_enc_256_vaes_avx512,function,internal)
+aes_cbc_enc_256_vaes_avx512:
+ FUNC_SAVE ;; FUNC_SAVE / FUNC_RESTORE are defined outside this chunk
+ CBC_ENC 13 ;; AES-256 variant: 13 is handed to CBC_ENC as its AES round count
+ FUNC_RESTORE
+ ret
+
+%ifdef LINUX ;; ELF builds only: emit the empty GNU-stack note so the object does not request an executable stack
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm
new file mode 100644
index 000000000..50ff86b6e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm
@@ -0,0 +1,1524 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "mb_mgr_datastruct.asm"
+%include "job_aes_hmac.asm"
+%include "include/memcpy.asm"
+
+%include "include/aes_common.asm"
+%include "include/const.inc"
+
+section .data
+default rel
+
+align 16
+ONE: ;; 128-bit little-endian value 1
+ dq 0x0000000000000001, 0x0000000000000000
+
+align 64
+SHUF_MASK: ;; vpshufb control: reverses the 16 bytes of every 128-bit lane
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+align 64
+ddq_add_13_16: ;; per-128-bit-lane increments 13, 14, 15, 16
+ dq 0x000000000000000d, 0x0000000000000000
+ dq 0x000000000000000e, 0x0000000000000000
+ dq 0x000000000000000f, 0x0000000000000000
+ dq 0x0000000000000010, 0x0000000000000000
+
+align 64
+ddq_add_9_12: ;; per-128-bit-lane increments 9, 10, 11, 12
+ dq 0x0000000000000009, 0x0000000000000000
+ dq 0x000000000000000a, 0x0000000000000000
+ dq 0x000000000000000b, 0x0000000000000000
+ dq 0x000000000000000c, 0x0000000000000000
+
+align 64
+ddq_add_5_8: ;; per-128-bit-lane increments 5, 6, 7, 8
+ dq 0x0000000000000005, 0x0000000000000000
+ dq 0x0000000000000006, 0x0000000000000000
+ dq 0x0000000000000007, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+
+align 64
+ddq_add_1_4: ;; per-128-bit-lane increments 1, 2, 3, 4
+ dq 0x0000000000000001, 0x0000000000000000
+ dq 0x0000000000000002, 0x0000000000000000
+ dq 0x0000000000000003, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+
+align 64
+ddq_add_12_15: ;; per-128-bit-lane increments 12, 13, 14, 15
+ dq 0x000000000000000c, 0x0000000000000000
+ dq 0x000000000000000d, 0x0000000000000000
+ dq 0x000000000000000e, 0x0000000000000000
+ dq 0x000000000000000f, 0x0000000000000000
+
+align 64
+ddq_add_8_11: ;; per-128-bit-lane increments 8, 9, 10, 11
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000009, 0x0000000000000000
+ dq 0x000000000000000a, 0x0000000000000000
+ dq 0x000000000000000b, 0x0000000000000000
+
+align 64
+ddq_add_4_7: ;; per-128-bit-lane increments 4, 5, 6, 7
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000005, 0x0000000000000000
+ dq 0x0000000000000006, 0x0000000000000000
+ dq 0x0000000000000007, 0x0000000000000000
+
+align 64
+ddq_add_0_3: ;; per-128-bit-lane increments 0, 1, 2, 3
+ dq 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000001, 0x0000000000000000
+ dq 0x0000000000000002, 0x0000000000000000
+ dq 0x0000000000000003, 0x0000000000000000
+
+align 64
+ddq_add_16: ;; increment of 16 in every 128-bit lane
+ dq 0x0000000000000010, 0x0000000000000000
+ dq 0x0000000000000010, 0x0000000000000000
+ dq 0x0000000000000010, 0x0000000000000000
+ dq 0x0000000000000010, 0x0000000000000000
+
+align 64
+byte64_len_to_mask_table: ;; 65 qword entries: entry n (n = 0..64) has its low n bits set
+ dq 0x0000000000000000, 0x0000000000000001
+ dq 0x0000000000000003, 0x0000000000000007
+ dq 0x000000000000000f, 0x000000000000001f
+ dq 0x000000000000003f, 0x000000000000007f
+ dq 0x00000000000000ff, 0x00000000000001ff
+ dq 0x00000000000003ff, 0x00000000000007ff
+ dq 0x0000000000000fff, 0x0000000000001fff
+ dq 0x0000000000003fff, 0x0000000000007fff
+ dq 0x000000000000ffff, 0x000000000001ffff
+ dq 0x000000000003ffff, 0x000000000007ffff
+ dq 0x00000000000fffff, 0x00000000001fffff
+ dq 0x00000000003fffff, 0x00000000007fffff
+ dq 0x0000000000ffffff, 0x0000000001ffffff
+ dq 0x0000000003ffffff, 0x0000000007ffffff
+ dq 0x000000000fffffff, 0x000000001fffffff
+ dq 0x000000003fffffff, 0x000000007fffffff
+ dq 0x00000000ffffffff, 0x00000001ffffffff
+ dq 0x00000003ffffffff, 0x00000007ffffffff
+ dq 0x0000000fffffffff, 0x0000001fffffffff
+ dq 0x0000003fffffffff, 0x0000007fffffffff
+ dq 0x000000ffffffffff, 0x000001ffffffffff
+ dq 0x000003ffffffffff, 0x000007ffffffffff
+ dq 0x00000fffffffffff, 0x00001fffffffffff
+ dq 0x00003fffffffffff, 0x00007fffffffffff
+ dq 0x0000ffffffffffff, 0x0001ffffffffffff
+ dq 0x0003ffffffffffff, 0x0007ffffffffffff
+ dq 0x000fffffffffffff, 0x001fffffffffffff
+ dq 0x003fffffffffffff, 0x007fffffffffffff
+ dq 0x00ffffffffffffff, 0x01ffffffffffffff
+ dq 0x03ffffffffffffff, 0x07ffffffffffffff
+ dq 0x0fffffffffffffff, 0x1fffffffffffffff
+ dq 0x3fffffffffffffff, 0x7fffffffffffffff
+ dq 0xffffffffffffffff
+
+align 16
+initial_12_IV_counter: ;; byte 15 = 0x01, all other bytes 0; presumably big-endian counter value 1 for 12-byte IVs (usage not shown here)
+ dq 0x0000000000000000, 0x0100000000000000
+
+mask_16_bytes: ;; low 16 bits set
+ dq 0x000000000000ffff
+
+section .text
+default rel
+
+%ifdef LINUX
+%define arg1 rdi ;; first integer argument register, System V AMD64 ABI
+%else
+%define arg1 rcx ;; first integer argument register, Microsoft x64 ABI
+%endif
+;; ZKEYn names are built with "ZKEY %+ j" in the AES round loops below
+%define ZKEY0 zmm17
+%define ZKEY1 zmm18
+%define ZKEY2 zmm19
+%define ZKEY3 zmm20
+%define ZKEY4 zmm21
+%define ZKEY5 zmm22
+%define ZKEY6 zmm23
+%define ZKEY7 zmm24
+%define ZKEY8 zmm25
+%define ZKEY9 zmm26
+%define ZKEY10 zmm27
+%define ZKEY11 zmm28
+%define ZKEY12 zmm29
+%define ZKEY13 zmm30
+%define ZKEY14 zmm31
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Stack frame definition
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, win64
+ %define GP_STORAGE (7*8) ; space for 7 GP registers (presumably the 2 extra slots hold rdi/rsi, callee-saved on Windows)
+%else
+ %define GP_STORAGE (5*8) ; space for 5 GP registers
+%endif
+
+%define STACK_FRAME_SIZE GP_STORAGE ;; the frame holds only the saved GP registers
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to maintain the bits from the output text
+;;; when writing out the output blocks, in case there are some bits
+;;; that do not require encryption
+%macro PRESERVE_BITS 12-13
+%define %%RBITS %1 ; [in] Remaining bits in last byte
+%define %%LENGTH %2 ; [in] Length of the last set of blocks
+%define %%CYPH_PLAIN_OUT %3 ; [in] Pointer to output buffer
+%define %%ZIN_OUT %4 ; [in/out] ZMM with last set of output blocks
+%define %%ZTMP0 %5 ; [clobbered] ZMM temporary (receives the current output buffer contents)
+%define %%ZTMP1 %6 ; [clobbered] ZMM temporary (receives the bit mask for the partial byte)
+%define %%ZTMP2 %7 ; [clobbered] ZMM temporary
+%define %%IA0 %8 ; [clobbered] GP temporary
+%define %%IA1 %9 ; [clobbered] GP temporary
+%define %%blocks_to_skip %10 ; [in] Number of blocks to skip from output
+%define %%FULL_PARTIAL %11 ; [in] Last block type selection "full" or "partial"
+%define %%MASKREG %12 ; [in] Mask register (used for the masked load of the output)
+%define %%DATA_OFFSET %13 ; [in] Data offset (optional 13th argument; only read here)
+%define %%NUM_ARGS %0
+
+;; offset = number of sets of 4 blocks to skip (NOTE: plain %assign, not macro-local)
+%assign offset (((%%blocks_to_skip) / 4) * 64)
+;; num_left_blocks = number of blocks in the last set (NOTE: plain %assign, not macro-local)
+%assign num_left_blocks (((%%blocks_to_skip) & 3) + 1) ;; Range 1-4 blocks
+
+%if %%NUM_ARGS == 13
+ ;; Load output to get last partial byte
+%ifidn %%FULL_PARTIAL, partial
+ vmovdqu8 %%ZTMP0{%%MASKREG}, [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + offset]
+%else
+ vmovdqu8 %%ZTMP0, [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + offset]
+%endif ; %%FULL_PARTIAL == partial
+%else
+ ;; Load output to get last partial byte (loading up to the last 4 blocks)
+ ZMM_LOAD_MASKED_BLOCKS_0_16 num_left_blocks, %%CYPH_PLAIN_OUT, offset, \
+ %%ZTMP0, no_zmm, no_zmm, no_zmm, %%MASKREG
+%endif ;; %%NUM_ARGS == 13
+
+ ;; Save RCX in temporary GP register (CL is needed for the variable shift count)
+ mov %%IA0, rcx
+ mov DWORD(%%IA1), 0xff
+ mov cl, BYTE(%%RBITS)
+ shr DWORD(%%IA1), cl ;; e.g. 3 remaining bits -> mask = 00011111
+ mov rcx, %%IA0
+
+ vmovq XWORD(%%ZTMP1), %%IA1
+
+ ;; Get number of full bytes in last block.
+ ;; Subtracting the bytes in the blocks to skip from the length of the whole
+ ;; set of blocks gives us the number of bytes in the last block,
+ ;; but the last block has a partial byte at the end, so an extra byte
+ ;; needs to be subtracted
+ mov %%IA1, %%LENGTH
+ sub %%IA1, (%%blocks_to_skip * 16 + 1)
+ XVPSLLB XWORD(%%ZTMP1), %%IA1, XWORD(%%ZTMP2), %%IA0 ;; presumably shifts the mask byte left by IA1 bytes (XVPSLLB not shown here)
+%if num_left_blocks == 4
+ vshufi64x2 %%ZTMP1, %%ZTMP1, %%ZTMP1, 0x15 ;; result lanes 0-3 <- source lanes 1,1,1,0: lane 0 moves to lane 3
+%elif num_left_blocks == 3
+ vshufi64x2 %%ZTMP1, %%ZTMP1, %%ZTMP1, 0x45 ;; result lanes 0-3 <- source lanes 1,1,0,1: lane 0 moves to lane 2
+%elif num_left_blocks == 2
+ vshufi64x2 %%ZTMP1, %%ZTMP1, %%ZTMP1, 0x51 ;; result lanes 0-3 <- source lanes 1,0,1,1: lane 0 moves to lane 1
+%endif ;; No need to shift if there is only one block
+
+ ;; At this point, ZTMP1 contains a mask with all 0s, but with some ones
+ ;; in the partial byte
+
+ ;; First, clear the last bits (not to be ciphered) of the last output block
+ ;; %%ZIN_OUT = %%ZIN_OUT AND NOT %%ZTMP1 (0x50 = andA!C)
+ vpternlogq %%ZIN_OUT, %%ZTMP1, %%ZTMP1, 0x50
+
+ ;; Then, set these last bits to the last bits coming from the output
+ ;; %%ZIN_OUT = %%ZIN_OUT OR (%%ZTMP0 AND %%ZTMP1) (0xF8 = orAandBC)
+ vpternlogq %%ZIN_OUT, %%ZTMP0, %%ZTMP1, 0xF8
+
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to "warm-up" pipeline for ENCRYPT_16_PARALLEL
+;;; macro code. It is called only for data lengths 256 and above.
+;;; The flow is as follows:
+;;; - encrypt the initial %%num_initial_blocks blocks (can be 0)
+;;; - encrypt the next 16 blocks
+;;; - the last 16th block can be partial (lengths between 257 and 367)
+;;; - partial block ciphering is handled within this macro
+
+%macro INITIAL_BLOCKS 26
+%define %%KEY %1 ; [in] pointer to key (not referenced in this macro; round keys are read from ZKEY0..ZKEYn)
+%define %%CYPH_PLAIN_OUT %2 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] input buffer
+%define %%LENGTH %4 ; [in/out] number of bytes to process
+%define %%DATA_OFFSET %5 ; [in/out] data offset
+%define %%num_initial_blocks %6 ; [in] can be between 0 and 15
+%define %%CTR %7 ; [in/clobbered] XMM first counter block (overwritten; its ZMM alias is written too)
+%define %%CTR_1_4 %8 ; [out] ZMM next 1-4 counter blocks
+%define %%CTR_5_8 %9 ; [out] ZMM next 5-8 counter blocks
+%define %%CTR_9_12 %10 ; [out] ZMM next 9-12 counter blocks
+%define %%CTR_13_16 %11 ; [out] ZMM next 13-16 counter blocks
+%define %%ZT1 %12 ; [clobbered] ZMM temporary
+%define %%ZT2 %13 ; [clobbered] ZMM temporary
+%define %%ZT3 %14 ; [clobbered] ZMM temporary
+%define %%ZT4 %15 ; [clobbered] ZMM temporary
+%define %%ZT5 %16 ; [clobbered] ZMM temporary
+%define %%ZT6 %17 ; [clobbered] ZMM temporary
+%define %%ZT7 %18 ; [clobbered] ZMM temporary
+%define %%ZT8 %19 ; [clobbered] ZMM temporary
+%define %%IA0 %20 ; [clobbered] GP temporary
+%define %%IA1 %21 ; [clobbered] GP temporary
+%define %%MASKREG %22 ; [clobbered] mask register
+%define %%SHUFREG %23 ; [in] ZMM register with shuffle mask
+%define %%NROUNDS %24 ; [in] number of rounds; numerical value
+%define %%CNTR_TYPE %25 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+%define %%RBITS %26 ; [in] Number of remaining bits in last byte
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T3 XWORD(%%ZT3)
+%define %%T4 XWORD(%%ZT4)
+%define %%T5 XWORD(%%ZT5)
+%define %%T6 XWORD(%%ZT6)
+%define %%T7 XWORD(%%ZT7)
+%define %%T8 XWORD(%%ZT8)
+
+%ifidn %%CNTR_TYPE, CNTR
+%define %%VPADD vpaddd ;; CNTR: counters are incremented with 32-bit adds
+%else
+%define %%VPADD vpaddq ;; any other type (CNTR_BIT): 64-bit adds
+%endif
+
+%if %%num_initial_blocks > 0
+ ;; load plain/cipher text
+ ZMM_LOAD_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, 0, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, load_4_instead_of_3
+
+ ;; prepare AES counter blocks (CTR + 0, CTR + 1, ... in ZT1-ZT4)
+%if %%num_initial_blocks > 1
+%if %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%ZT1), YWORD(%%CTR), YWORD(%%CTR), 0
+ %%VPADD YWORD(%%ZT1), YWORD(%%ZT1), [rel ddq_add_0_3]
+%elif %%num_initial_blocks <= 4
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+%elif %%num_initial_blocks <= 8
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ %%VPADD %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+%elif %%num_initial_blocks <= 12
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ %%VPADD %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+ %%VPADD %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ %%VPADD %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+ %%VPADD %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11]
+ %%VPADD %%ZT4, ZWORD(%%CTR), [rel ddq_add_12_15]
+%endif
+%endif
+
+ ;; save the last counter block used by the initial blocks into %%CTR
+ ;; and byte-shuffle the counter blocks for the AES rounds
+%if %%num_initial_blocks == 1
+ vpshufb %%T1, %%CTR, XWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 2
+ vextracti32x4 %%CTR, YWORD(%%ZT1), 1
+ vpshufb YWORD(%%ZT1), YWORD(%%SHUFREG)
+%elif %%num_initial_blocks <= 4
+ vextracti32x4 %%CTR, %%ZT1, (%%num_initial_blocks - 1)
+ vpshufb %%ZT1, %%SHUFREG
+%elif %%num_initial_blocks == 5
+ vmovdqa64 %%CTR, %%T2
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%T2, XWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 6
+ vextracti32x4 %%CTR, YWORD(%%ZT2), 1
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb YWORD(%%ZT2), YWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 7
+ vextracti32x4 %%CTR, %%ZT2, 2
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+%elif %%num_initial_blocks == 8
+ vextracti32x4 %%CTR, %%ZT2, 3
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+%elif %%num_initial_blocks == 9
+ vmovdqa64 %%CTR, %%T3
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%T3, XWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 10
+ vextracti32x4 %%CTR, YWORD(%%ZT3), 1
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb YWORD(%%ZT3), YWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 11
+ vextracti32x4 %%CTR, %%ZT3, 2
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%ZT3, %%SHUFREG
+%elif %%num_initial_blocks == 12
+ vextracti32x4 %%CTR, %%ZT3, 3
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%ZT3, %%SHUFREG
+%elif %%num_initial_blocks == 13
+ vmovdqa64 %%CTR, %%T4
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%ZT3, %%SHUFREG
+ vpshufb %%T4, XWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 14
+ vextracti32x4 %%CTR, YWORD(%%ZT4), 1
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%ZT3, %%SHUFREG
+ vpshufb YWORD(%%ZT4), YWORD(%%SHUFREG)
+%elif %%num_initial_blocks == 15
+ vextracti32x4 %%CTR, %%ZT4, 2
+ vpshufb %%ZT1, %%SHUFREG
+ vpshufb %%ZT2, %%SHUFREG
+ vpshufb %%ZT3, %%SHUFREG
+ vpshufb %%ZT4, %%SHUFREG
+%endif
+
+ ;; AES rounds and XOR with plain/cipher text (steps j = 0 .. NROUNDS + 1, one ZKEYj each)
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESENC_ROUND_BLOCKS_0_16 \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%num_initial_blocks, \
+ %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; write cipher/plain text back to output
+ ZMM_STORE_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, 0, \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4
+
+ ;; adjust data offset and length
+ sub %%LENGTH, (%%num_initial_blocks * 16)
+ add %%DATA_OFFSET, (%%num_initial_blocks * 16)
+%endif ; %%num_initial_blocks > 0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; - cipher of %%num_initial_blocks is done
+ ;; - prepare counter blocks for the next 16 blocks (%%CTR_1_4 .. %%CTR_13_16)
+ ;; - shuffle the blocks for AES
+ ;; - encrypt the next 16 blocks
+
+ ;; get text load/store mask (assume full mask by default)
+ mov %%IA0, 0xffff_ffff_ffff_ffff
+%if %%num_initial_blocks > 0
+ ;; NOTE: 'jge' is always taken for %%num_initial_blocks = 0
+ ;; This macro is executed for length 256 and up,
+ ;; zero length is checked in CNTR_ENC_DEC.
+ ;; We know there is partial block if:
+ ;; LENGTH - 16*num_initial_blocks < 256
+ cmp %%LENGTH, 256
+ jge %%_initial_partial_block_continue
+ mov %%IA1, rcx ;; save rcx: CL is needed for the variable shift
+ mov rcx, 256
+ sub rcx, %%LENGTH
+ shr %%IA0, cl ;; keep the low (LENGTH - 192) mask bits; assumes 192 < LENGTH < 256 here - TODO confirm against caller
+ mov rcx, %%IA1
+%%_initial_partial_block_continue:
+%endif
+ kmovq %%MASKREG, %%IA0
+ ;; load plain or cipher text (only the 4th 64-byte chunk is masked, zeroing masked-off bytes)
+ vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT6, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+ vmovdqu8 %%ZT7, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 128]
+ vmovdqu8 %%ZT8{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 192]
+
+ ;; prepare next counter blocks
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+%if %%num_initial_blocks > 0
+ vpaddd %%CTR_1_4, ZWORD(%%CTR), [rel ddq_add_1_4] ;; %%CTR was already consumed above: continue from CTR + 1
+ vpaddd %%CTR_5_8, ZWORD(%%CTR), [rel ddq_add_5_8]
+ vpaddd %%CTR_9_12, ZWORD(%%CTR), [rel ddq_add_9_12]
+ vpaddd %%CTR_13_16, ZWORD(%%CTR), [rel ddq_add_13_16]
+%else
+ vpaddd %%CTR_1_4, ZWORD(%%CTR), [rel ddq_add_0_3] ;; no initial blocks: %%CTR itself is the first counter
+ vpaddd %%CTR_5_8, ZWORD(%%CTR), [rel ddq_add_4_7]
+ vpaddd %%CTR_9_12, ZWORD(%%CTR), [rel ddq_add_8_11]
+ vpaddd %%CTR_13_16, ZWORD(%%CTR), [rel ddq_add_12_15]
+%endif
+ ;; NOTE(review): vpaddd is used here for every %%CNTR_TYPE, unlike %%VPADD above - confirm intended
+ vpshufb %%ZT1, %%CTR_1_4, %%SHUFREG
+ vpshufb %%ZT2, %%CTR_5_8, %%SHUFREG
+ vpshufb %%ZT3, %%CTR_9_12, %%SHUFREG
+ vpshufb %%ZT4, %%CTR_13_16, %%SHUFREG
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESENC_ROUND_BLOCKS_0_16 \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, 16, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; check if this is the end of the message
+ cmp %%LENGTH, 256
+ jg %%store_output
+ ;; Check if there is a partial byte
+ or %%RBITS, %%RBITS
+ jz %%store_output
+
+ ;; Copy the bits that are not ciphered from the output text,
+ ;; into the last bits of the output block, before writing it out
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, 15, partial, %%MASKREG, %%DATA_OFFSET
+
+%endif
+
+%%store_output:
+ ;; write cipher/plain text back to output (4th chunk masked like the load)
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 128], %%ZT3
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 192]{%%MASKREG}, %%ZT4
+
+ ;; check if there is partial block
+ cmp %%LENGTH, 256
+ jl %%_initial_partial_done
+ ;; adjust offset and length
+ add %%DATA_OFFSET, 256
+ sub %%LENGTH, 256
+ jmp %%_initial_blocks_done
+%%_initial_partial_done:
+ ;; zero the length (all encryption is complete)
+ xor %%LENGTH, %%LENGTH
+%%_initial_blocks_done:
+
+%endmacro ; INITIAL_BLOCKS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
+;;; It may look similar to INITIAL_BLOCKS but its usage is different:
+;;; - It is not meant to cipher counter blocks for the main by16 loop.
+;;; It just ciphers the requested number of blocks.
+;;; - It is used for small packets (<256 bytes)
+;;;
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count.
+%macro INITIAL_BLOCKS_PARTIAL 21
+%define %%KEY %1 ; [in] key pointer (not referenced in this macro; round keys are read from the ZKEYn registers)
+%define %%CYPH_PLAIN_OUT %2 ; [in] text out pointer
+%define %%PLAIN_CYPH_IN %3 ; [in] text in pointer
+%define %%LENGTH %4 ; [in/clobbered] length in bytes
+%define %%num_initial_blocks %5 ; [in] can be from 1 to 16 (not 0)
+%define %%CTR %6 ; [in/out] current counter value (replicated into all 128-bit lanes when more than 2 blocks are requested)
+%define %%ZT1 %7 ; [clobbered] ZMM temporary
+%define %%ZT2 %8 ; [clobbered] ZMM temporary
+%define %%ZT3 %9 ; [clobbered] ZMM temporary
+%define %%ZT4 %10 ; [clobbered] ZMM temporary
+%define %%ZT5 %11 ; [clobbered] ZMM temporary
+%define %%ZT6 %12 ; [clobbered] ZMM temporary
+%define %%ZT7 %13 ; [clobbered] ZMM temporary
+%define %%ZT8 %14 ; [clobbered] ZMM temporary
+%define %%IA0 %15 ; [clobbered] GP temporary
+%define %%IA1 %16 ; [clobbered] GP temporary
+%define %%MASKREG %17 ; [clobbered] mask register
+%define %%SHUFREG %18 ; [in] ZMM register with shuffle mask
+%define %%NROUNDS %19 ; [in] number of rounds; numerical value
+%define %%CNTR_TYPE %20 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+%define %%RBITS %21 ; [in] Number of remaining bits in last byte
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T3 XWORD(%%ZT3)
+%define %%T4 XWORD(%%ZT4)
+%define %%T5 XWORD(%%ZT5)
+%define %%T6 XWORD(%%ZT6)
+%define %%T7 XWORD(%%ZT7)
+%define %%T8 XWORD(%%ZT8)
+
+ ;; get load/store mask for the last 64-byte chunk: the table index is LENGTH minus the 64-byte chunks that precede it
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+%if %%num_initial_blocks > 12
+ sub %%IA1, 192
+%elif %%num_initial_blocks > 8
+ sub %%IA1, 128
+%elif %%num_initial_blocks > 4
+ sub %%IA1, 64
+%endif
+ kmovq %%MASKREG, [%%IA0 + %%IA1*8]
+
+ ;; load plain/cipher text into %%ZT5-%%ZT8 (masked load helper)
+ ZMM_LOAD_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, 0, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%MASKREG
+
+ ;; prepare AES counter blocks: one block copies %%CTR as is, otherwise lane 0 of %%CTR is replicated and the ddq_add_* constants are added
+%if %%num_initial_blocks == 1
+ vmovdqa64 XWORD(%%ZT1), XWORD(%%CTR)
+%elif %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%ZT1), YWORD(%%CTR), YWORD(%%CTR), 0
+ vpaddd YWORD(%%ZT1), YWORD(%%ZT1), [rel ddq_add_0_3]
+%elif %%num_initial_blocks <= 4
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+%elif %%num_initial_blocks <= 8
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ vpaddd %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+%elif %%num_initial_blocks <= 12
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ vpaddd %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3]
+ vpaddd %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7]
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11]
+ vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_12_15]
+%endif
+
+ ;; shuffle the counters for AES rounds (in place, %%ZT1-%%ZT4 are both source and destination)
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, \
+ %%SHUFREG, %%SHUFREG, %%SHUFREG, %%SHUFREG
+
+ ;; AES rounds and XOR with plain/cipher text (the helper is invoked NROUNDS + 2 times, once per ZKEYj register)
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESENC_ROUND_BLOCKS_0_16 \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%num_initial_blocks, \
+ %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; Check if there is a partial byte (RBITS == 0 means the message ends on a byte boundary)
+ or %%RBITS, %%RBITS
+ jz %%store_output
+
+ ;; Copy the bits that are not ciphered from the output text,
+ ;; into the last bits of the output block, before writing it out.
+ ;; The ZMM passed to PRESERVE_BITS is the one holding the last block.
+%if %%num_initial_blocks <= 4
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT1, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, (%%num_initial_blocks - 1), \
+ partial, %%MASKREG
+%elif %%num_initial_blocks <= 8
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT2, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, (%%num_initial_blocks - 1), \
+ partial, %%MASKREG
+%elif %%num_initial_blocks <= 12
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT3, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, (%%num_initial_blocks - 1), \
+ partial, %%MASKREG
+%else
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, (%%num_initial_blocks - 1), \
+ partial, %%MASKREG
+%endif
+
+%endif
+
+%%store_output:
+ ;; write cipher/plain text back to output (masked store helper, same mask as the load)
+ ZMM_STORE_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, 0, \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%MASKREG
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main CNTR macro
+;;; - operates on single stream
+;;; - encrypts 16 blocks at a time ("full"), or 15 blocks plus a masked last block ("partial")
+%macro ENCRYPT_16_PARALLEL 26
+%define %%KEY %1 ; [in] key pointer (not referenced in this macro; round keys are read from the ZKEYn registers)
+%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%CTR_1_4 %5 ; [in/out] ZMM next 1-4 counter blocks
+%define %%CTR_5_8 %6 ; [in/out] ZMM next 5-8 counter blocks
+%define %%CTR_9_12 %7 ; [in/out] ZMM next 9-12 counter blocks
+%define %%CTR_13_16 %8 ; [in/out] ZMM next 13-16 counter blocks
+%define %%FULL_PARTIAL %9 ; [in] last block type selection "full" or "partial"
+%define %%IA0 %10 ; [clobbered] temporary GP register
+%define %%IA1 %11 ; [clobbered] temporary GP register
+%define %%LENGTH %12 ; [in] length
+%define %%ZT1 %13 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT2 %14 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT3 %15 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT4 %16 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT5 %17 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT6 %18 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT7 %19 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT8 %20 ; [clobbered] temporary ZMM (cipher)
+%define %%MASKREG %21 ; [clobbered] mask register for partial loads/stores
+%define %%SHUFREG %22 ; [in] ZMM register with shuffle mask
+%define %%ADD8REG %23 ; [in] ZMM register with the per-call counter increment (CNTR_ENC_DEC loads ddq_add_16 here, despite the ADD8 name)
+%define %%NROUNDS %24 ; [in] number of rounds; numerical value
+%define %%CNTR_TYPE %25 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+%define %%RBITS %26 ; [in] Number of remaining bits in last byte
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; load/store mask (partial case) and load the text data
+%ifidn %%FULL_PARTIAL, full
+ vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT6, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+ vmovdqu8 %%ZT7, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 128]
+ vmovdqu8 %%ZT8, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 192]
+%else
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+ sub %%IA1, (3*64) ; bytes left for the 4th 64-byte chunk
+ kmovq %%MASKREG, [%%IA0 + 8*%%IA1]
+ vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT6, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+ vmovdqu8 %%ZT7, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 128]
+ vmovdqu8 %%ZT8{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 192]
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; populate counter blocks
+ ;; %%CTR_x_y registers are advanced in place and stay in unshuffled form;
+ ;; the shuffled copies fed to the AES rounds go into %%ZT1-%%ZT4
+ vpaddd %%CTR_1_4, %%CTR_1_4, %%ADD8REG
+ vpaddd %%CTR_5_8, %%CTR_5_8, %%ADD8REG
+ vpaddd %%CTR_9_12, %%CTR_9_12, %%ADD8REG
+ vpaddd %%CTR_13_16, %%CTR_13_16, %%ADD8REG
+ vpshufb %%ZT1, %%CTR_1_4, %%SHUFREG
+ vpshufb %%ZT2, %%CTR_5_8, %%SHUFREG
+ vpshufb %%ZT3, %%CTR_9_12, %%SHUFREG
+ vpshufb %%ZT4, %%CTR_13_16, %%SHUFREG
+
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESENC_ROUND_BLOCKS_0_16 \
+ %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \
+ %%ZT5, %%ZT6, %%ZT7, %%ZT8, 16, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ ;; Check if this is the last iteration (no more than 256 bytes left)
+ cmp %%LENGTH, 256
+ jg %%store_output
+ ;; Check if there is a partial byte
+ or %%RBITS, %%RBITS
+ jz %%store_output
+
+ ;; Copy the bits that are not ciphered from the output text,
+ ;; into the last bits of the output block, before writing it out
+ PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%IA0, %%IA1, 15, %%FULL_PARTIAL, %%MASKREG, %%DATA_OFFSET
+
+%endif
+
+%%store_output:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; store the text data (the partial case masks the last 64-byte store)
+%ifidn %%FULL_PARTIAL, full
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 128], %%ZT3
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 192], %%ZT4
+%else
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 128], %%ZT3
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 192]{%%MASKREG}, %%ZT4
+%endif
+
+%endmacro ; ENCRYPT_16_PARALLEL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Save register content for the caller: builds a 64-byte aligned frame and stores r12, r13, (r14), the original rsp and, on win64, rdi/rsi
+%macro FUNC_SAVE 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+ mov rax, rsp ; remember the incoming stack pointer
+
+ sub rsp, STACK_FRAME_SIZE
+ and rsp, ~63 ; round the frame base down to a 64-byte boundary
+
+ mov [rsp + 0*8], r12
+ mov [rsp + 1*8], r13
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov [rsp + 2*8], r14 ; r14 is only saved for CNTR_BIT (CNTR_ENC_DEC maps %%RBITS to r14)
+%endif
+ mov [rsp + 3*8], rax ; stack (original rsp, reloaded by FUNC_RESTORE)
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 4*8], rdi
+ mov [rsp + 5*8], rsi
+%endif
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Restore register content for the caller (reads back the slots written by FUNC_SAVE; must be given the same CNTR type)
+%macro FUNC_RESTORE 1
+%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+
+ vzeroupper
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [rsp + 4*8]
+ mov rsi, [rsp + 5*8]
+%endif
+ mov r12, [rsp + 0*8]
+ mov r13, [rsp + 1*8]
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov r14, [rsp + 2*8]
+%endif
+ mov rsp, [rsp + 3*8] ; stack (must be last: every slot above is addressed through the frame rsp)
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Cipher payloads shorter than 256 bytes
+;;; - number of blocks in the message comes as argument (run-time value in a GP register)
+;;; - depending on the number of blocks an optimized variant of
+;;; INITIAL_BLOCKS_PARTIAL is invoked (one expansion per block count, 1 to 16)
+%macro CNTR_ENC_DEC_SMALL 21
+%define %%KEY %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] input buffer
+%define %%LENGTH %4 ; [in] data length
+%define %%NUM_BLOCKS %5 ; [in] number of blocks to process 1 to 16
+%define %%CTR %6 ; [in/out] XMM counter block
+%define %%ZTMP1 %7 ; [clobbered] ZMM register
+%define %%ZTMP2 %8 ; [clobbered] ZMM register
+%define %%ZTMP3 %9 ; [clobbered] ZMM register
+%define %%ZTMP4 %10 ; [clobbered] ZMM register
+%define %%ZTMP5 %11 ; [clobbered] ZMM register
+%define %%ZTMP6 %12 ; [clobbered] ZMM register
+%define %%ZTMP7 %13 ; [clobbered] ZMM register
+%define %%ZTMP8 %14 ; [clobbered] ZMM register
+%define %%IA0 %15 ; [clobbered] GP register
+%define %%IA1 %16 ; [clobbered] GP register
+%define %%MASKREG %17 ; [clobbered] mask register
+%define %%SHUFREG %18 ; [in] ZMM register with shuffle mask
+%define %%NROUNDS %19 ; [in] number of rounds; numerical value
+%define %%CNTR_TYPE %20 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+%define %%RBITS %21 ; [in] Number of remaining bits in last byte
+
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ jl %%_small_initial_blocks_is_1_7
+
+ ; Initial blocks 9-16
+ cmp %%NUM_BLOCKS, 12
+ je %%_small_initial_num_blocks_is_12
+ jl %%_small_initial_blocks_is_9_11
+
+ ; Initial blocks 13-16
+ cmp %%NUM_BLOCKS, 16
+ je %%_small_initial_num_blocks_is_16
+ cmp %%NUM_BLOCKS, 15
+ je %%_small_initial_num_blocks_is_15
+ cmp %%NUM_BLOCKS, 14
+ je %%_small_initial_num_blocks_is_14
+ cmp %%NUM_BLOCKS, 13
+ je %%_small_initial_num_blocks_is_13
+
+%%_small_initial_blocks_is_9_11:
+ cmp %%NUM_BLOCKS, 11
+ je %%_small_initial_num_blocks_is_11
+ cmp %%NUM_BLOCKS, 10
+ je %%_small_initial_num_blocks_is_10
+ cmp %%NUM_BLOCKS, 9
+ je %%_small_initial_num_blocks_is_9
+
+%%_small_initial_blocks_is_1_7:
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ jl %%_small_initial_blocks_is_1_3
+
+ ; Initial blocks 5-7
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ cmp %%NUM_BLOCKS, 5
+ je %%_small_initial_num_blocks_is_5
+
+%%_small_initial_blocks_is_1_3:
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+
+ jmp %%_small_initial_num_blocks_is_1
+
+
+%%_small_initial_num_blocks_is_16:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 16, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_15:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 15, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_14:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 14, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_13:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 13, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_12:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 12, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_11:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 11, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_10:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 10, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_9:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 9, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+%%_small_initial_num_blocks_is_8:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 8, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_7:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 7, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_6:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 6, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_5:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 5, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_4:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 4, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_3:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 3, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_2:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 2, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_1:
+ INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, 1, \
+ %%CTR, \
+ %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \
+ %%ZTMP6, %%ZTMP7, %%ZTMP8, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; CNTR_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CNTR_ENC_DEC Encodes/Decodes given data.
+; A zero-length message is skipped: the macro jumps straight to %%_enc_dec_done.
+; Input: job structure and number of AES rounds
+; Output: job structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CNTR_ENC_DEC 3
+%define %%JOB %1 ; [in/out] job
+%define %%NROUNDS %2 ; [in] number of rounds; numerical value
+%define %%CNTR_TYPE %3 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT)
+
+%define %%KEY rax
+%define %%CYPH_PLAIN_OUT rdx
+%define %%PLAIN_CYPH_IN r8
+%define %%LENGTH r9
+%define %%DATA_OFFSET r13
+%define %%RBITS r14
+
+%define %%IA0 r10
+%define %%IA1 r11
+%define %%IA2 r12
+
+%define %%CTR_BLOCKx xmm0
+%define %%CTR_BLOCK_1_4 zmm1
+%define %%CTR_BLOCK_5_8 zmm2
+%define %%CTR_BLOCK_9_12 zmm3
+%define %%CTR_BLOCK_13_16 zmm4
+
+%define %%ZTMP0 zmm5
+%define %%ZTMP1 zmm6
+%define %%ZTMP2 zmm7
+%define %%ZTMP3 zmm8
+%define %%ZTMP4 zmm9
+%define %%ZTMP5 zmm10
+%define %%ZTMP6 zmm11
+%define %%ZTMP7 zmm12
+%define %%SHUFREG zmm13
+%define %%ADD8REG zmm14
+
+%define %%MASKREG k1
+
+;;; Macro flow:
+;;; - calculate the number of 16byte blocks in the message (messages shorter than 256 bytes go to CNTR_ENC_DEC_SMALL)
+;;; - process (number of 16byte blocks) mod 16 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+;;; - process 16x16 byte blocks at a time until all are done in %%_encrypt_by_16 / %%_encrypt_by_16_partial
+
+ mov %%LENGTH, [%%JOB + _msg_len_to_cipher]
+ ;; calculate len
+ ;; convert bits to bytes (message length in bits for CNTR_BIT)
+%ifidn %%CNTR_TYPE, CNTR_BIT
+ mov %%RBITS, %%LENGTH
+ add %%LENGTH, 7
+ shr %%LENGTH, 3 ; LENGTH will hold number of bytes (including partial byte)
+ and %%RBITS, 7 ; Get remainder bits in last byte (0-7)
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ cmp %%LENGTH, 0
+%else
+ or %%LENGTH, %%LENGTH
+%endif
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+
+ mov %%PLAIN_CYPH_IN, [%%JOB + _src]
+ add %%PLAIN_CYPH_IN, [%%JOB + _cipher_start_src_offset_in_bytes]
+ mov %%CYPH_PLAIN_OUT, [%%JOB + _dst]
+ mov %%KEY, [%%JOB + _aes_enc_key_expanded]
+
+ ;; Prepare round keys: broadcast all NROUNDS + 2 of them, 16 bytes apart, into ZKEY0..ZKEY(NROUNDS+1)
+%assign i 0
+%rep (%%NROUNDS + 2)
+ vbroadcastf64x2 ZKEY %+ i, [%%KEY + 16*i]
+%assign i (i + 1)
+%endrep
+
+ mov %%IA1, [%%JOB + _iv]
+%ifidn %%CNTR_TYPE, CNTR
+ ;; Prepare initial mask to read 12 IV bytes
+ mov %%IA0, 0x0000_0000_0000_0fff
+ vmovdqa %%CTR_BLOCKx, [rel initial_12_IV_counter]
+ mov %%IA2, [%%JOB + _iv_len_in_bytes]
+ test %%IA2, 16
+ ;; Set mask to read 16 IV bytes if iv_len = 16
+ cmovnz %%IA0, [rel mask_16_bytes]
+
+ kmovq %%MASKREG, %%IA0
+ vmovdqu8 %%CTR_BLOCKx{%%MASKREG}, [%%IA1]
+%else ;; CNTR_BIT
+ ;; Read the full 16 bytes of IV
+ vmovdqu8 %%CTR_BLOCKx, [%%IA1]
+%endif ;; CNTR/CNTR_BIT
+
+ vmovdqa64 %%SHUFREG, [rel SHUF_MASK]
+ ;; store IV as counter in LE format
+ vpshufb %%CTR_BLOCKx, XWORD(%%SHUFREG)
+
+ ;; Determine how many blocks to process in INITIAL
+ mov %%IA1, %%LENGTH
+ shr %%IA1, 4
+ and %%IA1, 0xf
+
+ ;; Process one additional block in INITIAL if there is a partial block
+ mov %%IA0, %%LENGTH
+ and %%IA0, 0xf
+ add %%IA0, 0xf
+ shr %%IA0, 4
+ add %%IA1, %%IA0
+ ;; %%IA1 can be in the range from 0 to 16
+
+ ;; Less than 256B will be handled by the small message code, which
+ ;; can process up to 16 x blocks (16 bytes each)
+ cmp %%LENGTH, 256
+ jge %%_large_message_path
+
+ CNTR_ENC_DEC_SMALL \
+ %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, \
+ %%IA1, %%CTR_BLOCKx, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%IA0, %%IA2, %%MASKREG, %%SHUFREG, %%NROUNDS, \
+ %%CNTR_TYPE, %%RBITS
+
+ jmp %%_enc_dec_done
+
+%%_large_message_path:
+ ;; Still, don't allow 16 INITIAL blocks since that case
+ ;; can be handled by the x16 partial loop.
+ and %%IA1, 0xf
+ je %%_initial_num_blocks_is_0
+ cmp %%IA1, 15
+ je %%_initial_num_blocks_is_15
+ cmp %%IA1, 14
+ je %%_initial_num_blocks_is_14
+ cmp %%IA1, 13
+ je %%_initial_num_blocks_is_13
+ cmp %%IA1, 12
+ je %%_initial_num_blocks_is_12
+ cmp %%IA1, 11
+ je %%_initial_num_blocks_is_11
+ cmp %%IA1, 10
+ je %%_initial_num_blocks_is_10
+ cmp %%IA1, 9
+ je %%_initial_num_blocks_is_9
+ cmp %%IA1, 8
+ je %%_initial_num_blocks_is_8
+ cmp %%IA1, 7
+ je %%_initial_num_blocks_is_7
+ cmp %%IA1, 6
+ je %%_initial_num_blocks_is_6
+ cmp %%IA1, 5
+ je %%_initial_num_blocks_is_5
+ cmp %%IA1, 4
+ je %%_initial_num_blocks_is_4
+ cmp %%IA1, 3
+ je %%_initial_num_blocks_is_3
+ cmp %%IA1, 2
+ je %%_initial_num_blocks_is_2
+ jmp %%_initial_num_blocks_is_1
+ ;; NOTE(review): everything from here up to %%_initial_num_blocks_is_15 is unreachable - it follows an unconditional jmp and its %%_initial_blocks_is_* labels are only targeted from inside this span
+ and %%IA1, 0xf
+ je %%_initial_num_blocks_is_0
+
+ cmp %%IA1, 8
+ je %%_initial_num_blocks_is_8
+ jl %%_initial_blocks_is_1_7
+
+ ; Initial blocks 9-15
+ cmp %%IA1, 12
+ je %%_initial_num_blocks_is_12
+ jl %%_initial_blocks_is_9_11
+
+ ; Initial blocks 13-15
+ cmp %%IA1, 15
+ je %%_initial_num_blocks_is_15
+ cmp %%IA1, 14
+ je %%_initial_num_blocks_is_14
+ cmp %%IA1, 13
+ je %%_initial_num_blocks_is_13
+
+%%_initial_blocks_is_9_11:
+ cmp %%IA1, 11
+ je %%_initial_num_blocks_is_11
+ cmp %%IA1, 10
+ je %%_initial_num_blocks_is_10
+ cmp %%IA1, 9
+ je %%_initial_num_blocks_is_9
+
+%%_initial_blocks_is_1_7:
+ cmp %%IA1, 4
+ je %%_initial_num_blocks_is_4
+ jl %%_initial_blocks_is_1_3
+
+ ; Initial blocks 5-7
+ cmp %%IA1, 7
+ je %%_initial_num_blocks_is_7
+ cmp %%IA1, 6
+ je %%_initial_num_blocks_is_6
+ cmp %%IA1, 5
+ je %%_initial_num_blocks_is_5
+
+%%_initial_blocks_is_1_3:
+ cmp %%IA1, 3
+ je %%_initial_num_blocks_is_3
+ cmp %%IA1, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_15:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 15, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_14:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 14, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_13:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 13, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_12:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 12, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_11:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 11, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_10:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 10, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_9:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 9, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_8:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 8, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 7, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 6, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 5, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 4, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 3, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 2, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 1, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, 0, %%CTR_BLOCKx, \
+ %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \
+ %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \
+ %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS
+
+%%_initial_blocks_encrypted:
+ or %%LENGTH, %%LENGTH
+ je %%_enc_dec_done
+
+ vmovdqa64 %%ADD8REG, [rel ddq_add_16]
+ ;; Process 15 full blocks plus a partial block (fewer than 256 bytes left)
+ cmp %%LENGTH, 256
+ jl %%_encrypt_by_16_partial
+
+%%_encrypt_by_16:
+ ENCRYPT_16_PARALLEL %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, \
+ %%CTR_BLOCK_9_12, %%CTR_BLOCK_13_16, \
+ full, %%IA0, %%IA1, %%LENGTH, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%MASKREG, %%SHUFREG, %%ADD8REG, %%NROUNDS, %%CNTR_TYPE, \
+ %%RBITS
+ add %%DATA_OFFSET, 256
+ sub %%LENGTH, 256
+ cmp %%LENGTH, 256
+ jge %%_encrypt_by_16
+
+%%_encrypt_by_16_done:
+ ;; Test to see if we need a by 16 with partial block. At this point
+ ;; bytes remaining should be either zero or between 241-255.
+ or %%LENGTH, %%LENGTH
+ je %%_enc_dec_done
+
+%%_encrypt_by_16_partial:
+
+ ENCRYPT_16_PARALLEL %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, \
+ %%CTR_BLOCK_9_12, %%CTR_BLOCK_13_16, \
+ partial, %%IA0, %%IA1, %%LENGTH, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%MASKREG, %%SHUFREG, %%ADD8REG, %%NROUNDS, %%CNTR_TYPE, \
+ %%RBITS
+
+%%_enc_dec_done:
+
+%endmacro ; CNTR_ENC_DEC
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_128_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_128_submit_vaes_avx512,function,internal)
+aes_cntr_128_submit_vaes_avx512:
+ FUNC_SAVE CNTR
+ ;; CNTR_ENC_DEC macro parameters (the function itself only takes the job pointer):
+ ;; arg1 - [in] job; 9 - NROUNDS (NROUNDS + 2 = 11 round keys are used)
+ ;; CNTR - Type of CNTR operation to do (CNTR/CNTR_BIT)
+ CNTR_ENC_DEC arg1, 9, CNTR
+ FUNC_RESTORE CNTR
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_192_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_192_submit_vaes_avx512,function,internal)
+aes_cntr_192_submit_vaes_avx512:
+ FUNC_SAVE CNTR
+ ;; CNTR_ENC_DEC macro parameters (the function itself only takes the job pointer):
+ ;; arg1 - [in] job; 11 - NROUNDS (NROUNDS + 2 = 13 round keys are used)
+ ;; CNTR - Type of CNTR operation to do (CNTR/CNTR_BIT)
+ CNTR_ENC_DEC arg1, 11, CNTR
+ FUNC_RESTORE CNTR
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_256_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_256_submit_vaes_avx512,function,internal)
+aes_cntr_256_submit_vaes_avx512:
+ FUNC_SAVE CNTR
+ ;; CNTR_ENC_DEC macro parameters (the function itself only takes the job pointer):
+ ;; arg1 - [in] job; 13 - NROUNDS (NROUNDS + 2 = 15 round keys are used)
+ ;; CNTR - Type of CNTR operation to do (CNTR/CNTR_BIT)
+ CNTR_ENC_DEC arg1, 13, CNTR
+ FUNC_RESTORE CNTR
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_bit_128_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_bit_128_submit_vaes_avx512,function,internal)
+aes_cntr_bit_128_submit_vaes_avx512:
+ FUNC_SAVE CNTR_BIT
+ ;; CNTR_ENC_DEC macro parameters (the function itself only takes the job pointer):
+ ;; arg1 - [in] job; 9 - NROUNDS (NROUNDS + 2 = 11 round keys are used)
+ ;; CNTR_BIT - Type of CNTR operation to do (CNTR/CNTR_BIT); message length is taken in bits
+ CNTR_ENC_DEC arg1, 9, CNTR_BIT
+ FUNC_RESTORE CNTR_BIT
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_bit_192_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_bit_192_submit_vaes_avx512,function,internal)
+aes_cntr_bit_192_submit_vaes_avx512:
+ FUNC_SAVE CNTR_BIT
+ ;; CNTR_ENC_DEC macro parameters (the function itself only takes the job pointer):
+ ;; arg1 - [in] job; 11 - NROUNDS (NROUNDS + 2 = 13 round keys are used)
+ ;; CNTR_BIT - Type of CNTR operation to do (CNTR/CNTR_BIT); message length is taken in bits
+ CNTR_ENC_DEC arg1, 11, CNTR_BIT
+ FUNC_RESTORE CNTR_BIT
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_cntr_bit_256_submit_vaes_avx512 (JOB_AES_HMAC *job)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(aes_cntr_bit_256_submit_vaes_avx512,function,internal)
+aes_cntr_bit_256_submit_vaes_avx512:
+ FUNC_SAVE CNTR_BIT
+ ;; CNTR_ENC_DEC macro parameters (the function itself only takes the job pointer):
+ ;; arg1 - [in] job; 13 - NROUNDS (NROUNDS + 2 = 15 round keys are used)
+ ;; CNTR_BIT - Type of CNTR operation to do (CNTR/CNTR_BIT); message length is taken in bits
+ CNTR_ENC_DEC arg1, 13, CNTR_BIT
+ FUNC_RESTORE CNTR_BIT
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/des_x16_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/des_x16_avx512.asm
new file mode 100644
index 000000000..656752941
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/des_x16_avx512.asm
@@ -0,0 +1,2382 @@
+;;
+;; Copyright (c) 2017-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Authors:
+;; Shay Gueron (1, 2), Regev Shemy (2), Tomasz kantecki (2)
+;; (1) University of Haifa, Israel
+;; (2) Intel Corporation
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX R8 R9 R10 R11
+;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX R10 R11
+;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31 and K1 to K7
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "mb_mgr_datastruct.asm"
+%include "constants.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%ifdef LINUX
+%define arg1 rdi ; System V AMD64: integer argument registers 1-4
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%else
+%define arg1 rcx ; Windows x64: integer argument registers 1-4
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%endif
+
+%define STATE arg1 ; base register for the _des_args_* accesses in DES_FINISH
+%define SIZE arg2 ; value DES_FINISH adds to every in/out pointer
+
+%define OFFSET rax
+
+%define IA0 arg3 ; scratch GPR; overwritten by S_PHASE, DES_INIT and DES3_INIT
+%define IA1 arg4
+%define IA2 r10
+
+%define INP0 r11 ; NOTE: same physical register as KSOFFSET below
+%define INP1 r12
+%define INP2 r13
+%define INP3 r14
+%define INP4 r15
+
+%define KSOFFSET r11 ; byte offset into the key schedule in DES_ENC_DEC; aliases INP0
+
+%define ZW0 zmm0
+%define ZW1 zmm1
+%define ZW2 zmm2
+%define ZW3 zmm3
+%define ZW4 zmm4
+%define ZW5 zmm5
+%define ZW6 zmm6
+%define ZW7 zmm7
+%define ZW8 zmm8
+%define ZW9 zmm9
+%define ZW10 zmm10
+%define ZW11 zmm11
+%define ZW12 zmm12
+%define ZW13 zmm13
+%define ZW14 zmm14
+%define ZW15 zmm15
+
+%define ZIV0 zmm16
+%define ZIV1 zmm17
+
+%define ZTMP0 zmm18
+%define ZTMP1 zmm19
+%define ZTMP2 zmm20
+%define ZTMP3 zmm21
+%define ZTMP4 zmm22
+%define ZTMP5 zmm23
+%define ZTMP6 zmm24
+%define ZTMP7 zmm25
+%define ZTMP8 zmm26
+%define ZTMP9 zmm27
+%define ZTMP10 zmm28
+%define ZTMP11 zmm29
+%define ZTMP12 zmm30
+%define ZTMP13 zmm31
+
+struc STACKFRAME
+_key_sched: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048; zeroed by CLEAR_KEY_SCHEDULE
+_key_sched2: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048; also zeroed when ALG is 3DES
+_key_sched3: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048; also zeroed when ALG is 3DES
+_tmp_iv: resq 16 ; 2 x 64 bytes
+_tmp_in: resq 16 ; 2 x 64 bytes
+_tmp_out: resq 16 ; 2 x 64 bytes
+_tmp_mask: resd 16 ; 1 x 64 bytes
+_gpr_save: resq 4 ; r12 to r15
+_rsp_save: resq 1 ; presumably the original rsp - users are outside this chunk, TODO confirm
+_mask_save: resq 1
+_size_save: resq 1
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; ===========================================================================
+;;; CLEAR TRANSPOSED KEY SCHEDULE ON THE STACK (emits no code unless SAFE_DATA is defined)
+;;; ===========================================================================
+%macro CLEAR_KEY_SCHEDULE 2
+%define %%ALG %1 ; [in] DES or 3DES (3DES zeroes 3 x 2048 bytes, anything else 2048 bytes)
+%define %%ZT %2 ; [clobbered] temporary ZMM register
+
+%ifdef SAFE_DATA
+ vpxorq %%ZT, %%ZT ; ZT = 0
+%assign rep_num (2048 / 64)
+%ifidn %%ALG, 3DES
+%assign rep_num (rep_num * 3)
+%endif
+
+%assign offset 0
+%rep rep_num
+ vmovdqa64 [rsp + _key_sched + offset], %%ZT ; aligned 64-byte store: rsp + _key_sched must be 64-byte aligned
+%assign offset (offset + 64)
+%endrep
+
+%endif ; SAFE_DATA
+
+%endmacro
+
+;;; ===========================================================================
+;;; PERMUTE - exchange the MASK-selected bits of B with the same bits of (A >> NSHIFT)
+;;; ===========================================================================
+;;; A [in/out] - zmm register
+;;; B [in/out] - zmm register
+;;; NSHIFT [in] - constant to shift each 32-bit word by
+;;; MASK [in] - zmm or m512 with mask
+;;; T0 [clobbered] - temporary zmm register
+%macro PERMUTE 5
+%define %%A %1
+%define %%B %2
+%define %%NSHIFT %3
+%define %%MASK %4
+%define %%T0 %5
+
+ vpsrld %%T0, %%A, %%NSHIFT ; T0 = A >> NSHIFT
+ vpxord %%T0, %%T0, %%B ; T0 = (A >> NSHIFT) ^ B
+ vpandd %%T0, %%T0, %%MASK ; keep only the differing bits selected by MASK
+ vpxord %%B, %%B, %%T0 ; B takes the selected bits of (A >> NSHIFT)
+ vpslld %%T0, %%T0, %%NSHIFT ; move the difference back to A's bit positions
+ vpxord %%A, %%A, %%T0 ; A takes the corresponding bits of the old B
+%endmacro
+
+;;; ===========================================================================
+;;; INITIAL PERMUTATION - five PERMUTE steps driven by the init_perm_consts table
+;;; ===========================================================================
+;;; L [in/out] - zmm register
+;;; R [in/out] - zmm register
+;;; T0 [clobbered] - temporary zmm register (scratch for every PERMUTE step)
+%macro IP_Z 3
+%define %%L %1
+%define %%R %2
+%define %%T0 %3
+ PERMUTE %%R, %%L, 4, [rel init_perm_consts + 0*64], %%T0
+ PERMUTE %%L, %%R, 16, [rel init_perm_consts + 1*64], %%T0
+ PERMUTE %%R, %%L, 2, [rel init_perm_consts + 2*64], %%T0
+ PERMUTE %%L, %%R, 8, [rel init_perm_consts + 3*64], %%T0
+ PERMUTE %%R, %%L, 1, [rel init_perm_consts + 4*64], %%T0
+%endmacro
+
+;;; ===========================================================================
+;;; FINAL PERMUTATION - the IP_Z steps in reverse order with the L/R operands swapped
+;;; ===========================================================================
+;;; L [in/out] - zmm register
+;;; R [in/out] - zmm register
+;;; T0 [clobbered] - temporary zmm register (scratch for every PERMUTE step)
+%macro FP_Z 3
+%define %%L %1
+%define %%R %2
+%define %%T0 %3
+ PERMUTE %%L, %%R, 1, [rel init_perm_consts + 4*64], %%T0
+ PERMUTE %%R, %%L, 8, [rel init_perm_consts + 3*64], %%T0
+ PERMUTE %%L, %%R, 2, [rel init_perm_consts + 2*64], %%T0
+ PERMUTE %%R, %%L, 16, [rel init_perm_consts + 1*64], %%T0
+ PERMUTE %%L, %%R, 4, [rel init_perm_consts + 0*64], %%T0
+%endmacro
+
+;;; ===========================================================================
+;;; P PHASE - OR together 19 rotated and masked copies of W0
+;;; ===========================================================================
+;;; W0 [in/out] - zmm register
+;;; in: vector of 16 x 32bits from S phase
+;;; out: permuted in vector
+;;; T0-T3 [clobbered] - temporary zmm registers
+%macro P_PHASE 5
+%define %%W0 %1
+%define %%T0 %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+
+ vprord %%T0, %%W0, 3 ; each term: rotate W0 right, then keep the bits of one mask_values entry
+ vpandd %%T0, %%T0, [rel mask_values + 0*64]
+ vprord %%T1, %%W0, 5
+ vpandd %%T1, %%T1, [rel mask_values + 1*64]
+ vpord %%T0, %%T0, %%T1 ; T0 accumulates the final result
+
+ vprord %%T1, %%W0, 24
+ vpandd %%T1, %%T1, [rel mask_values + 2*64]
+ vprord %%T2, %%W0, 26
+ vpandd %%T2, %%T2, [rel mask_values + 3*64]
+ vpord %%T1, %%T1, %%T2
+ vpord %%T0, %%T0, %%T1
+
+ vprord %%T1, %%W0, 15
+ vpandd %%T1, %%T1, [rel mask_values + 4*64]
+ vprord %%T2, %%W0, 17
+ vpandd %%T2, %%T2, [rel mask_values + 5*64]
+ vpord %%T1, %%T1, %%T2
+
+ vprord %%T2, %%W0, 6
+ vpandd %%T2, %%T2, [rel mask_values + 6*64]
+ vprord %%T3, %%W0, 21
+ vpandd %%T3, %%T3, [rel mask_values + 7*64]
+ vpord %%T2, %%T2, %%T3
+ vpord %%T1, %%T1, %%T2
+ vpord %%T0, %%T0, %%T1
+
+ vprord %%T1, %%W0, 12
+ vpandd %%T1, %%T1, [rel mask_values + 8*64]
+ vprord %%T2, %%W0, 14
+ vpandd %%T2, %%T2, [rel mask_values + 9*64]
+ vpord %%T1, %%T1, %%T2
+
+ vprord %%T2, %%W0, 4
+ vpandd %%T2, %%T2, [rel mask_values + 10*64]
+ vprord %%T3, %%W0, 11
+ vpandd %%T3, %%T3, [rel mask_values + 11*64]
+ vpord %%T2, %%T2, %%T3
+ vpord %%T1, %%T1, %%T2
+ vpord %%T0, %%T0, %%T1
+
+ vprord %%T1, %%W0, 16
+ vpandd %%T1, %%T1, [rel mask_values + 12*64]
+ vprord %%T2, %%W0, 22
+ vpandd %%T2, %%T2, [rel mask_values + 13*64]
+ vpord %%T1, %%T1, %%T2
+
+ vprord %%T2, %%W0, 19
+ vpandd %%T2, %%T2, [rel mask_values + 14*64]
+ vprord %%T3, %%W0, 10
+ vpandd %%T3, %%T3, [rel mask_values + 15*64]
+ vpord %%T2, %%T2, %%T3
+ vpord %%T1, %%T1, %%T2
+ vpord %%T0, %%T0, %%T1
+
+ vprord %%T1, %%W0, 9
+ vpandd %%T1, %%T1, [rel mask_values + 16*64]
+ vprord %%T2, %%W0, 13
+ vpandd %%T2, %%T2, [rel mask_values + 17*64]
+ vpord %%T1, %%T1, %%T2
+
+ vprord %%T2, %%W0, 25
+ vpandd %%T2, %%T2, [rel mask_values + 18*64]
+ vpord %%T1, %%T1, %%T2
+ vpord %%W0, %%T0, %%T1 ; W0 = OR of all 19 terms; the input is only overwritten here
+%endmacro
+
+;;; ===========================================================================
+;;; E PHASE
+;;; ===========================================================================
+;;;
+;;; Expands 16x32-bit words into 16x48-bit words
+;;; plus XOR's result with the key schedule.
+;;; The output is adjusted to be friendly as S phase input.
+;;;
+;;; in [in] - zmm register (not modified)
+;;; out0a [out] - zmm register
+;;; out0b [out] - zmm register
+;;; out1a [out] - zmm register
+;;; out1b [out] - zmm register
+;;; k0 [in] - key schedule; zmm or m512 (XOR-ed into the out0 half)
+;;; k1 [in] - key schedule; zmm or m512 (XOR-ed into the out1 half)
+;;; t0-t1 [clobbered] - temporary zmm registers
+%macro E_PHASE 9
+%define %%IN %1
+%define %%OUT0A %2
+%define %%OUT0B %3
+%define %%OUT1A %4
+%define %%OUT1B %5
+%define %%K0 %6
+%define %%K1 %7
+%define %%T0 %8
+%define %%T1 %9
+
+ vprord %%T0, %%IN, 31 ; rotate right by 31 == rotate left by 1 (32-bit elements)
+ vprord %%T1, %%IN, 3
+ vpshufb %%T0, %%T0, [rel idx_e] ; same byte shuffle applied to both rotated copies
+ vpshufb %%T1, %%T1, [rel idx_e]
+ vpunpcklbw %%OUT0A, %%T0, %%T1 ; interleave bytes of T0/T1: low halves of each 128-bit lane
+ vpunpckhbw %%OUT1A, %%T0, %%T1 ; ... and high halves
+ vpxord %%OUT0A, %%OUT0A, %%K0 ; mix in the key schedule
+ vpxord %%OUT1A, %%OUT1A, %%K1
+ vpandd %%OUT0B, %%OUT0A, [rel and_eu]
+ vpsrlw %%OUT0B, %%OUT0B, 8 ; OUT0B = (OUT0A & and_eu) >> 8 per 16-bit word
+ vpandd %%OUT0A, %%OUT0A, [rel and_ed] ; OUT0A = OUT0A & and_ed
+ vpandd %%OUT1B, %%OUT1A, [rel and_eu]
+ vpsrlw %%OUT1B, %%OUT1B, 8 ; OUT1B = (OUT1A & and_eu) >> 8 per 16-bit word
+ vpandd %%OUT1A, %%OUT1A, [rel and_ed] ; OUT1A = OUT1A & and_ed
+%endmacro
+
+;;; ===========================================================================
+;;; S-BOX
+;;; ===========================================================================
+;;;
+;;; NOTE: clobbers k1-k6 OpMask registers and the IA0 general purpose register
+;;;
+;;; IN0A [in] - zmm register; output from E-phase
+;;; IN0B [in] - zmm register; output from E-phase
+;;; IN1A [in] - zmm register; output from E-phase
+;;; IN1B [in] - zmm register; output from E-phase
+;;; OUT [out] - zmm register; result of the S-phase
+;;; T0-T5 [clobbered] - temporary zmm registers
+%macro S_PHASE 11
+%define %%IN0A %1
+%define %%IN0B %2
+%define %%IN1A %3
+%define %%IN1B %4
+%define %%OUT %5
+%define %%T0 %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+
+ vmovdqa64 %%T0, [rel reg_values16bit_7] ; threshold compared against every 16-bit input word
+ vpcmpuw k3, %%IN0A, %%T0, 2 ; 2 -> LE
+ vpcmpuw k4, %%IN0B, %%T0, 2 ; 2 -> LE
+ vpcmpuw k5, %%IN1A, %%T0, 2 ; 2 -> LE
+ vpcmpuw k6, %%IN1B, %%T0, 2 ; 2 -> LE
+
+ mov DWORD(IA0), 0x55555555 ; IA0 is used as scratch here
+ kmovd k1, DWORD(IA0) ; k1 selects the even-indexed 16-bit words
+ mov DWORD(IA0), 0xaaaaaaaa
+ kmovd k2, DWORD(IA0) ; k2 selects the odd-indexed 16-bit words
+
+ vpermw %%T0{k1}{z}, %%IN0A, [rel S_box_flipped + 0*64]
+ vpermw %%T1{k1}{z}, %%IN0A, [rel S_box_flipped + 1*64]
+ vpermw %%T2{k2}{z}, %%IN0A, [rel S_box_flipped + 4*64]
+ vpermw %%T3{k2}{z}, %%IN0A, [rel S_box_flipped + 5*64]
+ vpxord %%T0, %%T0, %%T2 ; merge even/odd word lookups (masks are disjoint)
+ vpxord %%OUT, %%T1, %%T3
+ vmovdqu16 %%OUT{k3}, %%T0 ; words with IN0A <= threshold take T0, the others keep T1^T3
+
+ vpermw %%T0{k1}{z}, %%IN0B, [rel S_box_flipped + 2*64]
+ vpermw %%T1{k1}{z}, %%IN0B, [rel S_box_flipped + 3*64]
+ vpermw %%T2{k2}{z}, %%IN0B, [rel S_box_flipped + 6*64]
+ vpermw %%T3{k2}{z}, %%IN0B, [rel S_box_flipped + 7*64]
+ vpxord %%T0, %%T0, %%T2
+ vpxord %%T3, %%T1, %%T3
+ vmovdqu16 %%T3{k4}, %%T0 ; same selection for IN0B
+ vpsllw %%T3, %%T3, 4 ; IN0B result goes 4 bits above the IN0A result
+ vpxord %%OUT, %%OUT, %%T3
+
+ vpermw %%T0{k1}{z}, %%IN1A, [rel S_box_flipped + 8*64]
+ vpermw %%T1{k1}{z}, %%IN1A, [rel S_box_flipped + 9*64]
+ vpermw %%T2{k2}{z}, %%IN1A, [rel S_box_flipped + 12*64]
+ vpermw %%T3{k2}{z}, %%IN1A, [rel S_box_flipped + 13*64]
+ vpxord %%T0, %%T0, %%T2
+ vpxord %%T4, %%T1, %%T3
+ vmovdqu16 %%T4{k5}, %%T0 ; same selection for IN1A
+
+ vpermw %%T0{k1}{z}, %%IN1B, [rel S_box_flipped + 10*64]
+ vpermw %%T1{k1}{z}, %%IN1B, [rel S_box_flipped + 11*64]
+ vpermw %%T2{k2}{z}, %%IN1B, [rel S_box_flipped + 14*64]
+ vpermw %%T3{k2}{z}, %%IN1B, [rel S_box_flipped + 15*64]
+ vpxord %%T0, %%T0, %%T2
+ vpxord %%T5, %%T1, %%T3
+ vmovdqu16 %%T5{k6}, %%T0 ; same selection for IN1B
+ vpsllw %%T5, %%T5, 4
+
+ vpxord %%T4, %%T4, %%T5
+ vpsllw %%T4, %%T4, 8 ; IN1A/IN1B results go into the upper byte of each word
+ vpxord %%OUT, %%OUT, %%T4
+ vpshufb %%OUT, %%OUT, [rel shuffle_reg] ; final byte reordering of the combined result
+%endmacro
+
+;;; ===========================================================================
+;;; DES encryption/decryption: initial permutation, 16 rounds, final permutation
+;;; ===========================================================================
+;;;
+;;; Clobbers k1-k6 OpMask registers, KSOFFSET and (through S_PHASE) IA0
+;;;
+;;; ENC_DEC [in] - ENC for encryption, anything else selects decryption
+;;; R [in/out] - zmm register; plain text in & cipher text out
+;;; L [in/out] - zmm register; plain text in & cipher text out
+;;; KS [in] - pointer to the key schedule
+;;; T0-T11 [clobbered] - temporary zmm registers (T5 is accepted but not referenced)
+%macro DES_ENC_DEC 16
+%define %%ENC_DEC %1
+%define %%R %2
+%define %%L %3
+%define %%KS %4
+%define %%T0 %5
+%define %%T1 %6
+%define %%T2 %7
+%define %%T3 %8
+%define %%T4 %9
+%define %%T5 %10
+%define %%T6 %11
+%define %%T7 %12
+%define %%T8 %13
+%define %%T9 %14
+%define %%T10 %15
+%define %%T11 %16
+
+ IP_Z %%R, %%L, %%T0
+
+%ifidn %%ENC_DEC, ENC
+ ;; ENCRYPTION - walk the key schedule forwards, two rounds per iteration
+ xor KSOFFSET, KSOFFSET ; start at the first round key
+%%_des_enc_loop:
+ E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (0*64)], [%%KS + KSOFFSET + (1*64)], %%T6, %%T7
+ S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
+ P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
+ vpxord %%L, %%L, %%T0 ; L ^= f(R, round key)
+
+ E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (2*64)], [%%KS + KSOFFSET + (3*64)], %%T6, %%T7
+ S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
+ P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
+ vpxord %%R, %%R, %%T0 ; R ^= f(L, next round key)
+
+ add KSOFFSET, (4*64) ; two rounds consume 4 x 64 bytes of key schedule
+ cmp KSOFFSET, (8*(4*64)) ; 8 iterations x 2 rounds = 16 rounds
+ jb %%_des_enc_loop
+
+%else
+ ;; DECRYPTION - same rounds with the key schedule walked backwards
+ mov KSOFFSET, (8*(4*64)) ; start just past the last round key
+%%_des_dec_loop:
+ E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (2*64)], [%%KS + KSOFFSET - (1*64)], %%T6, %%T7
+ S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
+ P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
+ vpxord %%L, %%L, %%T0 ; L ^= f(R, round key)
+
+ E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (4*64)], [%%KS + KSOFFSET - (3*64)], %%T6, %%T7
+ S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
+ P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4
+ vpxord %%R, %%R, %%T0 ; R ^= f(L, previous round key)
+ sub KSOFFSET, (4*64) ; step back over the two round keys just used
+ jnz %%_des_dec_loop ; flags come from the sub above; stop when the offset reaches 0
+%endif ; DECRYPTION
+
+ FP_Z %%R, %%L, %%T0
+%endmacro
+
+;;; ===========================================================================
+;;; DATA TRANSPOSITION AT DATA INPUT
+;;; ===========================================================================
+;;;
+;;; IN00 - IN15 [in/out]:
+;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
+;;; out: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15
+;;; T0-T3 [clobbered] - temporary zmm registers
+;;; K0-K5 [clobbered] - temporary zmm registers (K is a macro parameter name here, not an opmask)
+;;; H0-H3 [clobbered] - temporary zmm registers
+%macro TRANSPOSE_IN 30
+%define %%IN00 %1 ; R0
+%define %%IN01 %2 ; L0
+%define %%IN02 %3 ; R1
+%define %%IN03 %4 ; L1
+%define %%IN04 %5 ; R2
+%define %%IN05 %6 ; L2
+%define %%IN06 %7 ; R3
+%define %%IN07 %8 ; L3
+%define %%IN08 %9 ; R4
+%define %%IN09 %10 ; L4
+%define %%IN10 %11 ; R5
+%define %%IN11 %12 ; L5
+%define %%IN12 %13 ; R6
+%define %%IN13 %14 ; L6
+%define %%IN14 %15 ; R7
+%define %%IN15 %16 ; L7
+%define %%T0 %17
+%define %%T1 %18
+%define %%T2 %19
+%define %%T3 %20
+%define %%K0 %21
+%define %%K1 %22
+%define %%K2 %23
+%define %%K3 %24
+%define %%K4 %25
+%define %%K5 %26
+%define %%H0 %27
+%define %%H1 %28
+%define %%H2 %29
+%define %%H3 %30
+
+ vpunpckldq %%K0, %%IN00, %%IN01 ; step 1: interleave 32-bit words of input pairs (inputs 0-7)
+ vpunpckhdq %%K1, %%IN00, %%IN01
+ vpunpckldq %%T0, %%IN02, %%IN03
+ vpunpckhdq %%T1, %%IN02, %%IN03
+
+ vpunpckldq %%IN00, %%IN04, %%IN05 ; IN00-IN03 were consumed above and are reused as temporaries
+ vpunpckhdq %%IN01, %%IN04, %%IN05
+ vpunpckldq %%IN02, %%IN06, %%IN07
+ vpunpckhdq %%IN03, %%IN06, %%IN07
+
+ vpunpcklqdq %%K2, %%K0, %%T0 ; step 2: interleave 64-bit words of the step 1 results
+ vpunpckhqdq %%T2, %%K0, %%T0
+ vpunpcklqdq %%K3, %%K1, %%T1
+ vpunpckhqdq %%T3, %%K1, %%T1
+
+ vpunpcklqdq %%K0, %%IN00, %%IN02
+ vpunpckhqdq %%K1, %%IN00, %%IN02
+ vpunpcklqdq %%T0, %%IN01, %%IN03
+ vpunpckhqdq %%T1, %%IN01, %%IN03
+
+ vpunpckldq %%K4, %%IN08, %%IN09 ; steps 1 and 2 repeated for inputs 8-15
+ vpunpckhdq %%K5, %%IN08, %%IN09
+ vpunpckldq %%IN04, %%IN10, %%IN11
+ vpunpckhdq %%IN05, %%IN10, %%IN11
+ vpunpckldq %%IN06, %%IN12, %%IN13
+ vpunpckhdq %%IN07, %%IN12, %%IN13
+ vpunpckldq %%IN10, %%IN14, %%IN15
+ vpunpckhdq %%IN11, %%IN14, %%IN15
+
+ vpunpcklqdq %%IN12, %%K4, %%IN04
+ vpunpckhqdq %%IN13, %%K4, %%IN04
+ vpunpcklqdq %%IN14, %%K5, %%IN05
+ vpunpckhqdq %%IN15, %%K5, %%IN05
+ vpunpcklqdq %%IN00, %%IN06, %%IN10
+ vpunpckhqdq %%IN01, %%IN06, %%IN10
+ vpunpcklqdq %%IN02, %%IN07, %%IN11
+ vpunpckhqdq %%IN03, %%IN07, %%IN11
+
+ vshufi64x2 %%H0, %%K2, %%K0, 0x44 ; step 3: regroup 128-bit chunks across the partial results
+ vshufi64x2 %%H1, %%K2, %%K0, 0xee
+ vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
+ vshufi64x2 %%H3, %%IN12, %%IN00, 0xee
+ vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
+ vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2
+ vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4
+ vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6
+
+ vshufi64x2 %%H0, %%T2, %%K1, 0x44
+ vshufi64x2 %%H1, %%T2, %%K1, 0xee
+ vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
+ vshufi64x2 %%H3, %%IN13, %%IN01, 0xee
+ vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
+ vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2
+ vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4
+ vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6
+
+ vshufi64x2 %%H0, %%K3, %%T0, 0x44
+ vshufi64x2 %%H1, %%K3, %%T0, 0xee
+ vshufi64x2 %%H2, %%IN14, %%IN02, 0x44
+ vshufi64x2 %%H3, %%IN14, %%IN02, 0xee
+ vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1
+ vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3
+ vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5
+ vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7
+
+ vshufi64x2 %%H0, %%T3, %%T1, 0x44
+ vshufi64x2 %%H1, %%T3, %%T1, 0xee
+ vshufi64x2 %%H2, %%IN15, %%IN03, 0x44
+ vshufi64x2 %%H3, %%IN15, %%IN03, 0xee
+ vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1
+ vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3
+ vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5
+ vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7
+%endmacro
+
+;;; ===========================================================================
+;;; DATA TRANSPOSITION AT DATA OUTPUT
+;;; ===========================================================================
+;;;
+;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
+;;; in: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15
+;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
+;;; T0-T3 [clobbered] - temporary zmm registers
+;;; K0-K5 [clobbered] - temporary zmm registers (K is a macro parameter name here, not an opmask)
+;;; H0-H3 [clobbered] - temporary zmm registers
+%macro TRANSPOSE_OUT 30
+%define %%IN00 %1 ; R0
+%define %%IN01 %2 ; L0
+%define %%IN02 %3 ; R1
+%define %%IN03 %4 ; L1
+%define %%IN04 %5 ; R2
+%define %%IN05 %6 ; L2
+%define %%IN06 %7 ; R3
+%define %%IN07 %8 ; L3
+%define %%IN08 %9 ; R4
+%define %%IN09 %10 ; L4
+%define %%IN10 %11 ; R5
+%define %%IN11 %12 ; L5
+%define %%IN12 %13 ; R6
+%define %%IN13 %14 ; L6
+%define %%IN14 %15 ; R7
+%define %%IN15 %16 ; L7
+%define %%T0 %17
+%define %%T1 %18
+%define %%T2 %19
+%define %%T3 %20
+%define %%K0 %21
+%define %%K1 %22
+%define %%K2 %23
+%define %%K3 %24
+%define %%K4 %25
+%define %%K5 %26
+%define %%H0 %27
+%define %%H1 %28
+%define %%H2 %29
+%define %%H3 %30
+
+ vpunpckldq %%K0, %%IN01, %%IN00 ; step 1: as in TRANSPOSE_IN but with each source pair swapped (L before R)
+ vpunpckhdq %%K1, %%IN01, %%IN00
+ vpunpckldq %%T0, %%IN03, %%IN02
+ vpunpckhdq %%T1, %%IN03, %%IN02
+
+ vpunpckldq %%IN00, %%IN05, %%IN04 ; IN00-IN03 were consumed above and are reused as temporaries
+ vpunpckhdq %%IN01, %%IN05, %%IN04
+ vpunpckldq %%IN02, %%IN07, %%IN06
+ vpunpckhdq %%IN03, %%IN07, %%IN06
+
+ vpunpcklqdq %%K2, %%K0, %%T0 ; step 2: interleave 64-bit words of the step 1 results
+ vpunpckhqdq %%T2, %%K0, %%T0
+ vpunpcklqdq %%K3, %%K1, %%T1
+ vpunpckhqdq %%T3, %%K1, %%T1
+
+ vpunpcklqdq %%K0, %%IN00, %%IN02
+ vpunpckhqdq %%K1, %%IN00, %%IN02
+ vpunpcklqdq %%T0, %%IN01, %%IN03
+ vpunpckhqdq %%T1, %%IN01, %%IN03
+
+ vpunpckldq %%K4, %%IN09, %%IN08 ; steps 1 and 2 repeated for R4/L4 - R7/L7
+ vpunpckhdq %%K5, %%IN09, %%IN08
+ vpunpckldq %%IN04, %%IN11, %%IN10
+ vpunpckhdq %%IN05, %%IN11, %%IN10
+ vpunpckldq %%IN06, %%IN13, %%IN12
+ vpunpckhdq %%IN07, %%IN13, %%IN12
+ vpunpckldq %%IN10, %%IN15, %%IN14
+ vpunpckhdq %%IN11, %%IN15, %%IN14
+
+ vpunpcklqdq %%IN12, %%K4, %%IN04
+ vpunpckhqdq %%IN13, %%K4, %%IN04
+ vpunpcklqdq %%IN14, %%K5, %%IN05
+ vpunpckhqdq %%IN15, %%K5, %%IN05
+ vpunpcklqdq %%IN00, %%IN06, %%IN10
+ vpunpckhqdq %%IN01, %%IN06, %%IN10
+ vpunpcklqdq %%IN02, %%IN07, %%IN11
+ vpunpckhqdq %%IN03, %%IN07, %%IN11
+
+ vshufi64x2 %%H0, %%K2, %%K0, 0x44 ; step 3: regroup 128-bit chunks across the partial results
+ vshufi64x2 %%H1, %%K2, %%K0, 0xee
+ vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
+ vshufi64x2 %%H3, %%IN12, %%IN00, 0xee
+ vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
+ vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2
+ vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4
+ vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6
+
+ vshufi64x2 %%H0, %%T2, %%K1, 0x44
+ vshufi64x2 %%H1, %%T2, %%K1, 0xee
+ vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
+ vshufi64x2 %%H3, %%IN13, %%IN01, 0xee
+ vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
+ vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2
+ vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4
+ vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6
+
+ vshufi64x2 %%H0, %%K3, %%T0, 0x44
+ vshufi64x2 %%H1, %%K3, %%T0, 0xee
+ vshufi64x2 %%H2, %%IN14, %%IN02, 0x44
+ vshufi64x2 %%H3, %%IN14, %%IN02, 0xee
+ vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1
+ vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3
+ vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5
+ vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7
+
+ vshufi64x2 %%H0, %%T3, %%T1, 0x44
+ vshufi64x2 %%H1, %%T3, %%T1, 0xee
+ vshufi64x2 %%H2, %%IN15, %%IN03, 0x44
+ vshufi64x2 %%H3, %%IN15, %%IN03, 0xee
+ vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1
+ vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3
+ vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5
+ vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7
+%endmacro
+
+;;; ===========================================================================
+;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT
+;;; ===========================================================================
+;;;
+;;; IN00-IN15 / R0/L0-R7/L7 [in/out]:
+;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
+;;; out: R0 - 16 x word0, L0 - 16 x word1 (IN02,IN04,IN06,IN10,IN12,IN13 are overwritten as scratch)
+;;; T0,T2 [clobbered] - temporary zmm registers
+;;; K0-K2,K4 [clobbered] - temporary zmm registers (there is no K3 parameter)
+;;; H0,H2 [clobbered] - temporary zmm registers
+%macro TRANSPOSE_IN_ONE 24
+%define %%IN00 %1 ; R0
+%define %%IN01 %2 ; L0
+%define %%IN02 %3 ; R1
+%define %%IN03 %4 ; L1
+%define %%IN04 %5 ; R2
+%define %%IN05 %6 ; L2
+%define %%IN06 %7 ; R3
+%define %%IN07 %8 ; L3
+%define %%IN08 %9 ; R4
+%define %%IN09 %10 ; L4
+%define %%IN10 %11 ; R5
+%define %%IN11 %12 ; L5
+%define %%IN12 %13 ; R6
+%define %%IN13 %14 ; L6
+%define %%IN14 %15 ; R7
+%define %%IN15 %16 ; L7
+%define %%T0 %17
+%define %%T2 %18
+%define %%K0 %19
+%define %%K1 %20
+%define %%K2 %21
+%define %%K4 %22
+%define %%H0 %23
+%define %%H2 %24
+
+ vpunpckldq %%K0, %%IN00, %%IN01 ; step 1: interleave 32-bit words of input pairs
+ vpunpckhdq %%K1, %%IN00, %%IN01 ; (this K1 value is overwritten below before being read)
+ vpunpckldq %%T0, %%IN02, %%IN03
+
+ vpunpckldq %%IN00, %%IN04, %%IN05 ; IN00-IN02 are reused as temporaries from here on
+ vpunpckhdq %%IN01, %%IN04, %%IN05 ; (this IN01 value is overwritten below before being read)
+ vpunpckldq %%IN02, %%IN06, %%IN07
+
+ vpunpcklqdq %%K2, %%K0, %%T0 ; step 2: interleave 64-bit words
+ vpunpckhqdq %%T2, %%K0, %%T0
+
+ vpunpcklqdq %%K0, %%IN00, %%IN02
+ vpunpckhqdq %%K1, %%IN00, %%IN02
+
+ vpunpckldq %%K4, %%IN08, %%IN09 ; steps 1 and 2 for inputs 8-15 (low halves only)
+ vpunpckldq %%IN04, %%IN10, %%IN11
+ vpunpckldq %%IN06, %%IN12, %%IN13
+ vpunpckldq %%IN10, %%IN14, %%IN15
+
+ vpunpcklqdq %%IN12, %%K4, %%IN04
+ vpunpckhqdq %%IN13, %%K4, %%IN04
+ vpunpcklqdq %%IN00, %%IN06, %%IN10
+ vpunpckhqdq %%IN01, %%IN06, %%IN10
+
+ vshufi64x2 %%H0, %%K2, %%K0, 0x44 ; step 3: gather the 128-bit chunks that form R0
+ vshufi64x2 %%H2, %%IN12, %%IN00, 0x44
+ vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0
+
+ vshufi64x2 %%H0, %%T2, %%K1, 0x44 ; ... and those that form L0
+ vshufi64x2 %%H2, %%IN13, %%IN01, 0x44
+ vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0
+%endmacro
+
+;;; ===========================================================================
+;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA OUTPUT
+;;; ===========================================================================
+;;;
+;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
+;;; in: R0 - 16 x word0, L0 - 16 x word1 (IN02-IN15 are not read)
+;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
+;;; T0,T2,T3 [clobbered] - temporary zmm registers (T0 is zeroed and used as the zero source)
+;;; K0-K3 [clobbered] - temporary zmm registers
+;;; H0,H1 [clobbered] - temporary zmm registers
+%macro TRANSPOSE_OUT_ONE 25
+%define %%IN00 %1 ; R0
+%define %%IN01 %2 ; L0
+%define %%IN02 %3 ; R1
+%define %%IN03 %4 ; L1
+%define %%IN04 %5 ; R2
+%define %%IN05 %6 ; L2
+%define %%IN06 %7 ; R3
+%define %%IN07 %8 ; L3
+%define %%IN08 %9 ; R4
+%define %%IN09 %10 ; L4
+%define %%IN10 %11 ; R5
+%define %%IN11 %12 ; L5
+%define %%IN12 %13 ; R6
+%define %%IN13 %14 ; L6
+%define %%IN14 %15 ; R7
+%define %%IN15 %16 ; L7
+%define %%T0 %17
+%define %%T2 %18
+%define %%T3 %19
+%define %%K0 %20
+%define %%K1 %21
+%define %%K2 %22
+%define %%K3 %23
+%define %%H0 %24
+%define %%H1 %25
+
+ vpxord %%T0, %%T0, %%T0 ; T0 = 0; takes the place of every input other than R0/L0
+
+ vpunpckldq %%K0, %%IN01, %%IN00 ; interleave 32-bit words of L0 and R0
+ vpunpckhdq %%K1, %%IN01, %%IN00
+
+ vpunpcklqdq %%K2, %%K0, %%T0 ; pair each 64-bit word with a zero 64-bit word
+ vpunpckhqdq %%T2, %%K0, %%T0
+ vpunpcklqdq %%K3, %%K1, %%T0
+ vpunpckhqdq %%T3, %%K1, %%T0
+
+ vshufi64x2 %%H0, %%K2, %%T0, 0x44 ; spread 128-bit chunks; the remaining chunks come from zero T0
+ vshufi64x2 %%H1, %%K2, %%T0, 0xee
+ vshufi64x2 %%IN00, %%H0, %%T0, 0x88 ; R0
+ vshufi64x2 %%IN04, %%H0, %%T0, 0xdd ; R2
+ vshufi64x2 %%IN08, %%H1, %%T0, 0x88 ; R4
+ vshufi64x2 %%IN12, %%H1, %%T0, 0xdd ; R6
+
+ vshufi64x2 %%H0, %%T2, %%T0, 0x44
+ vshufi64x2 %%H1, %%T2, %%T0, 0xee
+ vshufi64x2 %%IN01, %%H0, %%T0, 0x88 ; L0
+ vshufi64x2 %%IN05, %%H0, %%T0, 0xdd ; L2
+ vshufi64x2 %%IN09, %%H1, %%T0, 0x88 ; L4
+ vshufi64x2 %%IN13, %%H1, %%T0, 0xdd ; L6
+
+ vshufi64x2 %%H0, %%K3, %%T0, 0x44
+ vshufi64x2 %%H1, %%K3, %%T0, 0xee
+ vshufi64x2 %%IN02, %%H0, %%T0, 0x88 ; R1
+ vshufi64x2 %%IN06, %%H0, %%T0, 0xdd ; R3
+ vshufi64x2 %%IN10, %%H1, %%T0, 0x88 ; R5
+ vshufi64x2 %%IN14, %%H1, %%T0, 0xdd ; R7
+
+ vshufi64x2 %%H0, %%T3, %%T0, 0x44
+ vshufi64x2 %%H1, %%T3, %%T0, 0xee
+ vshufi64x2 %%IN03, %%H0, %%T0, 0x88 ; L1
+ vshufi64x2 %%IN07, %%H0, %%T0, 0xdd ; L3
+ vshufi64x2 %%IN11, %%H1, %%T0, 0x88 ; L5
+ vshufi64x2 %%IN15, %%H1, %%T0, 0xdd ; L7
+%endmacro
+
+;;; ===========================================================================
+;;; DES INITIALIZATION
+;;; key schedule transposition and IV set up (clobbers the IA0 register)
+;;; ===========================================================================
+;;;
+;;; STATE_KEYS [in] - KEYS in DES OOO STATE (read as 16 pointers, PTR_SZ apart)
+;;; STATE_IV [in] - IV in DES OOO STATE
+;;; KS [out] - place to store transposed key schedule (always written; no NULL check is done here)
+;;; IV0 [out] - r512; initialization vector
+;;; IV1 [out] - r512; initialization vector
+;;; T0-T27 [clobbered] - temporary r512
+%macro DES_INIT 33
+%define %%STATE_KEYS %1
+%define %%STATE_IV %2
+%define %%KS %3
+%define %%IV0 %4
+%define %%IV1 %5
+%define %%T0 %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%T6 %12
+%define %%T7 %13
+%define %%T8 %14
+%define %%T9 %15
+%define %%T10 %16
+%define %%T11 %17
+%define %%T12 %18
+%define %%T13 %19
+%define %%T14 %20
+%define %%T15 %21
+%define %%T16 %22
+%define %%T17 %23
+%define %%T18 %24
+%define %%T19 %25
+%define %%T20 %26
+%define %%T21 %27
+%define %%T22 %28
+%define %%T23 %29
+%define %%T24 %30
+%define %%T25 %31
+%define %%T26 %32
+%define %%T27 %33
+
+ ;; set up the key schedule
+ ;; - load first half of the keys & transpose
+ ;; - transpose and store
+ ;; note: IV0/IV1 are passed to TRANSPOSE_IN as temporaries; they are loaded at the end
+%assign IDX 0
+%rep 16
+ mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] ; IA0 = key schedule pointer of lane IDX
+ vmovdqu64 %%T %+ IDX, [IA0] ; first 64 bytes of that lane's key schedule
+%assign IDX (IDX + 1)
+%endrep
+ TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
+%assign IDX 0
+%rep 16
+ vmovdqu64 [%%KS + (IDX * 64)], %%T %+ IDX ; transposed rows 0-15 -> KS[0 .. 1023]
+%assign IDX (IDX + 1)
+%endrep
+ ;; - load second half of the keys & transpose
+ ;; - transpose and store
+ ;; note: IV0/IV1 are passed to TRANSPOSE_IN as temporaries; they are loaded at the end
+%assign IDX 0
+%rep 16
+ mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] ; IA0 = key schedule pointer of lane IDX
+ vmovdqu64 %%T %+ IDX, [IA0 + 64] ; second 64 bytes of that lane's key schedule
+%assign IDX (IDX + 1)
+%endrep
+ TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
+%assign IDX 0
+%rep 16
+ vmovdqu64 [%%KS + (16 * 64) + (IDX * 64)], %%T %+ IDX ; transposed rows 16-31 -> KS[1024 .. 2047]
+%assign IDX (IDX + 1)
+%endrep
+
+ ;; set up IV
+ ;; - they are already kept transposed so this is enough to load them
+ vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)]
+ vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)]
+%endmacro
+
+;;; ===========================================================================
+;;; 3DES INITIALIZATION
+;;; key schedule transposition and IV set up (clobbers the IA0 register)
+;;; ===========================================================================
+;;;
+;;; STATE_KEYS [in] - KEYS in 3DES OOO STATE (per lane: pointer to a table of 3 key schedule pointers)
+;;; STATE_IV [in] - IV in 3DES OOO STATE
+;;; KS1 [out] - place to store transposed key schedule (always written; no NULL check is done here)
+;;; KS2 [out] - place to store transposed key schedule (always written; no NULL check is done here)
+;;; KS3 [out] - place to store transposed key schedule (always written; no NULL check is done here)
+;;; IV0 [out] - r512; initialization vector
+;;; IV1 [out] - r512; initialization vector
+;;; T0-T27 [clobbered] - temporary r512
+;;; DIR [in] - ENC: key index 0,1,2 goes to KS1,KS2,KS3; any other value: key index 2,1,0
+%macro DES3_INIT 36
+%define %%STATE_KEYS %1
+%define %%STATE_IV %2
+%define %%KS1 %3
+%define %%KS2 %4
+%define %%KS3 %5
+%define %%IV0 %6
+%define %%IV1 %7
+%define %%T0 %8
+%define %%T1 %9
+%define %%T2 %10
+%define %%T3 %11
+%define %%T4 %12
+%define %%T5 %13
+%define %%T6 %14
+%define %%T7 %15
+%define %%T8 %16
+%define %%T9 %17
+%define %%T10 %18
+%define %%T11 %19
+%define %%T12 %20
+%define %%T13 %21
+%define %%T14 %22
+%define %%T15 %23
+%define %%T16 %24
+%define %%T17 %25
+%define %%T18 %26
+%define %%T19 %27
+%define %%T20 %28
+%define %%T21 %29
+%define %%T22 %30
+%define %%T23 %31
+%define %%T24 %32
+%define %%T25 %33
+%define %%T26 %34
+%define %%T27 %35
+%define %%DIR %36
+
+%ifidn %%DIR, ENC
+%assign KEY_IDX 0
+%else
+%assign KEY_IDX 2
+%endif
+%assign KS_IDX 1
+
+%rep 3
+ ;; set up the key schedule
+ ;; - load first half of the keys & transpose
+ ;; - transpose and store
+ ;; note: IV0/IV1 are passed to TRANSPOSE_IN as temporaries; they are loaded at the end
+
+%assign IDX 0
+%rep 16
+ mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] ; IA0 = lane IDX's table of key schedule pointers
+ mov IA0, [IA0 + (KEY_IDX * PTR_SZ)] ; IA0 = key schedule KEY_IDX of that lane
+ vmovdqu64 %%T %+ IDX, [IA0] ; first 64 bytes of that key schedule
+%assign IDX (IDX + 1)
+%endrep
+ TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
+%assign IDX 0
+%rep 16
+ vmovdqu64 [%%KS %+ KS_IDX + (IDX * 64)], %%T %+ IDX ; transposed rows 0-15 of schedule KS_IDX
+%assign IDX (IDX + 1)
+%endrep
+ ;; - load second half of the keys & transpose
+ ;; - transpose and store
+ ;; note: IV0/IV1 are passed to TRANSPOSE_IN as temporaries; they are loaded at the end
+%assign IDX 0
+%rep 16
+ mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] ; IA0 = lane IDX's table of key schedule pointers
+ mov IA0, [IA0 + (KEY_IDX * PTR_SZ)] ; IA0 = key schedule KEY_IDX of that lane
+ vmovdqu64 %%T %+ IDX, [IA0 + 64] ; second 64 bytes of that key schedule
+%assign IDX (IDX + 1)
+%endrep
+ TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
+%assign IDX 0
+%rep 16
+ vmovdqu64 [%%KS %+ KS_IDX + (16 * 64) + (IDX * 64)], %%T %+ IDX ; transposed rows 16-31 of schedule KS_IDX
+%assign IDX (IDX + 1)
+%endrep
+
+%ifidn %%DIR, ENC
+%assign KEY_IDX (KEY_IDX + 1)
+%else
+%assign KEY_IDX (KEY_IDX - 1)
+%endif
+%assign KS_IDX (KS_IDX + 1)
+%endrep ; KEY_IDX / KS_IDX
+
+ ;; set up IV
+ ;; - they are already kept transposed so this is enough to load them
+ vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)]
+ vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)]
+
+%endmacro
+
+;;; ===========================================================================
+;;; DES FINISH
+;;; Advance every in/out pointer kept in STATE by SIZE and write the IV back
+;;; ===========================================================================
+;;;
+;;; Needs: STATE & SIZE
+;;; IV0 [in] - r512; initialization vector, stored at _des_args_IV + 0
+;;; IV1 [in] - r512; initialization vector, stored at _des_args_IV + 64
+;;; T0-T4 [clobbered] - temporary r512 registers
+%macro DES_FINISH 7
+%define %%IV0 %1
+%define %%IV1 %2
+%define %%IN_LO %3
+%define %%IN_HI %4
+%define %%OUT_LO %5
+%define %%OUT_HI %6
+%define %%STEP %7
+
+ ;; fetch the in and out pointer tables (two vectors of 8 pointers each)
+ vmovdqu64 %%IN_LO, [STATE + _des_args_in + (0 * PTR_SZ)]
+ vmovdqu64 %%IN_HI, [STATE + _des_args_in + (8 * PTR_SZ)]
+ vmovdqu64 %%OUT_LO, [STATE + _des_args_out + (0 * PTR_SZ)]
+ vmovdqu64 %%OUT_HI, [STATE + _des_args_out + (8 * PTR_SZ)]
+
+ ;; every pointer moves forward by the same amount: SIZE
+ vpbroadcastq %%STEP, SIZE
+ vpaddq %%IN_LO, %%IN_LO, %%STEP
+ vpaddq %%OUT_LO, %%OUT_LO, %%STEP
+ vpaddq %%IN_HI, %%IN_HI, %%STEP
+ vpaddq %%OUT_HI, %%OUT_HI, %%STEP
+
+ ;; write the advanced pointers back
+ vmovdqu64 [STATE + _des_args_in + (0 * PTR_SZ)], %%IN_LO
+ vmovdqu64 [STATE + _des_args_in + (8 * PTR_SZ)], %%IN_HI
+ vmovdqu64 [STATE + _des_args_out + (0 * PTR_SZ)], %%OUT_LO
+ vmovdqu64 [STATE + _des_args_out + (8 * PTR_SZ)], %%OUT_HI
+
+ ;; keep the current IV in STATE for the next call
+ vmovdqu64 [STATE + _des_args_IV + (0 * 64)], %%IV0
+ vmovdqu64 [STATE + _des_args_IV + (1 * 64)], %%IV1
+%endmacro
+
+;;; ===========================================================================
+;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY
+;;; ===========================================================================
+;;;
+;;; Handles, for every lane whose _des_args_PLen entry is non-zero, one
+;;; partial block of PLen bytes: the lane's IV is DES encrypted and xor-ed
+;;; with the byte-masked input, and the result is written with the same
+;;; byte mask. Processed lanes get their PLen entry cleared.
+;;;
+;;; Needs: STATE, IA0-IA2
+;;; ENC_DEC [in] - encrypt (ENC) or decrypt (DEC) selection
+;;; KS [in] - key schedule
+;;; T0-T24 [clobbered] - temporary r512
+;;; T_IN [in] - 16 * 8 byte storage
+;;; T_OUT [in] - 16 * 8 byte storage
+;;; T_IV [in] - 16 * 8 byte storage
+;;; T_MASK [in] - 16 * 4 byte storage
+;;; (arguments above are listed in the positional order the macro expects)
+;;;
+;;; NOTE: clobbers OpMask registers
+%macro DES_CFB_ONE 31
+%define %%ENC_DEC %1
+%define %%KS %2
+%define %%T0 %3
+%define %%T1 %4
+%define %%T2 %5
+%define %%T3 %6
+%define %%T4 %7
+%define %%T5 %8
+%define %%T6 %9
+%define %%T7 %10
+%define %%T8 %11
+%define %%T9 %12
+%define %%T10 %13
+%define %%T11 %14
+%define %%T12 %15
+%define %%T13 %16
+%define %%T14 %17
+%define %%T15 %18
+%define %%T16 %19
+%define %%T17 %20
+%define %%T18 %21
+%define %%T19 %22
+%define %%T20 %23
+%define %%T21 %24
+%define %%T22 %25
+%define %%T23 %26
+%define %%T24 %27
+%define %%T_IN %28
+%define %%T_OUT %29
+%define %%T_IV %30
+%define %%T_MASK %31
+
+ ;; - find mask for non-zero partial lengths
+ ;; - %%T10 is an all-zero vector, %%T0 keeps the 16 partial lengths
+ ;; - k3 / IA0 get one bit per lane with PLen != 0
+ vpxord %%T10, %%T10, %%T10
+ vmovdqu64 %%T0, [STATE + _des_args_PLen]
+ vpcmpd k3, %%T0, %%T10, 4 ; NEQ
+ kmovw DWORD(IA0), k3
+ movzx DWORD(IA0), WORD(IA0)
+ or DWORD(IA0), DWORD(IA0)
+ jz %%_des_cfb_one_end ; no non-zero partial lengths
+
+%ifidn %%ENC_DEC, ENC
+ ;; For encryption case we need to make sure that
+ ;; all full blocks are complete before proceeding
+ ;; with CFB partial block.
+ ;; To do that current out position is compared against
+ ;; calculated last full block position.
+ ;; - k4 covers lanes 0-7, k5 covers lanes 8-15 (out == LOut)
+ ;; - both are merged into a 16-bit mask in IA2 and and-ed into IA0
+ vmovdqu64 %%T1, [STATE + _des_args_out + (0*8)]
+ vmovdqu64 %%T2, [STATE + _des_args_LOut + (0*8)]
+ vmovdqu64 %%T3, [STATE + _des_args_out + (8*8)]
+ vmovdqu64 %%T4, [STATE + _des_args_LOut + (8*8)]
+ vpcmpq k4, %%T1, %%T2, 0 ; EQ
+ vpcmpq k5, %%T3, %%T4, 0 ; EQ
+ kmovw DWORD(IA1), k4
+ movzx DWORD(IA1), BYTE(IA1)
+ kmovw DWORD(IA2), k5
+ movzx DWORD(IA2), BYTE(IA2)
+ shl DWORD(IA2), 8
+ or DWORD(IA2), DWORD(IA1)
+ and DWORD(IA0), DWORD(IA2)
+ jz %%_des_cfb_one_end ; no non-zero lengths left
+ kmovw k3, DWORD(IA0)
+%endif
+ ;; Calculate ((1 << partial_bytes) - 1)
+ ;; in order to get the mask for loads and stores
+ ;; k3 & IA0 - hold valid mask
+ ;; - lanes outside k3 get a zero mask ({z}), so nothing is loaded/stored
+ ;; for them later on
+ vmovdqa64 %%T1, [rel vec_ones_32b]
+ vpsllvd %%T2{k3}{z}, %%T1, %%T0
+ vpsubd %%T2{k3}{z}, %%T2, %%T1
+ vmovdqu64 [%%T_MASK], %%T2
+
+ ;; clear selected partial lens not to do them twice
+ vmovdqu32 [STATE + _des_args_PLen]{k3}, %%T10
+
+ ;; copy IV, in and out pointers
+ ;; - work is done on the %%T_IN / %%T_OUT / %%T_IV copies so that the
+ ;; values in STATE are left untouched
+ vmovdqu64 %%T1, [STATE + _des_args_in + (0*PTR_SZ)]
+ vmovdqu64 %%T2, [STATE + _des_args_in + (8*PTR_SZ)]
+ vmovdqu64 %%T3, [STATE + _des_args_out + (0*PTR_SZ)]
+ vmovdqu64 %%T4, [STATE + _des_args_out + (8*PTR_SZ)]
+ vmovdqu64 %%T5, [STATE + _des_args_IV + (0*64)]
+ vmovdqu64 %%T6, [STATE + _des_args_IV + (1*64)]
+ vmovdqu64 [%%T_IN + (0*PTR_SZ)], %%T1
+ vmovdqu64 [%%T_IN + (8*PTR_SZ)], %%T2
+ vmovdqu64 [%%T_OUT + (0*PTR_SZ)], %%T3
+ vmovdqu64 [%%T_OUT + (8*PTR_SZ)], %%T4
+ vmovdqu64 [%%T_IV + (0*64)], %%T5
+ vmovdqu64 [%%T_IV + (1*64)], %%T6
+
+ ;; calculate last block case mask
+ ;; - first block case requires no modifications to in/out/IV
+ ;; - IA1 = (lanes with BLen != 0) & (valid lanes in IA0)
+ vmovdqu64 %%T1, [STATE + _des_args_BLen]
+ vpcmpd k2, %%T1, %%T10, 4 ; NEQ
+ kmovw DWORD(IA1), k2
+ and DWORD(IA1), DWORD(IA0)
+ jz %%_des_cfb_one_no_last_blocks
+
+ ;; set up IV, in and out for the last block case
+ ;; - Last block needs in and out to be set differently (decryption only)
+ ;; - IA1 holds the last block mask
+%ifidn %%ENC_DEC, DEC
+ ;; - IA0 takes over the last block mask, k4 / k5 get its low / high byte
+ ;; - selected lanes have their in/out copies replaced with LIn / LOut
+ mov DWORD(IA0), DWORD(IA1)
+ mov DWORD(IA2), DWORD(IA1)
+ shr DWORD(IA1), 8
+ and DWORD(IA2), 0xff
+ kmovw k4, DWORD(IA2)
+ kmovw k5, DWORD(IA1)
+ vmovdqu64 %%T1, [STATE + _des_args_LOut + (0*PTR_SZ)]
+ vmovdqu64 %%T2, [STATE + _des_args_LOut + (8*PTR_SZ)]
+ vmovdqu64 %%T3, [STATE + _des_args_LIn + (0*PTR_SZ)]
+ vmovdqu64 %%T4, [STATE + _des_args_LIn + (8*PTR_SZ)]
+ vmovdqu64 [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1
+ vmovdqu64 [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2
+ vmovdqu64 [%%T_IN + (0*PTR_SZ)]{k4}, %%T3
+ vmovdqu64 [%%T_IN + (8*PTR_SZ)]{k5}, %%T4
+%endif ; decryption
+ ;; - IV has to be set differently for CFB as well
+ ;; - IA0 holds the last block mask
+ ;; NOTE(review): IA0 is only reloaded from IA1 inside the DEC branch
+ ;; above; in the ENC build it still holds the valid lane mask here, so
+ ;; lanes with BLen == 0 are not filtered out - confirm this is intended.
+ ;; - for each selected lane the 8 bytes ending at LOut (ENC) or LIn (DEC)
+ ;; are read and split: low dword to T_IV[IDX], high dword to
+ ;; T_IV[16 + IDX]
+%assign IDX 0
+%rep 16
+ test DWORD(IA0), (1 << IDX)
+ jz %%_des_cfb_one_copy_iv_next %+ IDX
+%ifidn %%ENC_DEC, ENC
+ mov IA2, [STATE + _des_args_LOut + (IDX*PTR_SZ)]
+%else
+ mov IA2, [STATE + _des_args_LIn + (IDX*PTR_SZ)]
+%endif
+ mov IA2, [IA2 - 8]
+ mov [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2)
+ shr IA2, 32
+ mov [%%T_IV + (16*4) + (IDX*4)], DWORD(IA2)
+%%_des_cfb_one_copy_iv_next %+ IDX:
+%assign IDX (IDX + 1)
+%endrep
+
+%%_des_cfb_one_no_last_blocks:
+ ;; Uffff ... finally let's do some DES CFB
+ ;; - let's use T_IN, T_OUT, T_IV and T_MASK
+
+ ;; - load data with the corresponding masks & transpose
+ ;; - T0 to T15 will hold the data
+ ;; - IA0 is zeroed once; each 32-bit mask load then leaves its upper half
+ ;; clear for the 64-bit kmovq
+ ;; - bytes outside the lane's mask are zeroed ({z})
+ xor IA0, IA0
+%assign IDX 0
+%assign K_IDX 1
+%rep 16
+ mov IA1, [%%T_IN + (IDX*PTR_SZ)]
+ mov DWORD(IA0), [%%T_MASK + (IDX*4)]
+ kmovq k %+ K_IDX, IA0
+ vmovdqu8 %%T %+ IDX{k %+ K_IDX}{z}, [IA1]
+%assign IDX (IDX + 1)
+%assign K_IDX (K_IDX + 1)
+%if K_IDX > 7
+%assign K_IDX 1 ; iterate through K1 to K7
+%endif
+%endrep
+ ;; - transpose the data in T0 to T15, T16 to T23 are clobbered
+ TRANSPOSE_IN_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23
+
+ ;; - set up IV and %%T16 & %%T17 used as IV0 and IV1
+ vmovdqu64 %%T16, [%%T_IV + (0 * 64)] ;IV0
+ vmovdqu64 %%T17, [%%T_IV + (1 * 64)] ;IV1
+ ;; DES encrypt
+ ;; - R0 - %%T0
+ ;; - L0 - %%T1
+ ;; - the cipher is invoked with ENC for both values of %%ENC_DEC
+ DES_ENC_DEC ENC, %%T16, %%T17, %%KS, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13
+ ;; CFB style xor with R0/L0 with IV
+ ;; - IV0 - %%T16
+ ;; - IV1 - %%T17
+ vpxord %%T2, %%T17, %%T0 ; R0 ^ IV1
+ vpxord %%T0, %%T16, %%T1 ; L0 ^ IV0
+ vmovdqa64 %%T1, %%T2
+ ;; - new R0 = L0 ^ IV0 (%%T0)
+ ;; - new L0 = R0 ^ IV1 (%%T1)
+
+ ;; Transpose the data out
+ ;; - %%T2 to %%T24 clobbered
+ TRANSPOSE_OUT_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24
+
+ ;; Store the transposed data
+ ;; - T0 to T15 will hold the data
+ ;; - same per-lane byte masks as for the loads, so only PLen bytes are
+ ;; written through each T_OUT pointer
+ xor IA0, IA0
+%assign IDX 0
+%assign K_IDX 1
+%rep 16
+ mov IA1, [%%T_OUT + (IDX*PTR_SZ)]
+ mov DWORD(IA0), [%%T_MASK + (IDX*4)]
+ kmovq k %+ K_IDX, IA0
+ vmovdqu8 [IA1]{k %+ K_IDX}, %%T %+ IDX
+%assign IDX (IDX + 1)
+%assign K_IDX (K_IDX + 1)
+%if K_IDX > 7
+%assign K_IDX 1 ; iterate through K1 to K7
+%endif
+%endrep
+
+%ifdef SAFE_DATA
+ ;; Clear copied IV's
+ vpxorq %%T5, %%T5
+ vmovdqu64 [%%T_IV + (0*64)], %%T5
+ vmovdqu64 [%%T_IV + (1*64)], %%T5
+%endif
+
+%%_des_cfb_one_end:
+
+%endmacro
+
+;;; ===========================================================================
+;;; Converts the remaining length (SIZE - OFFSET) into a mask of DES blocks
+;;; ===========================================================================
+;;;
+;;; MASK [out] - (1 << ((SIZE - OFFSET) / 8)) - 1, i.e. one bit per whole
+;;; 8-byte block left; meant for masked 64b loads and stores (r64)
+;;; USES: rcx as the shift count and IA1 as scratch
+;;; - IA1 is clobbered
+;;; - rcx is restored on exit unless IA1 itself is rcx
+;;; ASSUMES: SIZE - OFFSET < 64
+%macro GET_MASK8 1
+%define %%MASK %1
+
+ ;; the variable shift below takes its count from cl, so the count has
+ ;; to be built in rcx; park the caller's rcx in IA1 when they differ
+%ifnidn IA1, rcx
+%define myrcx rcx
+ mov IA1, rcx
+%else
+%define myrcx IA1
+%endif
+ ;; myrcx = bytes left
+ mov myrcx, SIZE
+ sub myrcx, OFFSET
+ ;; MASK = (1 << (bytes left / DES block size)) - 1
+ mov DWORD(%%MASK), 1
+ shr DWORD(myrcx), 3
+ shl DWORD(%%MASK), BYTE(myrcx)
+ sub DWORD(%%MASK), 1
+%ifnidn IA1, rcx
+ ;; hand rcx back to the caller
+ mov rcx, IA1
+%endif
+%endmacro
+
+;;; ===========================================================================
+;;; DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
+;;; ===========================================================================
+;;;
+;;; Expects the first block (ZW0/ZW1) to be xor-ed with the IV already.
+;;; Every block but the last is encrypted and then chained into the
+;;; following block; the last encrypted block is copied to ZIV0/ZIV1.
+;;;
+;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
+;;; DES_KS [in] - pointer to transposed key schedule
+;;;
+;;; NOTE: clobbers OpMask registers
+;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
+%macro GEN_DES_ENC_CIPHER 2
+%define %%NUM_DES_BLOCKS %1
+%define %%DES_KS %2
+
+ ;; RN/LN index the current block, RNN/LNN the block after it
+%assign RN 0
+%assign LN 1
+%assign RNN 2
+%assign LNN 3
+%rep %%NUM_DES_BLOCKS - 1
+ ;; encrypt the current block and feed it into the next one
+ DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0
+ vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0
+ ;; the next block becomes the current one
+%assign RN RNN
+%assign LN LNN
+%assign RNN (RN + 2)
+%assign LNN (LN + 2)
+%endrep
+ ;; last block: nothing left to chain into, it becomes the new IV
+ DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7
+ vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7
+%endmacro
+
+;;; ===========================================================================
+;;; DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
+;;; ===========================================================================
+;;;
+;;; For each block: the input pair is saved, the block is decrypted and
+;;; xor-ed with ZIV0/ZIV1, then the saved input pair becomes ZIV0/ZIV1 for
+;;; the following block (and for the caller after the last one).
+;;;
+;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
+;;; DES_KS [in] - pointer to transposed key schedule
+;;;
+;;; NOTE: clobbers OpMask registers
+;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
+%macro GEN_DES_DEC_CIPHER 2
+%define %%NUM_DES_BLOCKS %1
+%define %%DES_KS %2
+
+%assign RN 0
+%assign LN 1
+%rep %%NUM_DES_BLOCKS
+ ;; stash the input pair, it is the chaining value for the next block
+ vmovdqa64 ZTMP12, ZW %+ RN
+ vmovdqa64 ZTMP13, ZW %+ LN
+ DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ ;; CBC: mix the current chaining value into the cipher output
+ vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0
+ vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1
+ ;; roll the chaining value forward
+ vmovdqa64 ZIV1, ZTMP13
+ vmovdqa64 ZIV0, ZTMP12
+%assign LN (LN + 2)
+%assign RN (RN + 2)
+%endrep
+%endmacro
+
+;;; ===========================================================================
+;;; 3DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
+;;; ===========================================================================
+;;;
+;;; Expects the first block (ZW0/ZW1) to be xor-ed with the IV already.
+;;; Each block goes through ENC(KS1), DEC(KS2), ENC(KS3); the result is
+;;; chained into the next block, or copied to ZIV0/ZIV1 for the last one.
+;;;
+;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
+;;; DES_KS1 [in] - pointer to transposed key schedule 1
+;;; DES_KS2 [in] - pointer to transposed key schedule 2
+;;; DES_KS3 [in] - pointer to transposed key schedule 3
+;;;
+;;; NOTE: clobbers OpMask registers
+;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
+%macro GEN_3DES_ENC_CIPHER 4
+%define %%NUM_DES_BLOCKS %1
+%define %%DES_KS1 %2
+%define %%DES_KS2 %3
+%define %%DES_KS3 %4
+
+ ;; RN/LN index the current block, RNN/LNN the block after it
+%assign RN 0
+%assign LN 1
+%assign RNN 2
+%assign LNN 3
+%rep %%NUM_DES_BLOCKS
+ ;; stage 1: encrypt with key schedule 1
+ DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ ;; stage 2: decrypt with key schedule 2 (register pair passed swapped)
+ DES_ENC_DEC DEC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ ;; stage 3: encrypt with key schedule 3
+ DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+%if (RNN >= (%%NUM_DES_BLOCKS * 2))
+ ;; last block: it becomes the new IV
+ vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7
+ vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7
+%else
+ ;; chain the cipher text into the following block
+ vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0
+ vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0
+%endif
+
+%assign LNN (LNN + 2)
+%assign RNN (RNN + 2)
+%assign LN (LN + 2)
+%assign RN (RN + 2)
+%endrep
+
+%endmacro
+
+;;; ===========================================================================
+;;; 3DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
+;;; ===========================================================================
+;;;
+;;; For each block: the input pair is saved, the block goes through
+;;; DEC(KS1), ENC(KS2), DEC(KS3) and is xor-ed with ZIV0/ZIV1, then the
+;;; saved input pair becomes ZIV0/ZIV1 for the following block.
+;;;
+;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
+;;; DES_KS1 [in] - pointer to transposed key schedule 1
+;;; DES_KS2 [in] - pointer to transposed key schedule 2
+;;; DES_KS3 [in] - pointer to transposed key schedule 3
+;;;
+;;; NOTE: clobbers OpMask registers
+;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
+%macro GEN_3DES_DEC_CIPHER 4
+%define %%NUM_DES_BLOCKS %1
+%define %%DES_KS1 %2
+%define %%DES_KS2 %3
+%define %%DES_KS3 %4
+
+%assign RN 0
+%assign LN 1
+%rep %%NUM_DES_BLOCKS
+ ;; stash the input pair, it is the chaining value for the next block
+ vmovdqa64 ZTMP12, ZW %+ RN
+ vmovdqa64 ZTMP13, ZW %+ LN
+ ;; stage 1: decrypt with key schedule 1
+ DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ ;; stage 2: encrypt with key schedule 2 (register pair passed swapped)
+ DES_ENC_DEC ENC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ ;; stage 3: decrypt with key schedule 3
+ DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+ ;; CBC: mix the current chaining value into the cipher output
+ vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0
+ vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1
+ ;; roll the chaining value forward
+ vmovdqa64 ZIV1, ZTMP13
+ vmovdqa64 ZIV0, ZTMP12
+
+%assign LN (LN + 2)
+%assign RN (RN + 2)
+%endrep
+
+%endmacro
+
+;;; ===========================================================================
+;;; DES CBC / DOCSIS DES ENCRYPT
+;;; ===========================================================================
+;;;
+;;; Encrypts SIZE bytes on each of the 16 lanes described by STATE:
+;;; 1) key schedule(s) and IV are set up in a 64-byte aligned stack frame
+;;; 2) whole 64-byte chunks (8 DES blocks per lane) are processed in a loop
+;;; 3) the remaining 1 to 7 DES blocks are processed with masked
+;;; loads/stores
+;;; 4) in/out pointers and IV in STATE are updated (DES_FINISH)
+;;; 5) DOCSIS only: the partial block is processed (DES_CFB_ONE)
+;;;
+;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
+;;; 3DES (3DES CBC)
+;;;
+;;; Needs: STATE, SIZE, OFFSET, IA0-IA2, INP0-INP4, ZW0-ZW15, ZTMP0-ZTMP13,
+;;; ZIV0, ZIV1
+;;; NOTE: clobbers OpMask registers
+;;; NOTE: clobbers rax (holds the caller's rsp while the frame is built);
+;;; r12-r15 are saved in the frame and restored on exit
+%macro GENERIC_DES_ENC 1
+%define %%DES_DOCSIS %1
+
+ ;; push the registers and allocate the stack frame
+ ;; - the frame is aligned down to 64 bytes, so the original rsp is
+ ;; kept inside the frame and reloaded on exit
+ mov rax, rsp
+ sub rsp, STACKFRAME_size
+ and rsp, -64
+ mov [rsp + _rsp_save], rax ; original SP
+ mov [rsp + _gpr_save + 0*8], r12
+ mov [rsp + _gpr_save + 1*8], r13
+ mov [rsp + _gpr_save + 2*8], r14
+ mov [rsp + _gpr_save + 3*8], r15
+
+%ifnidn %%DES_DOCSIS, 3DES
+ ;; DES and DOCSIS DES
+ DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+%else
+ ;; 3DES
+ DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC
+%endif
+ ;; keep the full length in the frame; inside the loop SIZE is the
+ ;; length rounded down to a multiple of 64 bytes
+ mov [rsp + _size_save], SIZE
+ and SIZE, -64
+ xor OFFSET, OFFSET
+ ;; This loop processes message in blocks of 64 bytes.
+ ;; Anything smaller than 64 bytes is handled separately after the loop.
+%%_gen_des_enc_loop:
+ cmp OFFSET, SIZE
+ jz %%_gen_des_enc_loop_end
+ ;; run loads
+ ;; - lanes 0 to 7: 64 bytes from each input pointer at OFFSET
+ mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
+ vmovdqu64 ZW0, [IA0 + OFFSET]
+ vmovdqu64 ZW1, [IA1 + OFFSET]
+ vmovdqu64 ZW2, [IA2 + OFFSET]
+ vmovdqu64 ZW3, [INP0 + OFFSET]
+ vmovdqu64 ZW4, [INP1 + OFFSET]
+ vmovdqu64 ZW5, [INP2 + OFFSET]
+ vmovdqu64 ZW6, [INP3 + OFFSET]
+ vmovdqu64 ZW7, [INP4 + OFFSET]
+
+ ;; - lanes 8 to 15
+ mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
+ vmovdqu64 ZW8, [IA0 + OFFSET]
+ vmovdqu64 ZW9, [IA1 + OFFSET]
+ vmovdqu64 ZW10, [IA2 + OFFSET]
+ vmovdqu64 ZW11, [INP0 + OFFSET]
+ vmovdqu64 ZW12, [INP1 + OFFSET]
+ vmovdqu64 ZW13, [INP2 + OFFSET]
+ vmovdqu64 ZW14, [INP3 + OFFSET]
+ vmovdqu64 ZW15, [INP4 + OFFSET]
+
+ ;; Transpose input
+ TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; DES CBC ENC comes here
+ ;; - only the first block is xor-ed with the IV here, the cipher
+ ;; macros chain the remaining blocks themselves
+ vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0
+ vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1
+
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 8, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+
+ ;; transpose data on output
+ TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+ ;; run stores
+ ;; - lanes 0 to 7
+ mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET], ZW0
+ vmovdqu64 [IA1 + OFFSET], ZW1
+ vmovdqu64 [IA2 + OFFSET], ZW2
+ vmovdqu64 [INP0 + OFFSET], ZW3
+ vmovdqu64 [INP1 + OFFSET], ZW4
+ vmovdqu64 [INP2 + OFFSET], ZW5
+ vmovdqu64 [INP3 + OFFSET], ZW6
+ vmovdqu64 [INP4 + OFFSET], ZW7
+
+ ;; - lanes 8 to 15
+ mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET], ZW8
+ vmovdqu64 [IA1 + OFFSET], ZW9
+ vmovdqu64 [IA2 + OFFSET], ZW10
+ vmovdqu64 [INP0 + OFFSET], ZW11
+ vmovdqu64 [INP1 + OFFSET], ZW12
+ vmovdqu64 [INP2 + OFFSET], ZW13
+ vmovdqu64 [INP3 + OFFSET], ZW14
+ vmovdqu64 [INP4 + OFFSET], ZW15
+
+ add OFFSET, 64
+ jmp %%_gen_des_enc_loop
+%%_gen_des_enc_loop_end:
+ ;; This is where we check if there is anything less than 64 bytes
+ ;; of message left for processing.
+ mov SIZE, [rsp + _size_save]
+ cmp OFFSET, SIZE
+ jz %%_gen_des_enc_part_end
+ ;; convert the number of bytes left (SIZE - OFFSET) into a qword mask
+ ;; with one bit per whole 8-byte DES block
+ GET_MASK8 IA0 ; IA0 = mask
+
+ ;; k7 drives all masked loads/stores below; the mask is also parked in
+ ;; the frame because IA0 is reused as a pointer register
+ kmovw k7, DWORD(IA0)
+ mov [rsp + _mask_save], IA0
+ ;; run masked loads
+ ;; - qwords outside k7 are zeroed ({z})
+ mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
+ vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET]
+ vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET]
+ vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET]
+ vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET]
+ vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET]
+ vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET]
+ vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET]
+ vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET]
+
+ mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
+ vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET]
+ vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET]
+ vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET]
+ vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET]
+ vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET]
+ vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET]
+ vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET]
+ vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET]
+
+ ;; Transpose input
+ TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; DES CBC ENC comes here
+ vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0
+ vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1
+
+ ;; Dispatch on the number of blocks left. The mask is (1 << blocks) - 1,
+ ;; so its low byte is 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f or 0x7f for
+ ;; 1 to 7 blocks; the compares below form a small decision tree on it.
+ mov IA0, [rsp + _mask_save]
+ cmp BYTE(IA0), 0x0f
+ ja %%_gt_4
+ jz %%_blocks_4
+
+ cmp BYTE(IA0), 0x03
+ ja %%_blocks_3
+ jz %%_blocks_2
+
+ ;; process one block and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 1, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_2:
+ ;; process two blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 2, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_3:
+ ;; process three blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 3, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_4:
+ ;; process four blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 4, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_gt_4:
+ ;; more than four blocks: 0x3f separates 5 / 6 / 7
+ cmp BYTE(IA0), 0x3f
+ ja %%_blocks_7
+ jz %%_blocks_6
+%%_blocks_5:
+ ;; process five blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 5, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_6:
+ ;; process six blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 6, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_7:
+ ;; process seven blocks and move to transpose out
+ ;; (falls through, no jump needed)
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_ENC_CIPHER 7, rsp + _key_sched
+%else
+ GEN_3DES_ENC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+
+%%_transpose_out:
+ ;; transpose data on output
+ TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; run masked stores
+ ;; - only the qwords selected by k7 are written
+ mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET]{k7}, ZW0
+ vmovdqu64 [IA1 + OFFSET]{k7}, ZW1
+ vmovdqu64 [IA2 + OFFSET]{k7}, ZW2
+ vmovdqu64 [INP0 + OFFSET]{k7}, ZW3
+ vmovdqu64 [INP1 + OFFSET]{k7}, ZW4
+ vmovdqu64 [INP2 + OFFSET]{k7}, ZW5
+ vmovdqu64 [INP3 + OFFSET]{k7}, ZW6
+ vmovdqu64 [INP4 + OFFSET]{k7}, ZW7
+
+ mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET]{k7}, ZW8
+ vmovdqu64 [IA1 + OFFSET]{k7}, ZW9
+ vmovdqu64 [IA2 + OFFSET]{k7}, ZW10
+ vmovdqu64 [INP0 + OFFSET]{k7}, ZW11
+ vmovdqu64 [INP1 + OFFSET]{k7}, ZW12
+ vmovdqu64 [INP2 + OFFSET]{k7}, ZW13
+ vmovdqu64 [INP3 + OFFSET]{k7}, ZW14
+ vmovdqu64 [INP4 + OFFSET]{k7}, ZW15
+%%_gen_des_enc_part_end:
+
+ ;; store IV and update pointers
+ ;; - SIZE holds the full length again at this point
+ DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
+
+ ;; CFB part for DOCSIS
+ ;; - runs after DES_FINISH, so it reads the updated in/out pointers and
+ ;; IV back from STATE
+%ifidn %%DES_DOCSIS, DOCSIS
+ DES_CFB_ONE ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
+%endif
+
+ ;; presumably wipes the key schedule(s) from the frame; the macro is
+ ;; defined outside of this section
+ CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0
+
+ ;; restore stack pointer and registers
+ mov r12, [rsp + _gpr_save + 0*8]
+ mov r13, [rsp + _gpr_save + 1*8]
+ mov r14, [rsp + _gpr_save + 2*8]
+ mov r15, [rsp + _gpr_save + 3*8]
+ mov rsp, [rsp + _rsp_save] ; original SP
+%endmacro
+
+;;; ===========================================================================
+;;; DES CBC / DOCSIS DES DECRYPT
+;;; ===========================================================================
+;;;
+;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
+;;; 3DES (3DES CBC)
+;;;
+;;; NOTE: clobbers OpMask registers
+%macro GENERIC_DES_DEC 1
+%define %%DES_DOCSIS %1
+
+ ;; push the registers and allocate the stack frame
+ mov rax, rsp
+ sub rsp, STACKFRAME_size
+ and rsp, -64
+ mov [rsp + _rsp_save], rax ; original SP
+ mov [rsp + _gpr_save + 0*8], r12
+ mov [rsp + _gpr_save + 1*8], r13
+ mov [rsp + _gpr_save + 2*8], r14
+ mov [rsp + _gpr_save + 3*8], r15
+
+%ifnidn %%DES_DOCSIS, 3DES
+ ;; DES and DOCSIS
+ DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
+%else
+ ;; 3DES
+ DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC
+%endif
+
+ ;; CFB part for DOCSIS
+%ifidn %%DES_DOCSIS, DOCSIS
+ DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
+%endif
+
+ mov [rsp + _size_save], SIZE
+ and SIZE, -64
+ xor OFFSET, OFFSET
+ ;; This loop processes message in blocks of 64 bytes.
+ ;; Anything smaller than 64 bytes is handled separately after the loop.
+%%_gen_des_dec_loop:
+ cmp OFFSET, SIZE
+ jz %%_gen_des_dec_loop_end
+ ;; run loads
+ mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
+ vmovdqu64 ZW0, [IA0 + OFFSET]
+ vmovdqu64 ZW1, [IA1 + OFFSET]
+ vmovdqu64 ZW2, [IA2 + OFFSET]
+ vmovdqu64 ZW3, [INP0 + OFFSET]
+ vmovdqu64 ZW4, [INP1 + OFFSET]
+ vmovdqu64 ZW5, [INP2 + OFFSET]
+ vmovdqu64 ZW6, [INP3 + OFFSET]
+ vmovdqu64 ZW7, [INP4 + OFFSET]
+
+ mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
+ vmovdqu64 ZW8, [IA0 + OFFSET]
+ vmovdqu64 ZW9, [IA1 + OFFSET]
+ vmovdqu64 ZW10, [IA2 + OFFSET]
+ vmovdqu64 ZW11, [INP0 + OFFSET]
+ vmovdqu64 ZW12, [INP1 + OFFSET]
+ vmovdqu64 ZW13, [INP2 + OFFSET]
+ vmovdqu64 ZW14, [INP3 + OFFSET]
+ vmovdqu64 ZW15, [INP4 + OFFSET]
+
+ ;; Transpose input
+ TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+%ifnidn %%DES_DOCSIS, 3DES
+ ;; DES CBC DEC comes here
+ GEN_DES_DEC_CIPHER 8, rsp + _key_sched
+%else
+ ;; 3DES CBC DEC comes here
+ GEN_3DES_DEC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+
+ ;; transpose data on output
+ TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; run stores
+ mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET], ZW0
+ vmovdqu64 [IA1 + OFFSET], ZW1
+ vmovdqu64 [IA2 + OFFSET], ZW2
+ vmovdqu64 [INP0 + OFFSET], ZW3
+ vmovdqu64 [INP1 + OFFSET], ZW4
+ vmovdqu64 [INP2 + OFFSET], ZW5
+ vmovdqu64 [INP3 + OFFSET], ZW6
+ vmovdqu64 [INP4 + OFFSET], ZW7
+
+ mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET], ZW8
+ vmovdqu64 [IA1 + OFFSET], ZW9
+ vmovdqu64 [IA2 + OFFSET], ZW10
+ vmovdqu64 [INP0 + OFFSET], ZW11
+ vmovdqu64 [INP1 + OFFSET], ZW12
+ vmovdqu64 [INP2 + OFFSET], ZW13
+ vmovdqu64 [INP3 + OFFSET], ZW14
+ vmovdqu64 [INP4 + OFFSET], ZW15
+
+ add OFFSET, 64
+ jmp %%_gen_des_dec_loop
+%%_gen_des_dec_loop_end:
+ ;; This is where we check if there is anything less than 64 bytes
+ ;; of message left for processing.
+ mov SIZE, [rsp + _size_save]
+ cmp OFFSET, SIZE
+ jz %%_gen_des_dec_part_end
+ ;; calculate min of bytes_left and 64, convert to qword mask
+ GET_MASK8 IA0 ; IA0 = mask
+
+ kmovw k7, DWORD(IA0)
+ mov [rsp + _mask_save], IA0
+ ;; run masked loads
+ mov IA0, [STATE + _des_args_in + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (7*PTR_SZ)]
+ vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET]
+ vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET]
+ vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET]
+ vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET]
+ vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET]
+ vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET]
+ vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET]
+ vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET]
+
+ mov IA0, [STATE + _des_args_in + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_in + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_in + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_in + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_in + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_in + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_in + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_in + (15*PTR_SZ)]
+ vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET]
+ vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET]
+ vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET]
+ vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET]
+ vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET]
+ vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET]
+ vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET]
+ vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET]
+
+ ;; Transpose input
+ TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; DES CBC DEC comes here
+ mov IA0, [rsp + _mask_save]
+ cmp BYTE(IA0), 0x0f
+ ja %%_gt_4
+ jz %%_blocks_4
+
+ cmp BYTE(IA0), 0x03
+ ja %%_blocks_3
+ jz %%_blocks_2
+ ;; process one block and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 1, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_2:
+ ;; process two blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 2, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_3:
+ ;; process three blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 3, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_4:
+ ;; process four blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 4, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_gt_4:
+ cmp BYTE(IA0), 0x3f
+ ja %%_blocks_7
+ jz %%_blocks_6
+%%_blocks_5:
+ ;; process five blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 5, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_6:
+ ;; process six blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 6, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+ jmp %%_transpose_out
+
+%%_blocks_7:
+ ;; process seven blocks and move to transpose out
+%ifnidn %%DES_DOCSIS, 3DES
+ GEN_DES_DEC_CIPHER 7, rsp + _key_sched
+%else
+ GEN_3DES_DEC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
+%endif
+
+%%_transpose_out:
+ ;; transpose data on output
+ TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
+
+ ;; run masked stores
+ mov IA0, [STATE + _des_args_out + (0*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (1*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (2*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (3*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (4*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (5*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (6*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (7*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET]{k7}, ZW0
+ vmovdqu64 [IA1 + OFFSET]{k7}, ZW1
+ vmovdqu64 [IA2 + OFFSET]{k7}, ZW2
+ vmovdqu64 [INP0 + OFFSET]{k7}, ZW3
+ vmovdqu64 [INP1 + OFFSET]{k7}, ZW4
+ vmovdqu64 [INP2 + OFFSET]{k7}, ZW5
+ vmovdqu64 [INP3 + OFFSET]{k7}, ZW6
+ vmovdqu64 [INP4 + OFFSET]{k7}, ZW7
+
+ mov IA0, [STATE + _des_args_out + (8*PTR_SZ)]
+ mov IA1, [STATE + _des_args_out + (9*PTR_SZ)]
+ mov IA2, [STATE + _des_args_out + (10*PTR_SZ)]
+ mov INP0, [STATE + _des_args_out + (11*PTR_SZ)]
+ mov INP1, [STATE + _des_args_out + (12*PTR_SZ)]
+ mov INP2, [STATE + _des_args_out + (13*PTR_SZ)]
+ mov INP3, [STATE + _des_args_out + (14*PTR_SZ)]
+ mov INP4, [STATE + _des_args_out + (15*PTR_SZ)]
+ vmovdqu64 [IA0 + OFFSET]{k7}, ZW8
+ vmovdqu64 [IA1 + OFFSET]{k7}, ZW9
+ vmovdqu64 [IA2 + OFFSET]{k7}, ZW10
+ vmovdqu64 [INP0 + OFFSET]{k7}, ZW11
+ vmovdqu64 [INP1 + OFFSET]{k7}, ZW12
+ vmovdqu64 [INP2 + OFFSET]{k7}, ZW13
+ vmovdqu64 [INP3 + OFFSET]{k7}, ZW14
+ vmovdqu64 [INP4 + OFFSET]{k7}, ZW15
+%%_gen_des_dec_part_end:
+
+ ;; store IV and update pointers
+ DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
+
+ CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0
+
+ ;; restore stack pointer and registers
+ mov r12, [rsp + _gpr_save + 0*8]
+ mov r13, [rsp + _gpr_save + 1*8]
+ mov r14, [rsp + _gpr_save + 2*8]
+ mov r15, [rsp + _gpr_save + 3*8]
+ mov rsp, [rsp + _rsp_save] ; original SP
+%endmacro
+
+
+;;; ========================================================
+;;; DATA
+
+section .data ; NOTE(review): no write to the tables below is visible in this chunk - a read-only section may fit; confirm against all users first
+default rel ; memory operands default to RIP-relative addressing
+align 64 ; 64-byte alignment, matching the 64-byte size of every constant below
+mask_values: ; 19 constants, each replicated into 16 consecutive dwords (64 bytes per constant)
+ dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
+ dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
+ dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
+ dd 0x04000000, 0x04000000, 0x04000000, 0x04000000
+ dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
+ dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
+ dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
+ dd 0x40240202, 0x40240202, 0x40240202, 0x40240202
+ dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
+ dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
+ dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
+ dd 0x00001110, 0x00001110, 0x00001110, 0x00001110
+ dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
+ dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
+ dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
+ dd 0x01088000, 0x01088000, 0x01088000, 0x01088000
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+ dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
+ dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
+ dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
+ dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
+ dd 0x0081000C, 0x0081000C, 0x0081000C, 0x0081000C
+ dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
+ dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
+ dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
+ dd 0x00000020, 0x00000020, 0x00000020, 0x00000020
+ dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ dd 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
+ dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
+ dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
+ dd 0x00400400, 0x00400400, 0x00400400, 0x00400400
+ dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
+ dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
+ dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
+ dd 0x00000800, 0x00000800, 0x00000800, 0x00000800
+ dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
+ dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
+ dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
+ dd 0x00002000, 0x00002000, 0x00002000, 0x00002000
+ dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
+ dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
+ dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
+ dd 0x00100000, 0x00100000, 0x00100000, 0x00100000
+ dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
+ dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
+ dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
+ dd 0x00004000, 0x00004000, 0x00004000, 0x00004000
+ dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ dd 0x00020000, 0x00020000, 0x00020000, 0x00020000
+ dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
+ dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
+ dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
+ dd 0x02000000, 0x02000000, 0x02000000, 0x02000000
+ dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
+ dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
+ dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
+ dd 0x08000000, 0x08000000, 0x08000000, 0x08000000
+ dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
+ dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
+ dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
+ dd 0x00000080, 0x00000080, 0x00000080, 0x00000080
+ dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
+ dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
+ dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
+ dd 0x20000000, 0x20000000, 0x20000000, 0x20000000
+ dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
+ dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
+ dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
+ dd 0x90000000, 0x90000000, 0x90000000, 0x90000000
+
+align 64 ; keep each 64-byte constant on its own 64-byte boundary
+init_perm_consts: ; 5 bit masks, each replicated into 16 dwords (64 bytes); presumably used by the initial/final permutation code - confirm at the use sites
+ dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
+ dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
+ dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
+ dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f
+ dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
+ dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
+ dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
+ dd 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff
+ dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
+ dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
+ dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
+ dd 0x33333333, 0x33333333, 0x33333333, 0x33333333
+ dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
+ dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
+ dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
+ dd 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff
+ dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
+ dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
+ dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
+ dd 0x55555555, 0x55555555, 0x55555555, 0x55555555
+
+;;; S-Box table: 8 boxes x 64 entries, every entry a 4-bit value (0x0-0xf) stored in a 16-bit word
+align 64
+S_box_flipped: ; 128 bytes per box; NOTE(review): the entry ordering implied by "flipped" is not visible in this chunk - confirm at the lookup code
+ ;; SBOX0
+ dw 0x07, 0x02, 0x0c, 0x0f, 0x04, 0x0b, 0x0a, 0x0c
+ dw 0x0b, 0x07, 0x06, 0x09, 0x0d, 0x04, 0x00, 0x0a
+ dw 0x02, 0x08, 0x05, 0x03, 0x0f, 0x06, 0x09, 0x05
+ dw 0x08, 0x01, 0x03, 0x0e, 0x01, 0x0d, 0x0e, 0x00
+ dw 0x00, 0x0f, 0x05, 0x0a, 0x07, 0x02, 0x09, 0x05
+ dw 0x0e, 0x01, 0x03, 0x0c, 0x0b, 0x08, 0x0c, 0x06
+ dw 0x0f, 0x03, 0x06, 0x0d, 0x04, 0x09, 0x0a, 0x00
+ dw 0x02, 0x04, 0x0d, 0x07, 0x08, 0x0e, 0x01, 0x0b
+ ;; SBOX1
+ dw 0x0f, 0x00, 0x09, 0x0a, 0x06, 0x05, 0x03, 0x09
+ dw 0x01, 0x0e, 0x04, 0x03, 0x0c, 0x0b, 0x0a, 0x04
+ dw 0x08, 0x07, 0x0e, 0x01, 0x0d, 0x02, 0x00, 0x0c
+ dw 0x07, 0x0d, 0x0b, 0x06, 0x02, 0x08, 0x05, 0x0f
+ dw 0x0c, 0x0b, 0x03, 0x0d, 0x0f, 0x0c, 0x06, 0x00
+ dw 0x02, 0x05, 0x08, 0x0e, 0x01, 0x02, 0x0d, 0x07
+ dw 0x0b, 0x01, 0x00, 0x06, 0x04, 0x0f, 0x09, 0x0a
+ dw 0x0e, 0x08, 0x05, 0x03, 0x07, 0x04, 0x0a, 0x09
+ ;; SBOX2
+ dw 0x05, 0x0b, 0x08, 0x0d, 0x06, 0x01, 0x0d, 0x0a
+ dw 0x09, 0x02, 0x03, 0x04, 0x0f, 0x0c, 0x04, 0x07
+ dw 0x00, 0x06, 0x0b, 0x08, 0x0c, 0x0f, 0x02, 0x05
+ dw 0x07, 0x09, 0x0e, 0x03, 0x0a, 0x00, 0x01, 0x0e
+ dw 0x0b, 0x08, 0x04, 0x02, 0x0c, 0x06, 0x03, 0x0d
+ dw 0x00, 0x0b, 0x0a, 0x07, 0x06, 0x01, 0x0f, 0x04
+ dw 0x0e, 0x05, 0x01, 0x0f, 0x02, 0x09, 0x0d, 0x0a
+ dw 0x09, 0x00, 0x07, 0x0c, 0x05, 0x0e, 0x08, 0x03
+ ;; SBOX3
+ dw 0x0e, 0x05, 0x08, 0x0f, 0x00, 0x03, 0x0d, 0x0a
+ dw 0x07, 0x09, 0x01, 0x0c, 0x09, 0x0e, 0x02, 0x01
+ dw 0x0b, 0x06, 0x04, 0x08, 0x06, 0x0d, 0x03, 0x04
+ dw 0x0c, 0x00, 0x0a, 0x07, 0x05, 0x0b, 0x0f, 0x02
+ dw 0x0b, 0x0c, 0x02, 0x09, 0x06, 0x05, 0x08, 0x03
+ dw 0x0d, 0x00, 0x04, 0x0a, 0x00, 0x0b, 0x07, 0x04
+ dw 0x01, 0x0f, 0x0e, 0x02, 0x0f, 0x08, 0x05, 0x0e
+ dw 0x0a, 0x06, 0x03, 0x0d, 0x0c, 0x01, 0x09, 0x07
+ ;; SBOX4
+ dw 0x04, 0x02, 0x01, 0x0f, 0x0e, 0x05, 0x0b, 0x06
+ dw 0x02, 0x08, 0x0c, 0x03, 0x0d, 0x0e, 0x07, 0x00
+ dw 0x03, 0x04, 0x0a, 0x09, 0x05, 0x0b, 0x00, 0x0c
+ dw 0x08, 0x0d, 0x0f, 0x0a, 0x06, 0x01, 0x09, 0x07
+ dw 0x07, 0x0d, 0x0a, 0x06, 0x02, 0x08, 0x0c, 0x05
+ dw 0x04, 0x03, 0x0f, 0x00, 0x0b, 0x04, 0x01, 0x0a
+ dw 0x0d, 0x01, 0x00, 0x0f, 0x0e, 0x07, 0x09, 0x02
+ dw 0x03, 0x0e, 0x05, 0x09, 0x08, 0x0b, 0x06, 0x0c
+ ;; SBOX5
+ dw 0x03, 0x09, 0x00, 0x0e, 0x09, 0x04, 0x07, 0x08
+ dw 0x05, 0x0f, 0x0c, 0x02, 0x06, 0x03, 0x0a, 0x0d
+ dw 0x08, 0x07, 0x0b, 0x00, 0x04, 0x01, 0x0e, 0x0b
+ dw 0x0f, 0x0a, 0x02, 0x05, 0x01, 0x0c, 0x0d, 0x06
+ dw 0x05, 0x02, 0x06, 0x0d, 0x0e, 0x09, 0x00, 0x06
+ dw 0x02, 0x04, 0x0b, 0x08, 0x09, 0x0f, 0x0c, 0x01
+ dw 0x0f, 0x0c, 0x08, 0x07, 0x03, 0x0a, 0x0d, 0x00
+ dw 0x04, 0x03, 0x07, 0x0e, 0x0a, 0x05, 0x01, 0x0b
+ ;; SBOX6
+ dw 0x02, 0x08, 0x0c, 0x05, 0x0f, 0x03, 0x0a, 0x00
+ dw 0x04, 0x0d, 0x09, 0x06, 0x01, 0x0e, 0x06, 0x09
+ dw 0x0d, 0x02, 0x03, 0x0f, 0x00, 0x0c, 0x05, 0x0a
+ dw 0x07, 0x0b, 0x0e, 0x01, 0x0b, 0x07, 0x08, 0x04
+ dw 0x0b, 0x06, 0x07, 0x09, 0x02, 0x08, 0x04, 0x07
+ dw 0x0d, 0x0b, 0x0a, 0x00, 0x08, 0x05, 0x01, 0x0c
+ dw 0x00, 0x0d, 0x0c, 0x0a, 0x09, 0x02, 0x0f, 0x04
+ dw 0x0e, 0x01, 0x03, 0x0f, 0x05, 0x0e, 0x06, 0x03
+ ;; SBOX7
+ dw 0x0b, 0x0e, 0x05, 0x00, 0x06, 0x09, 0x0a, 0x0f
+ dw 0x01, 0x02, 0x0c, 0x05, 0x0d, 0x07, 0x03, 0x0a
+ dw 0x04, 0x0d, 0x09, 0x06, 0x0f, 0x03, 0x00, 0x0c
+ dw 0x02, 0x08, 0x07, 0x0b, 0x08, 0x04, 0x0e, 0x01
+ dw 0x08, 0x04, 0x03, 0x0f, 0x05, 0x02, 0x00, 0x0c
+ dw 0x0b, 0x07, 0x06, 0x09, 0x0e, 0x01, 0x09, 0x06
+ dw 0x0f, 0x08, 0x0a, 0x03, 0x0c, 0x05, 0x07, 0x0a
+ dw 0x01, 0x0e, 0x0d, 0x00, 0x02, 0x0b, 0x04, 0x0d
+
+;;; Used in DOCSIS DES partial block scheduling: 16 x 32-bit values, each equal to 1
+align 64
+vec_ones_32b: ; 16 dwords = 64 bytes
+ dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+
+align 64
+and_eu: ; 16 dwords of 0x3f003f00: a 6-bit mask over the high byte of every 16-bit word
+ dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
+ dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
+ dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
+ dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00
+
+align 64
+and_ed: ; 16 dwords of 0x003f003f: a 6-bit mask over the low byte of every 16-bit word
+ dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+ dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
+
+align 64
+idx_e: ; 64 byte indices (0x00-0x3f); presumably a byte-permute control - confirm at the use site
+ dq 0x0d0c090805040100, 0x0f0e0b0a07060302 ; per 16-byte group: 16-bit words 0,2,4,6 first, then words 1,3,5,7
+ dq 0x1d1c191815141110, 0x1f1e1b1a17161312 ; same pattern, indices offset by 0x10
+ dq 0x2d2c292825242120, 0x2f2e2b2a27262322 ; same pattern, indices offset by 0x20
+ dq 0x3d3c393835343130, 0x3f3e3b3a37363332 ; same pattern, indices offset by 0x30
+
+align 64
+reg_values16bit_7: ; 32 x 16-bit words, each 0x001f (64 bytes); NOTE(review): the "_7" in the label does not match the 0x1f value - confirm intent at the use site
+ dq 0x001f001f001f001f, 0x001f001f001f001f
+ dq 0x001f001f001f001f, 0x001f001f001f001f
+ dq 0x001f001f001f001f, 0x001f001f001f001f
+ dq 0x001f001f001f001f, 0x001f001f001f001f
+
+align 64
+shuffle_reg: ; 64 byte indices (0x00-0x3f); presumably a byte-shuffle control - confirm at the use site
+ dq 0x0705060403010200, 0x0f0d0e0c0b090a08 ; within every dword the byte order is 0,2,1,3 (the two middle bytes are swapped)
+ dq 0x1715161413111210, 0x1f1d1e1c1b191a18 ; same pattern, indices offset by 0x10
+ dq 0x2725262423212220, 0x2f2d2e2c2b292a28 ; same pattern, indices offset by 0x20
+ dq 0x3735363433313230, 0x3f3d3e3c3b393a38 ; same pattern, indices offset by 0x30
+
+;;; ========================================================
+;;; CODE
+section .text
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64 ; entry point starts on a 64-byte boundary
+MKGLOBAL(des_x16_cbc_enc_avx512,function,internal) ; symbol declaration via the project's MKGLOBAL macro (defined outside this chunk)
+des_x16_cbc_enc_avx512:
+ GENERIC_DES_ENC DES ; the whole body is the GENERIC_DES_ENC macro, instantiated with the DES tag (macro defined earlier in the file, not visible here)
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64 ; entry point starts on a 64-byte boundary
+MKGLOBAL(des_x16_cbc_dec_avx512,function,internal) ; symbol declaration via the project's MKGLOBAL macro (defined outside this chunk)
+des_x16_cbc_dec_avx512:
+ GENERIC_DES_DEC DES ; tag other than 3DES: the visible tail of the macro assembles GEN_DES_DEC_CIPHER with the single key schedule
+ ret ; the macro tail has already reloaded r12-r15 and the original rsp
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64 ; entry point starts on a 64-byte boundary
+MKGLOBAL(des3_x16_cbc_enc_avx512,function,internal) ; symbol declaration via the project's MKGLOBAL macro (defined outside this chunk)
+des3_x16_cbc_enc_avx512:
+ GENERIC_DES_ENC 3DES ; the whole body is the GENERIC_DES_ENC macro, instantiated with the 3DES tag (macro defined earlier in the file, not visible here)
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64 ; entry point starts on a 64-byte boundary
+MKGLOBAL(des3_x16_cbc_dec_avx512,function,internal) ; symbol declaration via the project's MKGLOBAL macro (defined outside this chunk)
+des3_x16_cbc_dec_avx512:
+ GENERIC_DES_DEC 3DES ; 3DES tag: the visible tail of the macro assembles GEN_3DES_DEC_CIPHER with three key schedules
+ ret ; the macro tail has already reloaded r12-r15 and the original rsp
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64 ; entry point starts on a 64-byte boundary
+MKGLOBAL(docsis_des_x16_enc_avx512,function,internal) ; symbol declaration via the project's MKGLOBAL macro (defined outside this chunk)
+docsis_des_x16_enc_avx512:
+ GENERIC_DES_ENC DOCSIS ; the whole body is the GENERIC_DES_ENC macro, instantiated with the DOCSIS tag (macro defined earlier in the file, not visible here)
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : size in bytes
+align 64 ; entry point starts on a 64-byte boundary
+MKGLOBAL(docsis_des_x16_dec_avx512,function,internal) ; symbol declaration via the project's MKGLOBAL macro (defined outside this chunk)
+docsis_des_x16_dec_avx512:
+ GENERIC_DES_DEC DOCSIS ; tag other than 3DES: the visible tail of the macro assembles GEN_DES_DEC_CIPHER with the single key schedule
+ ret ; the macro tail has already reloaded r12-r15 and the original rsp
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits ; empty marker section: GNU toolchain convention declaring that this object needs no executable stack
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm128_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm128_avx512.asm
new file mode 100644
index 000000000..f9f643b40
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm128_avx512.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1 ; set before the include; presumably selects the 128-bit key build of the shared source - confirm in gcm_avx512.asm
+%include "avx512/gcm_avx512.asm" ; all code for this file comes from the shared implementation
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm128_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm128_vaes_avx512.asm
new file mode 100644
index 000000000..2465b22dd
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm128_vaes_avx512.asm
@@ -0,0 +1,32 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1 ; set before the include; presumably selects the 128-bit key build of the shared source - confirm in gcm_vaes_avx512.asm
+;; single buffer implementation
+%include "avx512/gcm_vaes_avx512.asm" ; all code for this file comes from the shared VAES implementation
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm192_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm192_avx512.asm
new file mode 100644
index 000000000..403ab2f7c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm192_avx512.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM192_MODE 1 ; set before the include; presumably selects the 192-bit key build of the shared source - confirm in gcm_avx512.asm
+%include "avx512/gcm_avx512.asm" ; all code for this file comes from the shared implementation
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm192_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm192_vaes_avx512.asm
new file mode 100644
index 000000000..348190a2a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm192_vaes_avx512.asm
@@ -0,0 +1,32 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM192_MODE 1 ; set before the include; presumably selects the 192-bit key build of the shared source - confirm in gcm_vaes_avx512.asm
+;; single buffer implementation
+%include "avx512/gcm_vaes_avx512.asm" ; all code for this file comes from the shared VAES implementation
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm256_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm256_avx512.asm
new file mode 100644
index 000000000..141b4b9ca
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm256_avx512.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1 ; set before the include; presumably selects the 256-bit key build of the shared source - confirm in gcm_avx512.asm
+%include "avx512/gcm_avx512.asm" ; all code for this file comes from the shared implementation
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm256_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm256_vaes_avx512.asm
new file mode 100644
index 000000000..4daa1b361
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm256_vaes_avx512.asm
@@ -0,0 +1,32 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1 ; set before the include; presumably selects the 256-bit key build of the shared source - confirm in gcm_vaes_avx512.asm
+;; single buffer implementation
+%include "avx512/gcm_vaes_avx512.asm" ; all code for this file comes from the shared VAES implementation
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm_avx512.asm
new file mode 100644
index 000000000..db940ffe9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm_avx512.asm
@@ -0,0 +1,3536 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+; Tomasz Kantecki
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in the paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+; The details of the implementation are explained in:
+; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+;  0                   1                   2                   3
+;  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; |                      Salt (From the SA)                       |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; |                     Initialization Vector                     |
+; |        (This is the sequence number from IPSec header)        |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; |                              0x1                              |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[2] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+;  0                   1                   2                   3
+;  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; |                           SPI (A1)                            |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; |                  32-bit Sequence Number (A0)                  |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; |                              0x0                              |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+;  0                   1                   2                   3
+;  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; |                           SPI (A2)                            |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; |            64-bit Extended Sequence Number {A1,A0}            |
+; |                                                               |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; |                              0x0                              |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; By the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any aadLen length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "include/clear_regs.asm"
+%include "include/gcm_defines.asm"
+%include "include/gcm_keys_avx2_avx512.asm"
+
+%include "mb_mgr_datastruct.asm"
+%include "job_aes_hmac.asm"
+%include "include/memcpy.asm"
+
+;; The key size is chosen at assembly time: GCM128_MODE, GCM192_MODE or
+;; GCM256_MODE must already be defined when this file is processed
+;; (presumably by a small per-key-size wrapper that %includes it - confirm
+;; against the build). Stop with an error if none of them is set.
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx512.asm!"
+%endif
+%endif
+%endif
+
+;; Decide on AES-GCM key size to compile for
+;;
+;; NROUNDS      - number of vaesenc rounds that the cipher macros issue between
+;;                the round-0 key XOR and the final vaesenclast
+;;                (9 / 11 / 13 for 128 / 192 / 256-bit keys).
+;; FN_NAME(x,y) - token-pastes a symbol name of the form
+;;                aes_gcm_<x>_<keybits><y>avx512; no separators are inserted,
+;;                so <y> has to carry any underscores it needs.
+%ifdef GCM128_MODE
+%define NROUNDS 9
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx512
+%endif
+
+%ifdef GCM192_MODE
+%define NROUNDS 11
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx512
+%endif
+
+%ifdef GCM256_MODE
+%define NROUNDS 13
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx512
+%endif
+
+section .text
+default rel
+
+; Stack frame layout constants.
+;
+; STACK_OFFSET: size of the 4 general-purpose registers (8 bytes each) that
+; are pushed onto the stack to preserve them.
+%define STACK_OFFSET 8*4
+
+; XMM_STORAGE: extra frame space reserved on win64 only (10 x 16 bytes).
+; NOTE(review): presumably the save area for xmm6-xmm15, which are
+; callee-saved in the Microsoft x64 ABI - confirm against the save/restore code.
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+; TMP2..TMP8 are byte offsets of 16-byte scratch slots; they are used as
+; [rsp + TMPn] by the cipher macros.
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+; LOCAL_STORAGE: total size of the seven scratch slots above.
+%define LOCAL_STORAGE 16*7
+; VARIABLE_OFFSET: scratch slots plus the optional win64 XMM area.
+; NOTE(review): the expansion is not parenthesized, so it is only safe in
+; purely additive contexts (e.g. "sub rsp, VARIABLE_OFFSET").
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL: one GHASH multiplication, Data*HashKey mod (128,127,126,121,0)
+;
+; Arguments (all xmm registers):
+;   %1  in/out  A, 128 bits, bit-reflected; overwritten with the result
+;   %2  in      B (hash key), 128 bits, bit-reflected; not modified
+;   %3-%5       temporaries, clobbered
+;   %6-%7       accepted for call compatibility, not referenced by the body
+;
+; Result: %1 = A*B*x mod poly (i.e. >>1).
+; To compute GH = GH*HashKey mod poly, pass HK = HashKey<<1 mod poly as %2:
+; GH * HK * x mod poly is then equivalent to GH*HashKey mod poly.
+;
+; Reads the POLY2 constant from memory. No general-purpose register or flag
+; is written.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%ACC %1 ; in: A, out: product (16 Bytes)
+%define %%KEY %2 ; in: B (16 Bytes)
+%define %%HI %3 ; high 128 bits of the 256-bit product
+%define %%TA %4 ; scratch
+%define %%TB %5 ; scratch, later holds POLY2
+%define %%SPARE1 %6 ; unused
+%define %%SPARE2 %7 ; unused
+
+ ;; Schoolbook 128x128 carry-less multiply: four 64x64 partial products
+ vpclmulqdq %%HI, %%ACC, %%KEY, 0x11 ; HI = a1*b1
+ vpclmulqdq %%TA, %%ACC, %%KEY, 0x00 ; TA = a0*b0
+ vpclmulqdq %%TB, %%ACC, %%KEY, 0x01 ; TB = a1*b0
+ vpclmulqdq %%ACC, %%ACC, %%KEY, 0x10 ; ACC = a0*b1
+ vpxor %%ACC, %%ACC, %%TB ; ACC = middle term (a1*b0 + a0*b1)
+
+ ;; Split the middle term across the two halves of the product
+ vpsrldq %%TB, %%ACC, 8 ; upper 64 bits of the middle term
+ vpslldq %%ACC, %%ACC, 8 ; lower 64 bits of the middle term
+
+ vpxor %%HI, %%HI, %%TB ; HI:ACC now holds the full 256-bit product
+ vpxor %%ACC, %%ACC, %%TA
+
+ ;; Reduction, phase 1
+ vmovdqu %%TB, [rel POLY2]
+
+ vpclmulqdq %%TA, %%TB, %%ACC, 0x01
+ vpslldq %%TA, %%TA, 8 ; move left by 2 DWs
+
+ vpxor %%ACC, %%ACC, %%TA ; phase 1 complete
+
+ ;; Reduction, phase 2
+ vpclmulqdq %%TA, %%TB, %%ACC, 0x00
+ vpsrldq %%TA, %%TA, 4 ; right by 1 DW only (gives the 2-DW right shift overall)
+
+ vpclmulqdq %%ACC, %%TB, %%ACC, 0x10
+ vpslldq %%ACC, %%ACC, 4 ; left by 1 DW (result ends up unshifted)
+
+ vpxor %%ACC, %%ACC, %%TA ; phase 2 complete
+
+ ;; Fold in the high half: final result is left in ACC (%1)
+ vpxor %%ACC, %%ACC, %%HI
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PRECOMPUTE: derive the hash key powers 2..8 and store them in the key data.
+;
+; Arguments:
+;   %1     pointer (GP register) to the key data holding the HashKey_n slots
+;   %2     xmm register with HashKey<<1 mod poly; not modified
+;   %3-%8  xmm temporaries, clobbered (%7 carries the running power)
+;
+; Starting from HashKey, each step multiplies the running value by HashKey
+; with GHASH_MUL and stores it, so that on exit
+;   [%1 + HashKey_n] = HashKey^n<<1 mod poly   for n = 2..8
+;
+; NOTE(review): earlier commentary on this macro mentioned Hashkey_i_k values
+; (XOR of the low and high halves of each power) being filled here; this
+; macro performs no such stores.
+;
+; Side effect: (re)assigns the assembly-time counter hk_pow.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%TMP_A %3
+%define %%TMP_B %4
+%define %%TMP_C %5
+%define %%TMP_D %6
+%define %%HK_POW %7
+%define %%TMP_E %8
+
+ vmovdqa %%HK_POW, %%HK ; running power starts at HashKey^1
+
+%assign hk_pow 2
+%rep 7
+ ;; HK_POW = HashKey^hk_pow<<1 mod poly
+ GHASH_MUL %%HK_POW, %%HK, %%TMP_A, %%TMP_C, %%TMP_D, %%TMP_E, %%TMP_B
+ vmovdqu [%%GDATA + HashKey_ %+ hk_pow], %%HK_POW
+%assign hk_pow (hk_pow + 1)
+%endrep
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: load fewer than 16 bytes into an xmm register without
+; reading past the end of the data.
+;
+; Arguments:
+;   %1  out  destination register (its XWORD form is written)
+;   %2  in   GP register pointing at the data
+;   %3  in   number of bytes to read
+;   %4       GP scratch register, clobbered
+;
+; The length selects a 16-bit entry of byte_len_to_mask_table (2 bytes per
+; entry); that mask drives a zero-masked byte load, so every byte lane not
+; selected by the mask is 0 and a length of 0 yields an all-zero register.
+;
+; Clobbers: %4, k1 (and flags on win64, where ADD is used).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 4
+%define %%DST %1 ; xmm register receiving the data
+%define %%SRC %2
+%define %%NBYTES %3
+%define %%TBL %4
+
+ lea %%TBL, [rel byte_len_to_mask_table]
+%ifidn __OUTPUT_FORMAT__, win64
+ ;; TBL += 2 * NBYTES, done as two adds
+ ;; NOTE(review): presumably because the length may be a memory operand
+ ;; on win64 and so cannot be used as a scaled index - confirm at call sites
+ add %%TBL, %%NBYTES
+ add %%TBL, %%NBYTES
+ kmovw k1, [%%TBL]
+%else
+ kmovw k1, [%%TBL + %%NBYTES*2]
+%endif
+ ;; masked load: only the bytes selected by k1 are read, the rest are zeroed
+ vmovdqu8 XWORD(%%DST){k1}{z}, [%%SRC]
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length in bytes (A_LEN), and the
+;        key data holding the hash key powers HashKey .. HashKey_8 (GDATA_KEY).
+; Output: The hash of the data (AAD_HASH). AAD_HASH is zeroed on entry.
+;
+; Processing order:
+;   1. while at least 128 bytes remain: hash 8 blocks with HashKey_8..HashKey_1
+;      and reduce once;
+;   2. if 1 to 7 whole blocks remain: hash them with HashKey_n..HashKey_1
+;      (n = number of blocks) and reduce once;
+;   3. if 1 to 15 bytes remain (or A_LEN is 0): zero-pad via
+;      READ_SMALL_DATA_INPUT and do a single GHASH_MUL with HashKey.
+;
+; Clobbers: XTMP0-XTMP5, T1-T3, k1 (via READ_SMALL_DATA_INPUT), flags.
+;
+; NOTE(review): the length comparisons below are signed (jl/jge); this matches
+; an unsigned length only while it is below 2^63.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 13
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%GDATA_KEY %4
+%define %%XTMP0 %5 ; xmm temp: current (byte-reflected) data block
+%define %%XTMP1 %6 ; xmm temp: accumulated high products (a1*b1)
+%define %%XTMP2 %7 ; xmm temp: accumulated low products (a0*b0)
+%define %%XTMP3 %8 ; xmm temp: accumulated middle products (a1*b0 + a0*b1)
+%define %%XTMP4 %9 ; xmm temp: scratch
+%define %%XTMP5 %10 ; xmm temp: current hash key power / POLY2
+%define %%T1 %11 ; GP temp: read pointer into the AAD
+%define %%T2 %12 ; GP temp: bytes of AAD still to hash
+%define %%T3 %13 ; GP temp: hash key pointer / scratch
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+%%_get_AAD_loop128:
+ cmp %%T2, 128
+ jl %%_exit_AAD_loop128
+
+ vmovdqu %%XTMP0, [%%T1 + 16*0]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vpxor %%XTMP0, %%AAD_HASH ; fold the running hash into the first block
+
+ vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
+ vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = a1*b1
+ vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = a0*b0
+ vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = a1*b0
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%XTMP4 = a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%XTMP3 = a1*b0 + a0*b1
+
+ ;; blocks 1..7 are multiplied by HashKey_7..HashKey_1 and accumulated
+%assign i 1
+%assign j 7
+%rep 7
+ vmovdqu %%XTMP0, [%%T1 + 16*i]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = %%XTMP1 + a1*b1
+ vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = %%XTMP2 + a0*b0
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = %%XTMP3 + a1*b0 + a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+%assign i (i + 1)
+%assign j (j - 1)
+%endrep
+
+ vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
+ vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+ vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%XTMP1(high):%%XTMP2(low)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%XTMP5, [rel POLY2] ; unaligned-safe load, as in GHASH_MUL
+ vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+ vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L %%XTMP0 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+ vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+ vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%AAD_HASH
+
+ sub %%T2, 128
+ je %%_CALC_AAD_done
+
+ add %%T1, 128
+ jmp %%_get_AAD_loop128
+
+%%_exit_AAD_loop128:
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+ ;; calculate hash_key position to start with:
+ ;; T3 = GDATA_KEY + HashKey_1 + 16 - 16*(number of whole blocks)
+ ;; T3 is then advanced by 16 per block, so the last whole block is
+ ;; multiplied by the entry at HashKey_1. This relies on the hash key
+ ;; powers being stored 16 bytes apart with HashKey_1 at the highest offset.
+ mov %%T3, %%T2
+ and %%T3, -16 ; 1 to 7 blocks possible here
+ neg %%T3
+ add %%T3, HashKey_1 + 16
+ lea %%T3, [%%GDATA_KEY + %%T3]
+
+ vmovdqu %%XTMP0, [%%T1]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vpxor %%XTMP0, %%AAD_HASH ; fold the running hash into the first block
+
+ vmovdqu %%XTMP5, [%%T3]
+ vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = a1*b1
+ vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = a0*b0
+ vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = a1*b0
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%XTMP4 = a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4 ; %%XTMP3 = a1*b0 + a0*b1
+
+ add %%T3, 16 ; move to next hashkey
+ add %%T1, 16 ; move to next data block
+ sub %%T2, 16
+ cmp %%T2, 16
+ jl %%_AAD_reduce
+
+%%_AAD_blocks:
+ vmovdqu %%XTMP0, [%%T1]
+ vpshufb %%XTMP0, [rel SHUF_MASK]
+
+ vmovdqu %%XTMP5, [%%T3]
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = %%XTMP1 + a1*b1
+ vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = %%XTMP2 + a0*b0
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = %%XTMP3 + a1*b0 + a0*b1
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+ vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+ vpxor %%XTMP3, %%XTMP3, %%XTMP4
+
+ add %%T3, 16 ; move to next hashkey
+ add %%T1, 16
+ sub %%T2, 16
+ cmp %%T2, 16
+ jge %%_AAD_blocks ; another whole block left; otherwise fall through to the reduction
+
+%%_AAD_reduce:
+ vpslldq %%XTMP4, %%XTMP3, 8 ; shift-L 2 DWs
+ vpsrldq %%XTMP3, %%XTMP3, 8 ; shift-R 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP4
+ vpxor %%XTMP1, %%XTMP1, %%XTMP3 ; accumulate the results in %%XTMP1(high):%%XTMP2(low)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%XTMP5, [rel POLY2] ; unaligned-safe load, as in GHASH_MUL
+ vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+ vpslldq %%XTMP0, %%XTMP0, 8 ; shift-L %%XTMP0 2 DWs
+ vpxor %%XTMP2, %%XTMP2, %%XTMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+ vpsrldq %%XTMP3, %%XTMP3, 4 ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+ vpslldq %%XTMP4, %%XTMP4, 4 ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%XTMP4, %%XTMP4, %%XTMP3 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%AAD_HASH, %%XTMP1, %%XTMP4 ; the result is in %%AAD_HASH
+
+ test %%T2, %%T2 ; any trailing bytes (< 16) left?
+ je %%_CALC_AAD_done
+
+%%_get_small_AAD_block:
+ ;; 0 to 15 bytes remain: zero-padded load, then one multiply by HashKey
+ vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
+; the running hash (AAD_HASH, xmm register, updated in place)
+; and whether encoding or decoding (ENC_DEC, the literal token ENC or DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; (PBlockLen and AadHash are written); DATA_OFFSET is advanced by the number
+; of bytes written out.
+; Does nothing if the context has no pending partial block (PBlockLen == 0).
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13,
+; k1 and flags
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+
+ mov r13, [%%GDATA_CTX + PBlockLen] ; r13 = bytes already held in the partial block
+ test r13, r13
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ ;; NOTE(review): signed compare on a length; equivalent to unsigned below 2^63
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;With 16 or more bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax
+
+%%_data_read: ;Finished reading in data
+
+ vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [rel SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1 ; keep the ciphertext: it is what gets hashed when decrypting
+%endif
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15 ; r15 < 0 here: block stays incomplete, move the mask pointer by the shortfall
+%%_no_extra_mask:
+
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+%ifidn %%ENC_DEC, DEC
+ vpand xmm3, xmm1
+ vpshufb xmm3, [rel SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+%else
+ vpshufb xmm9, [rel SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+%endif
+ test r15, r15 ; r15 < 0: the block is still not full after this call
+ jl %%_partial_incomplete
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax ; block completed: no partial bytes pending any more
+ jmp %%_enc_dec_done
+%%_partial_incomplete:
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], rax
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_enc_dec_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%ifidn %%ENC_DEC, ENC
+ vpshufb xmm9, [rel SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ test r15, r15
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN ; block not completed: every input byte is written out
+%%_count_set:
+ lea rax, [rel byte_len_to_mask_table]
+ kmovw k1, [rax + r13*2] ; k1 = byte mask selecting the first r13 bytes
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, xmm9
+ add %%DATA_OFFSET, r13
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_SINGLE_MUL: multiply one block by one hash key power and either start
+; or extend three partial-product accumulators. No reduction is done here.
+;
+; Arguments:
+;   %1  in      GP register, base pointer of the key data
+;   %2  in      offset of the hash key power to use (added to %1)
+;   %3  in      xmm register with the block to hash; not modified
+;   %4  in/out  xmm accumulator for a1*b1
+;   %5  in/out  xmm accumulator for a0*b0
+;   %6  in/out  xmm accumulator for a1*b0 + a0*b1
+;   %7          xmm scratch, receives the loaded hash key
+;   %8          xmm scratch, receives individual products
+;   %9  in      the token "first" overwrites the accumulators with this
+;               block's products; any other token XORs the products into them
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_SINGLE_MUL 9
+%define %%KEYS %1
+%define %%HK_OFF %2
+%define %%BLK %3
+%define %%ACC_HI %4
+%define %%ACC_LO %5
+%define %%ACC_MID %6
+%define %%HK %7
+%define %%PROD %8
+%define %%MODE %9
+
+ vmovdqu %%HK, [%%KEYS + %%HK_OFF]
+%ifnidn %%MODE, first
+ ;; accumulate: acc ^= product, one product at a time
+ vpclmulqdq %%PROD, %%BLK, %%HK, 0x11 ; a1*b1
+ vpxor %%ACC_HI, %%ACC_HI, %%PROD
+
+ vpclmulqdq %%PROD, %%BLK, %%HK, 0x00 ; a0*b0
+ vpxor %%ACC_LO, %%ACC_LO, %%PROD
+
+ vpclmulqdq %%PROD, %%BLK, %%HK, 0x01 ; a1*b0
+ vpxor %%ACC_MID, %%ACC_MID, %%PROD
+
+ vpclmulqdq %%PROD, %%BLK, %%HK, 0x10 ; a0*b1
+ vpxor %%ACC_MID, %%ACC_MID, %%PROD
+%else
+ ;; first block: products are written straight into the accumulators
+ vpclmulqdq %%ACC_HI, %%BLK, %%HK, 0x11 ; ACC_HI = a1*b1
+ vpclmulqdq %%ACC_LO, %%BLK, %%HK, 0x00 ; ACC_LO = a0*b0
+ vpclmulqdq %%ACC_MID, %%BLK, %%HK, 0x01 ; ACC_MID = a1*b0
+ vpclmulqdq %%PROD, %%BLK, %%HK, 0x10 ; PROD = a0*b1
+ vpxor %%ACC_MID, %%ACC_MID, %%PROD ; ACC_MID = a1*b0 + a0*b1
+%endif
+
+%endmacro
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; Updated AAD_HASH is returned in %%T3
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA_KEY %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%T2 %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ ;; Move AAD_HASH to temp reg
+ vmovdqu %%T2, %%XMM8
+ ;; Start AES for %%num_initial_blocks blocks
+ ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+%if(%%num_initial_blocks>0)
+vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%endif ; %if(%%num_initial_blocks>0)
+
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; Write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+%assign i (i+1)
+%endrep
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%assign i (9-%%num_initial_blocks)
+%if(%%num_initial_blocks>0)
+ vmovdqa %%T3, reg(i)
+%assign i (i+1)
+%endif
+%if %%num_initial_blocks>1
+%rep %%num_initial_blocks-1
+ vmovdqu [rsp + TMP %+ i], reg(i)
+%assign i (i+1)
+%endrep
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Prepare 8 counter blocks and perform rounds of AES cipher on
+ ;; them, load plain/cipher text and store cipher/plain text.
+ ;; Stitch GHASH computation in between AES rounds.
+ vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR Y0
+ vpaddd %%XMM2, %%CTR, [rel TWO] ; INCR Y0
+ vpaddd %%XMM3, %%XMM1, [rel TWO] ; INCR Y0
+ vpaddd %%XMM4, %%XMM2, [rel TWO] ; INCR Y0
+ vpaddd %%XMM5, %%XMM3, [rel TWO] ; INCR Y0
+ vpaddd %%XMM6, %%XMM4, [rel TWO] ; INCR Y0
+ vpaddd %%XMM7, %%XMM5, [rel TWO] ; INCR Y0
+ vpaddd %%XMM8, %%XMM6, [rel TWO] ; INCR Y0
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+ vpxor %%XMM1, %%XMM1, %%T_key
+ vpxor %%XMM2, %%XMM2, %%T_key
+ vpxor %%XMM3, %%XMM3, %%T_key
+ vpxor %%XMM4, %%XMM4, %%T_key
+ vpxor %%XMM5, %%XMM5, %%T_key
+ vpxor %%XMM6, %%XMM6, %%T_key
+ vpxor %%XMM7, %%XMM7, %%T_key
+ vpxor %%XMM8, %%XMM8, %%T_key
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+
+%define %%T4_2 %%T4
+%if(%%num_initial_blocks>0)
+ ;; Hash in AES state
+ ;; T2 - incoming AAD hash
+ vpxor %%T2, %%T3
+
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*1]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*2]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>1)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*3]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*4]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>2)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>3)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*5]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*6]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>4)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*7]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*8]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>5)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*9]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%ifndef GCM128_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*10]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>6)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%ifdef GCM128_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*10]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif
+
+%ifdef GCM192_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*11]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*12]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif
+%ifdef GCM256_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*11]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*12]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>7)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%ifdef GCM256_MODE ; GCM256
+ vmovdqu %%T_key, [%%GDATA_KEY+16*13]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*14]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif ; GCM256 mode
+
+%if(%%num_initial_blocks>0)
+ vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
+ vpxor %%T4, %%T6, %%T4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; First phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ ;; First phase of the reduction complete
+ vpxor %%T4, %%T4, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T4, 0x00
+ ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+ vpsrldq %%T2, %%T2, 4
+
+ vpclmulqdq %%T4, %%T3, %%T4, 0x10
+ ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+ vpslldq %%T4, %%T4, 4
+ ;; Second phase of the reduction complete
+ vpxor %%T4, %%T4, %%T2
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; The result is in %%T3
+ vpxor %%T3, %%T1, %%T4
+%else
+ ;; The hash should end up in T3
+ vmovdqa %%T3, %%T2
+%endif
+
+ ;; Final hash is now in T3
+%if %%num_initial_blocks > 0
+ ;; NOTE: obsolete in case %%num_initial_blocks = 0
+ sub %%LENGTH, 16*%%num_initial_blocks
+%endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+%if %%num_initial_blocks > 0
+ ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0
+ ;; This macro is executed for length 128 and up,
+ ;; zero length is checked in GCM_ENC_DEC.
+ ;; If the last block is partial then the xor will be done later
+ ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
+ ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128
+ cmp %%LENGTH, 128
+ jl %%_initial_skip_last_word_write
+%endif
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ ;; Update %%LENGTH with the number of blocks processed
+ sub %%LENGTH, 16
+ add %%DATA_OFFSET, 16
+%%_initial_skip_last_word_write:
+ sub %%LENGTH, 128-16
+ add %%DATA_OFFSET, 128-16
+
+ vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
+ ;; Combine GHASHed value with the corresponding ciphertext
+ vpxor %%XMM1, %%XMM1, %%T3
+ vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+;;; INITIAL_BLOCKS macro with support for a partial final block: encrypts/decrypts %%num_initial_blocks counter blocks and GHASHes them, leaving the hash in %%T3.
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count. Side effects visible below: stores [%%GDATA_CTX + PBlockLen] (and PBlockEncKey on the <16B path), adds 16 to %%DATA_OFFSET per full block, reduces %%LENGTH, and sets r12 (1 = reduction needed, 0 = not).
+%macro INITIAL_BLOCKS_PARTIAL 25
+%define %%GDATA_KEY %1 ; base for the round keys (16*n) and the HashKey_n offsets
+%define %%GDATA_CTX %2 ; PBlockLen / PBlockEncKey are written relative to this
+%define %%CYPH_PLAIN_OUT %3 ; output buffer base
+%define %%PLAIN_CYPH_IN %4 ; input buffer base
+%define %%LENGTH %5 ; byte count, decremented as blocks are consumed
+%define %%DATA_OFFSET %6 ; byte offset added to both buffer bases
+%define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0); the %if guards below also cater for 8
+%define %%T1 %8 ; scratch; accumulates the a1*b1 products
+%define %%T2 %9 ; scratch; starts as a copy of the incoming hash
+%define %%T3 %10 ; [out] hash value
+%define %%T4 %11 ; scratch; accumulates the a0*b0 products
+%define %%T5 %12 ; scratch; holds the current HashKey_k
+%define %%CTR %13 ; counter block, incremented by ONE for every block
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21 ; [in] hash value
+%define %%T6 %22 ; scratch; accumulates the a1*b0 and a0*b1 products
+%define %%T_key %23 ; scratch; holds the current AES round key
+%define %%ENC_DEC %24 ; compared against DEC below
+%define %%INSTANCE_TYPE %25 ; compared against multi_call below
+
+ ;; Move AAD_HASH to temp reg
+ vmovdqu %%T2, %%XMM8
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ ;; Compute AES counters (one per block, placed in reg(9-n) .. reg(8))
+ vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*0] ; round key 0
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ ; Start AES for %%num_initial_blocks blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+vmovdqu %%T_key, [%%GDATA_KEY+16*j] ; round key j, applied to every block
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j] ; j = NROUNDS+1 here: the final round key
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Encrypt all but the last block of data (hashing happens further down)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks-1
+ ;; Encrypt the message for all but the last block
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+%ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1 ; when decrypting, keep the input block (it is what gets hashed)
+%endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+%assign i (i+1)
+%endrep
+
+%if %%num_initial_blocks > 1
+ ;; The final block of data may be <16B
+ sub %%LENGTH, 16*(%%num_initial_blocks-1)
+%endif
+
+%if %%num_initial_blocks < 8
+ ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8.
+ ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128.
+ cmp %%LENGTH, 16
+ jl %%_small_initial_partial_block
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle a full length final block - encrypt and hash all blocks
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ sub %%LENGTH, 16
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH ; record the bytes left after this full block
+
+ ;; Encrypt the message
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+%ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+%endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+
+ ;; Hash all of the data
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+
+%if(%%num_initial_blocks>last_block_to_hash)
+ ;; Hash in AES state
+ vpxor %%T2, reg(j)
+
+ ;; T2 - incoming AAD hash
+ ;; reg(i) holds ciphertext
+ ;; T5 - hash key
+ ;; T6 - updated xor
+ ;; reg(1)/xmm1 should now be available for tmp use
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%assign rep_count (%%num_initial_blocks-1)
+%rep rep_count
+
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T3, reg(j), %%T5, 0x11
+ vpxor %%T1, %%T1, %%T3 ; accumulate a1*b1
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x00
+ vpxor %%T4, %%T4, %%T3 ; accumulate a0*b0
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3 ; accumulate a1*b0
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3 ; accumulate a0*b1
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%endrep
+
+ ;; Record that a reduction is needed
+ mov r12, 1
+
+ jmp %%_small_initial_compute_hash
+
+
+%endif ; %if %%num_initial_blocks < 8
+
+%%_small_initial_partial_block:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle ghash for a <16B final block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; In this case if it's a single call to encrypt we can
+ ;; hash all of the data but if it's an init / update / finalize
+ ;; series of call we need to leave the last block if it's
+ ;; less than a full block of data.
+
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i) ; save the last block's AES output before it is combined with data
+ ;; Handle a partial final block
+ ;; GDATA, KEY, T1, T2
+ ;; r13 - length
+ ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long
+ ;; NOTE: could be replaced with %%LENGTH but at this point
+ ;; %%LENGTH is always less than 16.
+ ;; No PLAIN_CYPH_LEN argument available in this macro.
+ ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET
+ vpshufb reg(i), [rel SHUF_MASK]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks-1)
+%assign last_block_to_hash 1
+%else
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+%endif
+
+%if(%%num_initial_blocks>last_block_to_hash)
+ ;; Record that a reduction is needed
+ mov r12, 1
+ ;; Hash in AES state
+ vpxor %%T2, reg(j)
+
+ ;; T2 - incoming AAD hash
+ ;; reg(i) holds ciphertext
+ ;; T5 - hash key
+ ;; T6 - updated xor
+ ;; reg(1)/xmm1 should now be available for tmp use
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+%else
+ ;; Record that a reduction is not needed -
+ ;; In this case no hashes are computed because there
+ ;; is only one initial block and it is < 16B in length.
+ xor r12, r12
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign rep_count (%%num_initial_blocks-2)
+%%_multi_call_hash:
+%else
+%assign rep_count (%%num_initial_blocks-1)
+%endif
+
+%if rep_count < 0
+ ;; fix for negative rep_count
+%assign rep_count 0
+%endif
+
+%rep rep_count
+
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T3, reg(j), %%T5, 0x11
+ vpxor %%T1, %%T1, %%T3 ; accumulate a1*b1
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x00
+ vpxor %%T4, %%T4, %%T3 ; accumulate a0*b0
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3 ; accumulate a1*b0
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3 ; accumulate a0*b1
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%endrep
+
+%%_small_initial_compute_hash:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Ghash reduction
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if(%%num_initial_blocks=1)
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; We only need to check if a reduction is needed if
+ ;; initial_blocks == 1 and init/update/final is being used.
+ ;; In this case we may just have a partial block, and that
+ ;; gets hashed in finalize.
+ ;; cmp r12, 0
+ or r12, r12
+ je %%_no_reduction_needed
+%endif
+%endif
+
+ vpsrldq %%T3, %%T6, 8 ; %%T3 = %%T6 shifted right 2 DWs
+ vpslldq %%T6, %%T6, 8 ; shift-L %%T6 2 DWs
+ vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
+ vpxor %%T4, %%T6, %%T4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; First phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x01
+ ;; shift-L %%T2 2 DWs
+ vpslldq %%T2, %%T2, 8
+ vpxor %%T4, %%T4, %%T2
+
+ ;; First phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Second phase of the reduction
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x00
+ ;; Shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+ vpsrldq %%T2, %%T2, 4
+
+ vpclmulqdq %%T4, %%T3, %%T4, 0x10
+ ;; Shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+ vpslldq %%T4, %%T4, 4
+
+ vpxor %%T4, %%T4, %%T2
+ ;; Second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T3, %%T1, %%T4 ; reduced hash now in %%T3
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If using init/update/finalize, we need to xor any partial block data
+ ;; into the hash.
+%if %%num_initial_blocks > 1
+ ;; NOTE: for %%num_initial_blocks <= 1 this xor is not assembled here
+%if %%num_initial_blocks != 8
+ ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero
+ cmp qword [%%GDATA_CTX + PBlockLen], 0
+ je %%_no_partial_block_xor
+%endif ; %%num_initial_blocks != 8
+ vpxor %%T3, %%T3, reg(8)
+%%_no_partial_block_xor:
+%endif ; %%num_initial_blocks > 1
+%endif ; %%INSTANCE_TYPE, multi_call
+
+%if(%%num_initial_blocks=1)
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%_no_reduction_needed case only valid for
+ ;; multi_call with initial_blocks = 1.
+ ;; Look for comment above around '_no_reduction_needed'
+ ;; In every other configuration no jmp is assembled and the code falls through to %%_after_reduction; here the jmp is needed to skip %%_no_reduction_needed.
+
+ ;; The result is in %%T3
+ jmp %%_after_reduction
+
+%%_no_reduction_needed:
+ ;; The hash should end up in T3. The only way we should get here is if
+ ;; there is a partial block of data, so xor that into the hash.
+ vpxor %%T3, %%T2, reg(8)
+%endif ; %%INSTANCE_TYPE = multi_call
+%endif ; %%num_initial_blocks=1
+
+%%_after_reduction:
+ ;; Final hash is now in T3
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
+
+
+
+; encrypt 8 blocks at a time (AES rounds are interleaved with the GHASH multiplies below)
+; ghash the 8 previously encrypted ciphertext blocks (passed in %%XMM1..%%XMM8); on exit %%XMM1..%%XMM8 hold the byte-swapped blocks of this pass with the reduced hash xored into %%XMM1
+; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5 ; scratch: round keys / saved blocks, finally the reduced hash
+%define %%T2 %6 ; scratch: copy of the first incoming block, later temporaries
+%define %%T3 %7 ; scratch: per-multiply temporary, POLY2 during reduction
+%define %%T4 %8 ; scratch: accumulates the a1*b1 products
+%define %%T5 %9 ; scratch: constants, hash keys and late round keys
+%define %%T6 %10 ; scratch: accumulates the a1*b0 and a0*b1 products
+%define %%CTR %11 ; counter block, advanced by 8 per invocation
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20 ; scratch: accumulates the a0*b0 products
+%define %%loop_idx %21 ; in_order selects ONE/TWO plus a byte swap; anything else uses ONEf/TWOf with no swap
+%define %%ENC_DEC %22 ; compared against ENC below
+%define %%FULL_PARTIAL %23 ; compared against full below; otherwise the 8th block's data is neither read nor written
+
+ vmovdqa %%T2, %%XMM1 ; first incoming block stays in a register, the other seven go to the stack
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [rel ONE] ; INCR CNT
+ vmovdqu %%T5, [rel TWO]
+ vpaddd %%XMM2, %%CTR, %%T5 ; CTR+2
+ vpaddd %%XMM3, %%XMM1, %%T5 ; CTR+3
+ vpaddd %%XMM4, %%XMM2, %%T5 ; CTR+4
+ vpaddd %%XMM5, %%XMM3, %%T5 ; CTR+5
+ vpaddd %%XMM6, %%XMM4, %%T5 ; CTR+6
+ vpaddd %%XMM7, %%XMM5, %%T5 ; CTR+7
+ vpaddd %%XMM8, %%XMM6, %%T5 ; CTR+8
+ vmovdqa %%CTR, %%XMM8 ; CTR now refers to the last block of this pass
+
+ vmovdqu %%T5, [rel SHUF_MASK]
+ vpshufb %%XMM1, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM2, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM3, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM4, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM5, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM6, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM7, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM8, %%T5 ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [rel ONEf] ; INCR CNT
+ vmovdqu %%T5, [rel TWOf]
+ vpaddd %%XMM2, %%CTR, %%T5
+ vpaddd %%XMM3, %%XMM1, %%T5
+ vpaddd %%XMM4, %%XMM2, %%T5
+ vpaddd %%XMM5, %%XMM3, %%T5
+ vpaddd %%XMM6, %%XMM4, %%T5
+ vpaddd %%XMM7, %%XMM5, %%T5
+ vpaddd %%XMM8, %%XMM6, %%T5
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0] ; round key 0
+ vpxor %%XMM1, %%XMM1, %%T1
+ vpxor %%XMM2, %%XMM2, %%T1
+ vpxor %%XMM3, %%XMM3, %%T1
+ vpxor %%XMM4, %%XMM4, %%T1
+ vpxor %%XMM5, %%XMM5, %%T1
+ vpxor %%XMM6, %%XMM6, %%T1
+ vpxor %%XMM7, %%XMM7, %%T1
+ vpxor %%XMM8, %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*1] ; round key 1
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2] ; round key 2
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8] ; first saved block (in %%T2) times HashKey_8
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+
+ vmovdqu %%T1, [%%GDATA + 16*3] ; round key 3
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2] ; second saved block times HashKey_7
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4] ; round key 4
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3] ; third saved block times HashKey_6
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5] ; round key 5
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4] ; fourth saved block times HashKey_5
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6] ; round key 6
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5] ; fifth saved block times HashKey_4
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*7] ; round key 7
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6] ; sixth saved block times HashKey_3
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8] ; round key 8
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7] ; seventh saved block times HashKey_2
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9] ; round key 9 (via %%T5; %%T1 is loaded with the last saved block just below)
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8] ; eighth saved block times HashKey
+ vmovdqu %%T5, [%%GDATA + HashKey]
+
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T1, %%T4, %%T3 ; the a1*b1 accumulator moves from %%T4 to %%T1 here
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10] ; round key 10: consumed by vaesenclast when GCM128_MODE is defined
+ %ifndef GCM128_MODE ; GCM192 or GCM256
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12] ; round key 12: left for vaesenclast unless GCM256_MODE continues below
+%endif
+%ifdef GCM256_MODE
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14] ; round key 14: left for vaesenclast
+%endif ; GCM256
+
+%assign i 0
+%assign j 1
+%rep 8
+
+ ;; SNP TBD: This is pretty ugly - consider whether just XORing the
+ ;; data in after vaesenclast is simpler and performant. Would
+ ;; also have to ripple it through partial block and ghash_mul_8.
+%ifidn %%FULL_PARTIAL, full
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] ; %%T2 = last round key ^ input block
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2 ; last round and data xor in one step
+ %else
+ vaesenclast %%T3, reg(j), %%T2 ; %%T3 = output block
+ vpxor reg(j), %%T2, %%T5 ; reg(j) = (key ^ input) ^ key = the input block
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %endif
+
+%else
+ ; Don't read the final data during partial block processing
+ %ifdef NT_LD
+ %if (i<7)
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ ;; Stage the key directly in T2 rather than xor it with plaintext
+ vmovdqu %%T2, %%T5
+ %endif
+ %else
+ %if (i<7)
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %else
+ ;; Stage the key directly in T2 rather than xor it with plaintext
+ vmovdqu %%T2, %%T5
+ %endif
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ %if (i<7)
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ ;; Blocks 1..7 are stored here; the 8th block's data is neither read nor written in the else branch since it could fault
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %else
+ vaesenclast reg(j), reg(j), %%T2 ; %%T2 is just the key here, so reg(j) ends up as the plain AES output
+ %endif
+ %endif
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpslldq %%T3, %%T6, 8 ; %%T3 = %%T6 shifted left 2 DWs
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T7, %%T3
+ vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ %ifidn %%ENC_DEC, ENC
+ ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7
+ %ifidn %%FULL_PARTIAL, full
+ ;; Avoid writing past the buffer if handling a partial block
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8
+ %endif
+ %endif
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
+
+ vpshufb %%XMM1, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [rel SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [rel SHUF_MASK] ; perform a 16Byte swap
+
+
+ vpxor %%XMM1, %%T1 ; fold the reduced hash into the first block of this pass
+
+
+%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
+
+
+; GHASH the last 8 ciphertext blocks (%%XMM1..%%XMM8, multiplied by HashKey_8..HashKey); the reduced result is left in %%T6 and %%XMM1 is overwritten.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1 ; base for the HashKey_n offsets
+%define %%T1 %2 ; not referenced in this macro body
+%define %%T2 %3 ; scratch
+%define %%T3 %4 ; scratch
+%define %%T4 %5 ; scratch
+%define %%T5 %6 ; scratch: current hash key
+%define %%T6 %7 ; accumulates a1*b1; holds the final result
+%define %%T7 %8 ; accumulates a0*b0
+%define %%XMM1 %9 ; first block; reused as the (a1^a0)*(b1^b0) accumulator
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ;; Karatsuba Method
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+
+ vpshufd %%T2, %%XMM1, 01001110b ; swap the two 64-bit halves of the block
+ vpshufd %%T3, %%T5, 01001110b ; swap the two 64-bit halves of the key
+ vpxor %%T2, %%T2, %%XMM1 ; %%T2 = a1^a0 in both halves
+ vpxor %%T3, %%T3, %%T5 ; %%T3 = b1^b0 in both halves
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 ; %%T6 = a1*b1
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 ; %%T7 = a0*b0
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 ; %%XMM1 = (a1^a0)*(b1^b0)
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4 ; accumulate a1*b1
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4 ; accumulate a0*b0
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2 ; accumulate (a1^a0)*(b1^b0)
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM8
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7 ; %%T2 = middle term: (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0
+
+
+
+
+ vpslldq %%T4, %%T2, 8 ; low half of the middle term, moved up 8 bytes
+ vpsrldq %%T2, %%T2, 8 ; high half of the middle term, moved down 8 bytes
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+
+; GHASH the last 7 ciphertext blocks.
+%macro GHASH_LAST_7 15
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+
+ ;; Karatsuba Method
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_1]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+
+
+;;; Handle encryption (or decryption) of the final partial block
+;;; IN:
+;;; r13 - Number of bytes to read
+;;; KEY - on entry: value that is XOR-ed with the input bytes
+;;; MODIFIES:
+;;; KEY - on exit: ciphertext of the partial block
+;;; (ENC: computed output with the bytes not selected by k1 zeroed,
+;;; DEC: the input block exactly as loaded into T1)
+;;; SMASHES:
+;;; rax, T1, k1 (k1 is produced by READ_SMALL_DATA_INPUT, see below)
+;;; Note:
+;;; PLAIN_CYPH_LEN is unused at this stage. Previously:
+;;; it was used to determine if buffer is big enough to do
+;;; a 16 byte read & shift.
+;;; NOTE(review): the remarks below about 'LT16' look historical -
+;;; this macro has no such argument and always uses the masked
+;;; READ_SMALL_DATA_INPUT path. Confirm before relying on them.
+;;; 'LT16' is passed here only if buffer is known to be smaller
+;;; than 16 bytes.
+;;; Any other value passed here will result in 16 byte read
+;;; code path.
+%macro ENCRYPT_FINAL_PARTIAL_BLOCK 7
+%define %%KEY %1
+%define %%T1 %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET %7
+
+ ;; %%PLAIN_CYPH_IN + %%DATA_OFFSET
+ ;; - input data address
+ ;; r13 - input data length
+ ;; rax - temp registers
+ ;; out:
+ ;; T1 - packed output
+ ;; k1 - valid byte mask
+ READ_SMALL_DATA_INPUT %%T1, %%PLAIN_CYPH_IN+%%DATA_OFFSET, r13, rax
+
+ ;; At this point T1 contains the partial block data
+ ;; Plaintext XOR E(K, Yn)
+ vpxorq %%KEY, %%KEY, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Output r13 Bytes
+ ;; (masked store: only the bytes selected by k1 are written)
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{k1}, %%KEY
+
+%ifidn %%ENC_DEC, DEC
+ ;; If decrypt, restore the ciphertext into %%KEY
+ vmovdqa64 %%KEY, %%T1
+%else
+ ;; If encrypt, zero the bytes of %%KEY that are not selected by k1
+ vmovdqu8 %%KEY{k1}{z}, %%KEY
+%endif
+%endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK
+
+
+
+; ENCRYPT_SINGLE_BLOCK: AES-encrypt one 16-byte block in place.
+;   %1 - pointer to the expanded key schedule (16 bytes per round key)
+;   %2 - xmm register holding the block (input and output)
+; Emits one whitening XOR, NROUNDS vaesenc rounds and a final vaesenclast.
+; Side effect: the assembler variable 'i' is left at NROUNDS+1.
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%KEYS %1
+%define %%BLOCK %2
+
+ ;; round 0: key whitening
+ vpxor %%BLOCK, %%BLOCK, [%%KEYS + 16*0]
+
+ ;; rounds 1 .. NROUNDS
+%assign i 0
+%rep NROUNDS
+%assign i (i+1)
+ vaesenc %%BLOCK, %%BLOCK, [%%KEYS + 16*i]
+%endrep
+
+ ;; last round uses the key that follows the NROUNDS middle keys
+%assign i (i+1)
+ vaesenclast %%BLOCK, %%BLOCK, [%%KEYS + 16*i]
+%endmacro
+
+
+;; Start of Stack Setup
+
+;; FUNC_SAVE: common prologue.
+;; Pushes r12-r15, keeps the post-push stack pointer in r14, then reserves
+;; VARIABLE_OFFSET bytes and aligns rsp down to 64 bytes.
+;; On win64 xmm6-xmm15 are additionally stored at [rsp + LOCAL_STORAGE].
+;; Must be paired with FUNC_RESTORE, which relies on r14 being intact.
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp ; r14 = rsp before allocation/alignment (restored in FUNC_RESTORE)
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp down to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+;; FUNC_RESTORE: common epilogue, the inverse of FUNC_SAVE.
+;; With SAFE_DATA the scratch GP and vector registers are cleared first.
+;; On win64 xmm6-xmm15 are reloaded from [rsp + LOCAL_STORAGE]; then rsp is
+;; taken back from r14 and r15..r12 are popped.
+;; Requires r14 and rsp to still hold the values left by FUNC_SAVE.
+%macro FUNC_RESTORE 0
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+;; Required for Update/GCM_ENC
+ mov rsp, r14 ; drop the aligned local area in one step
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+; Additional Authentication data (A_IN), Additional Data length (A_LEN).
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
+; On exit xmm2 holds the byte-swapped initial counter block and xmm14 the AAD hash.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+; (NOTE: xmm14 is written as well - it is hard-wired as %%AAD_HASH below;
+; the GPRs actually used as temporaries are the ones passed as %6-%8)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 8
+%define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer
+%define %%GDATA_CTX %2 ; [in] GCM context pointer
+%define %%IV %3 ; [in] IV pointer
+%define %%A_IN %4 ; [in] AAD pointer
+%define %%A_LEN %5 ; [in] AAD length in bytes
+%define %%GPR1 %6 ; temp GPR
+%define %%GPR2 %7 ; temp GPR
+%define %%GPR3 %8 ; temp GPR
+
+%define %%AAD_HASH xmm14
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3
+
+ mov %%GPR1, %%A_LEN
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx_data.aad_length = aad_length
+
+ xor %%GPR1, %%GPR1
+ mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx_data.partial_block_length = 0
+
+ ;; read 12 IV bytes and pad with 0x00000001
+ ;; (the pad value comes from the ONEf constant, defined elsewhere)
+ mov %%GPR2, %%IV
+ vmovd xmm3, [%%GPR2 + 8] ; IV bytes 8..11
+ vpslldq xmm3, 8 ; move them to byte positions 8..11
+ vmovq xmm2, [%%GPR2] ; IV bytes 0..7
+ vmovdqa xmm4, [rel ONEf]
+ vpternlogq xmm2, xmm3, xmm4, 0xfe ; xmm2 = xmm2 or xmm3 or xmm4
+
+ vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ ;; store IV as counter in LE format
+ vpshufb xmm2, [rel SHUF_MASK]
+ vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
+
+;;; GCM_ENC_DEC_SMALL: small message path of GCM_ENC_DEC.
+;;; Dispatches at run time on %%NUM_BLOCKS (expected range 1..8; any value
+;;; that is not 2..8 ends up in the 1-block variant) to the matching
+;;; INITIAL_BLOCKS_PARTIAL expansion. Every variant is invoked with the same
+;;; fixed register assignment, listed once under the 8-block label.
+;;; The caller (GCM_ENC_DEC) has already ruled out NUM_BLOCKS == 0.
+%macro GCM_ENC_DEC_SMALL 12
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET %7
+%define %%LENGTH %8 ; assumed r13
+%define %%NUM_BLOCKS %9
+%define %%CTR %10 ; assumed xmm9
+%define %%HASH_OUT %11 ; assumed xmm14
+%define %%INSTANCE_TYPE %12
+
+ ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC.
+ ;; cmp %%NUM_BLOCKS, 0
+ ;; je %%_small_initial_blocks_encrypted
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ cmp %%NUM_BLOCKS, 5
+ je %%_small_initial_num_blocks_is_5
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+
+ ;; anything else is treated as a single block
+ jmp %%_small_initial_num_blocks_is_1
+
+
+%%_small_initial_num_blocks_is_8:
+ ;; r13 - %%LENGTH
+ ;; xmm12 - T1
+ ;; xmm13 - T2
+ ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
+ ;; xmm15 - T4
+ ;; xmm11 - T5
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8 - AAD HASH IN
+ ;; xmm10 - T6
+ ;; xmm0 - T_key
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 8, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_7:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 7, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_6:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 6, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_5:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 5, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_4:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 4, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_3:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 3, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_2:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 2, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_1:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, 1, \
+ xmm12, xmm13, %%HASH_OUT, xmm15, xmm11, %%CTR, \
+ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, \
+ xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ ;; last variant falls through to the common exit
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; GCM_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
+; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
+; INSTANCE_TYPE is either single_call or multi_call; with single_call the
+; counter block is taken from xmm2 (as left there by GCM_INIT) instead of
+; being reloaded from the context.
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 7
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%INSTANCE_TYPE %7
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_encrypt_by_8_parallel_done'
+; if there is a block of less than 16 bytes process it '%%_encrypt_final_partial .. %%_encrypt_done'
+
+ ;; Nothing to do for a zero length message.
+ ;; win64 uses cmp with an immediate here (and goes through rax further
+ ;; down) - presumably because the length is a memory operand on that
+ ;; target; confirm against the argument definitions.
+%ifidn __OUTPUT_FORMAT__, win64
+ cmp %%PLAIN_CYPH_LEN, 0
+%else
+ or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN
+%endif
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ ;; Update length of data processed
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rax, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + InLen], rax
+%else
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
+%endif
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ vmovdqu xmm8, [%%GDATA_CTX + AadHash]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: partial block processing makes only sense for multi_call here.
+ ;; Used for the update flow - if there was a previous partial
+ ;; block fill the remaining bytes here.
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+%endif
+
+ ;; lift CTR set from initial_blocks to here
+%ifidn %%INSTANCE_TYPE, single_call
+ vmovdqu xmm9, xmm2
+%else
+ vmovdqu xmm9, [%%GDATA_CTX + CurCount]
+%endif
+
+ ;; Save the amount of data left to process in r13 (copied to r10 below)
+ mov r13, %%PLAIN_CYPH_LEN
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%DATA_OFFSET is zero in single_call case.
+ ;; Consequently PLAIN_CYPH_LEN will never be zero after
+ ;; %%DATA_OFFSET subtraction below.
+ sub r13, %%DATA_OFFSET
+
+ ;; There may be no more data if it was consumed in the partial block.
+ cmp r13, 0
+ je %%_enc_dec_done
+%endif ; %%INSTANCE_TYPE, multi_call
+ mov r10, r13
+
+ ;; Determine how many blocks to process in INITIAL
+ ;; r12 = (number of full 16 byte blocks) mod 8
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ ;; Process one additional block in INITIAL if there is a partial block
+ and r10, 0xf
+ blsmsk r10, r10 ; Set CF if zero
+ cmc ; Flip CF
+ adc r12, 0x0 ; Process an additional INITIAL block if CF set
+
+ ;; Messages shorter than 128 bytes are handled by the small message
+ ;; code, which is dispatched for 1 to 8 blocks (r12 is at most
+ ;; 7 full blocks + 1 partial block here).
+ cmp r13, 128
+ jge %%_large_message_path
+
+ GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
+ jmp %%_ghash_done
+
+%%_large_message_path:
+ and r12, 0x7 ; Still, don't allow 8 INITIAL blocks since this
+ ; can be handled by the x8 partial loop.
+
+ cmp r12, 0
+ je %%_initial_num_blocks_is_0
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ ;; r13 - %%LENGTH
+ ;; xmm12 - T1
+ ;; xmm13 - T2
+ ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
+ ;; xmm15 - T4
+ ;; xmm11 - T5
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8 - AAD HASH IN
+ ;; xmm10 - T6
+ ;; xmm0 - T_key
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ ;; The entire message was encrypted in INITIAL and now only needs to be hashed
+ cmp r13, 0
+ je %%_encrypt_done
+
+ ;; Encrypt the final <16 byte (partial) block, then hash
+ cmp r13, 16
+ jl %%_encrypt_final_partial
+
+ ;; Process 7 full blocks plus a partial block
+ cmp r13, 128
+ jl %%_encrypt_by_8_partial
+
+
+%%_encrypt_by_8_parallel:
+ ;; in_order vs. out_order is an optimization to increment the counter without shuffling
+ ;; it back into little endian. r15d keeps track of when we need to increment in order so
+ ;; that the carry is handled correctly.
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [rel SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ ;; If adding 8 could carry out of the low counter byte, take the
+ ;; in_order variant (%%_encrypt_by_8) instead.
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ ;; xmm0 - T1
+ ;; xmm10 - T2
+ ;; xmm11 - T3
+ ;; xmm12 - T4
+ ;; xmm13 - T5
+ ;; xmm14 - T6
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8
+ ;; xmm15 - T7
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ cmp r13, 128
+ jge %%_encrypt_by_8_new
+
+ vpshufb xmm9, [rel SHUF_MASK]
+ jmp %%_encrypt_by_8_parallel_done
+
+%%_encrypt_by_8:
+ ;; in_order variant: the counter is byte-swapped before the call and
+ ;; swapped back afterwards
+ vpshufb xmm9, [rel SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
+ vpshufb xmm9, [rel SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ cmp r13, 128
+ jge %%_encrypt_by_8_new
+ vpshufb xmm9, [rel SHUF_MASK]
+
+
+%%_encrypt_by_8_parallel_done:
+ ;; Test to see if we need a by 8 with partial block. At this point
+ ;; bytes remaining should be either zero or between 113-127.
+ cmp r13, 0
+ je %%_encrypt_done
+
+%%_encrypt_by_8_partial:
+ ;; Shuffle needed to align key for partial block xor. out_order
+ ;; is a little faster because it avoids extra shuffles.
+ ;; TBD: Might need to account for when we don't have room to increment the counter.
+
+
+ ;; Process parallel buffers with a final partial block.
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial
+
+
+ ;; 7 full blocks were consumed; the partial block is handled below
+ add %%DATA_OFFSET, 128-16
+ sub r13, 128-16
+
+%%_encrypt_final_partial:
+
+ ;; Record the partial block length and the key stream block in the
+ ;; context before the block is processed
+ vpshufb xmm8, [rel SHUF_MASK]
+ mov [%%GDATA_CTX + PBlockLen], r13
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8
+
+ ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext
+ ;; GDATA, KEY, T1, T2
+ ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET
+
+ vpshufb xmm8, [rel SHUF_MASK]
+
+
+%%_encrypt_done:
+
+ ;; Mapping to macro parameters
+ ;; IN:
+ ;; xmm9 contains the counter
+ ;; xmm1-xmm8 contain the xor'd ciphertext
+ ;; OUT:
+ ;; xmm14 contains the final hash
+ ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; multi_call: if a partial block is pending, only the first 7 blocks
+ ;; are multiplied here and the partial block (xmm8) is XOR-ed into
+ ;; the hash.
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ jz %%_hash_last_8
+ GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ ;; XOR the partial word into the hash
+ vpxor xmm14, xmm14, xmm8
+ jmp %%_ghash_done
+%endif
+%%_hash_last_8:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+%%_ghash_done:
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+
+%%_enc_dec_done:
+
+
+%endmacro ; GCM_ENC_DEC
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
+; INSTANCE_TYPE is single_call or multi_call. With single_call the running
+; hash is expected in xmm14 on entry; with multi_call it is reloaded from
+; the context.
+; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13, xmm14, xmm15
+; (xmm0, xmm5, xmm6, xmm10 and xmm11 are handed to GHASH_MUL as temporaries)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%INSTANCE_TYPE %6
+%define %%PLAIN_CYPH_LEN rax
+
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ ;; Start AES as early as possible
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If the GCM function is called as a single function call rather
+ ;; than invoking the individual parts (init, update, finalize) we
+ ;; can remove a write to read dependency on AadHash.
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+
+ ;; Encrypt the final partial block. If we did this as a single call then
+ ;; the partial block was handled in the main GCM_ENC_DEC macro.
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+%endif
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ ;; Move the full 64-bit bit count: a 32-bit vmovd would silently
+ ;; truncate len(A) for AAD of 512 MiB or more and corrupt the tag.
+ vmovq xmm15, r12 ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ ;; Fold the length block into the hash and do the final multiplication
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
+ vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap
+
+ vpxor xmm9, xmm9, xmm14 ; tag = E(K, Y0) xor GHASH
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ ;; Fast paths for the common tag sizes, generic store otherwise
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ cmp r11, 8
+ je %%_T_8
+
+ simd_store_avx r10, xmm9, r11, r12, rax
+ jmp %%_return_T_done
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+
+%ifdef SAFE_DATA
+ ;; Clear sensitive data from context structure
+ vpxor xmm0, xmm0
+ vmovdqu [%%GDATA_CTX + AadHash], xmm0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
+%endif
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx512 /
+; aes_gcm_precomp_192_avx512 /
+; aes_gcm_precomp_256_avx512
+; (struct gcm_key_data *key_data)
+;
+; Derives the hash key from the expanded AES keys in key_data, stores
+; HashKey<<1 mod poly at [key_data + HashKey] and lets PRECOMPUTE fill
+; in the remaining key-dependent values.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(precomp,_),function,)
+FN_NAME(precomp,_):
+;; key_data arrives in a register
+%ifdef SAFE_PARAM
+ ;; Bail out on key_data == NULL
+ test arg1, arg1
+ je exit_precomp
+%endif
+
+ ;; Prologue: same frame shape as FUNC_SAVE
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp ; kept for the epilogue
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; 64-byte aligned local area
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6 is the only non-volatile vector register used here
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16], xmm6
+%endif
+
+ ;; xmm6 = E(K, 0^128), byte-swapped
+ vpxor xmm6, xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6
+ vpshufb xmm6, xmm6, [rel SHUF_MASK]
+
+ ;; ---- HashKey<<1 mod poly ----
+ ;; 128-bit shift left by one: shift each qword, then carry the top
+ ;; bit of the low qword into the high qword.
+ vpsrlq xmm2, xmm6, 63 ; per-qword carry-out bits
+ vpsllq xmm6, xmm6, 1
+ vpsrldq xmm1, xmm2, 8 ; carry out of the high qword
+ vpslldq xmm2, xmm2, 8 ; carry from low into high qword
+ vpor xmm6, xmm6, xmm2
+ ;; conditional reduction, selected by the bit shifted out at the top
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, xmm2, [rel TWOONE]
+ vpand xmm2, xmm2, [rel POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 = HashKey<<1 mod poly
+ ;; -----------------------------
+ vmovdqu [arg1 + HashKey], xmm6
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+ ;; Epilogue
+ mov rsp, r14
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+
+exit_precomp:
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx512 / aes_gcm_init_192_avx512 / aes_gcm_init_256_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;
+; Hashes the AAD and initialises context_data (see GCM_INIT).
+; With SAFE_PARAM the call is a no-op when key_data, context_data or iv is
+; NULL, or when aad is NULL while aad_len is non-zero.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(init,_),function,)
+FN_NAME(init,_):
+ ;; r12 is handed to GCM_INIT as a temporary
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ push r14
+ push r15
+ ;; r14 = rsp after the four pushes (presumably the base used to
+ ;; address the stack-passed 5th argument - confirm against arg5)
+ mov r14, rsp
+ ; GCM_INIT writes xmm6 (temporary) and xmm14 (AAD hash); both are
+ ; callee-saved on Windows and must be preserved
+ sub rsp, 2*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm14
+%endif
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_init
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_init
+
+ ;; Check IV != NULL
+ cmp arg3, 0
+ jz exit_init
+
+ ;; Check if aad_len == 0
+ cmp arg5, 0
+ jz skip_aad_check_init
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg4, 0
+ jz exit_init
+
+skip_aad_check_init:
+%endif
+ GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%endif
+exit_init:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + 0*16]
+ vmovdqu xmm14, [rsp + 1*16]
+ mov rsp, r14 ; releases the xmm save area
+ pop r15
+ pop r14
+%endif
+ pop r13
+ pop r12
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx512 / aes_gcm_enc_192_update_avx512 /
+; aes_gcm_enc_256_update_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;
+; Encrypts one more chunk of a multi-call GCM stream.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_update_),function,)
+FN_NAME(enc,_update_):
+
+ ;; The frame is set up before the argument checks, so every exit
+ ;; path below goes through FUNC_RESTORE.
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; key_data must not be NULL
+ test arg1, arg1
+ je exit_update_enc
+
+ ;; context_data must not be NULL
+ test arg2, arg2
+ je exit_update_enc
+
+ ;; A zero-length update does not need valid data pointers
+ cmp arg5, 0
+ je skip_in_out_check_update_enc
+
+ ;; plaintext_len != 0: out must not be NULL
+ test arg3, arg3
+ je exit_update_enc
+
+ ;; plaintext_len != 0: in must not be NULL
+ test arg4, arg4
+ je exit_update_enc
+
+skip_in_out_check_update_enc:
+%endif
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
+
+exit_update_enc:
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx512 / aes_gcm_dec_192_update_avx512 /
+; aes_gcm_dec_256_update_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;
+; Decrypts one more chunk of a multi-call GCM stream.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_update_),function,)
+FN_NAME(dec,_update_):
+
+ ;; The frame is set up before the argument checks, so every exit
+ ;; path below goes through FUNC_RESTORE.
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; key_data must not be NULL
+ test arg1, arg1
+ je exit_update_dec
+
+ ;; context_data must not be NULL
+ test arg2, arg2
+ je exit_update_dec
+
+ ;; A zero-length update does not need valid data pointers
+ cmp arg5, 0
+ je skip_in_out_check_update_dec
+
+ ;; plaintext_len != 0: out must not be NULL
+ test arg3, arg3
+ je exit_update_dec
+
+ ;; plaintext_len != 0: in must not be NULL
+ test arg4, arg4
+ je exit_update_dec
+
+skip_in_out_check_update_dec:
+%endif
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
+
+exit_update_dec:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx512 / aes_gcm_enc_192_finalize_avx512 /
+; aes_gcm_enc_256_finalize_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;
+; Produces the authentication tag of a multi-call encryption (see GCM_COMPLETE).
+; With SAFE_PARAM the call is a no-op for NULL pointers, auth_tag_len == 0
+; or auth_tag_len > 16.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_finalize_),function,)
+FN_NAME(enc,_finalize_):
+
+;; All parameters are passed through registers
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_enc_fin
+
+ cmp arg4, 16
+ ja exit_enc_fin
+%endif
+
+ ;; r12 is used as a temporary by GCM_COMPLETE
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows.
+ ; Save every non-volatile xmm register GCM_COMPLETE touches: it loads
+ ; HashKey into xmm13 and passes xmm10 to GHASH_MUL in addition to
+ ; xmm6, xmm9, xmm11, xmm14 and xmm15.
+ sub rsp, 7*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm9
+ vmovdqu [rsp + 2*16], xmm10
+ vmovdqu [rsp + 3*16], xmm11
+ vmovdqu [rsp + 4*16], xmm13
+ vmovdqu [rsp + 5*16], xmm14
+ vmovdqu [rsp + 6*16], xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 6*16]
+ vmovdqu xmm14, [rsp + 5*16]
+ vmovdqu xmm13, [rsp + 4*16]
+ vmovdqu xmm11, [rsp + 3*16]
+ vmovdqu xmm10, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 7*16
+%endif
+
+ pop r12
+
+exit_enc_fin:
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx512 / aes_gcm_dec_192_finalize_avx512 /
+; aes_gcm_dec_256_finalize_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_finalize_),function,)
+FN_NAME(dec,_finalize_):
+;; Decrypt "finalize" entry point: invokes GCM_COMPLETE (DEC, multi_call) with key_data, context_data, auth_tag and auth_tag_len.
+;; All parameters are passed through registers
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec_fin ; invalid argument: return immediately (nothing has been pushed yet)
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_dec_fin
+
+ cmp arg4, 16
+ ja exit_dec_fin ; unsigned compare: any length above 16 bytes is rejected
+%endif
+
+ push r12 ; saved only after the parameter checks, so the early exits skip the matching pop
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows; only xmm6/9/11/14/15 are saved here (presumably all that GCM_COMPLETE modifies - verify)
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm9
+ vmovdqu [rsp + 2*16], xmm11
+ vmovdqu [rsp + 3*16], xmm14
+ vmovdqu [rsp + 4*16], xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
+
+%ifdef SAFE_DATA ; NOTE(review): presumably wipes scratch GP/vector registers before returning - confirm in include/clear_regs.asm
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%endif
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 4*16] ; restore in reverse order of the saves above
+ vmovdqu xmm14, [rsp + 3*16]
+ vmovdqu xmm11, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+exit_dec_fin: ; placed after the pop so that the SAFE_PARAM early exits leave the stack untouched
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx512 / aes_gcm_enc_192_avx512 / aes_gcm_enc_256_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_),function,)
+FN_NAME(enc,_):
+;; Single-call encrypt entry point: GCM_INIT, GCM_ENC_DEC (ENC) and GCM_COMPLETE run back to back on one message.
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc ; invalid argument: return without processing any data
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_enc
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_enc
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_enc
+
+ cmp arg10, 16
+ ja exit_enc ; unsigned compare: any length above 16 bytes is rejected
+
+ ;; Check if plaintext_len == 0 (in/out may then be NULL, so their checks are skipped)
+ cmp arg5, 0
+ jz skip_in_out_check_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_enc
+
+skip_in_out_check_enc:
+ ;; Check if aad_len == 0 (aad may then be NULL, so its check is skipped)
+ cmp arg8, 0
+ jz skip_aad_check_enc
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_enc
+
+skip_aad_check_enc:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 ; key_data, context_data, iv, aad, aad_len; r10-r12 presumably scratch GPRs (see GCM_INIT)
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call ; key_data, context_data, out, in, plaintext_len
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call ; key_data, context_data, auth_tag, auth_tag_len
+
+exit_enc: ; shared exit: FUNC_SAVE has already run on every path that reaches this label
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_avx512 / aes_gcm_dec_192_avx512 / aes_gcm_dec_256_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_),function,)
+FN_NAME(dec,_):
+;; Single-call decrypt entry point: GCM_INIT, GCM_ENC_DEC (DEC) and GCM_COMPLETE run back to back on one message.
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec ; invalid argument: return without processing any data
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_dec
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_dec
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_dec
+
+ cmp arg10, 16
+ ja exit_dec ; unsigned compare: any length above 16 bytes is rejected
+
+ ;; Check if plaintext_len == 0 (in/out may then be NULL, so their checks are skipped)
+ cmp arg5, 0
+ jz skip_in_out_check_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_dec
+
+skip_in_out_check_dec:
+ ;; Check if aad_len == 0 (aad may then be NULL, so its check is skipped)
+ cmp arg8, 0
+ jz skip_aad_check_dec
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_dec
+
+skip_aad_check_dec:
+%endif
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 ; key_data, context_data, iv, aad, aad_len; r10-r12 presumably scratch GPRs (see GCM_INIT)
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call ; key_data, context_data, out, in, plaintext_len
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call ; key_data, context_data, auth_tag, auth_tag_len
+
+exit_dec: ; shared exit: FUNC_SAVE has already run on every path that reaches this label
+ FUNC_RESTORE
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits ; empty marker section telling the GNU linker this object does not need an executable stack
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm
new file mode 100644
index 000000000..4ef183d31
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm
@@ -0,0 +1,4272 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+; Tomasz Kantecki
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+; The details of the implementation are explained in:
+; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; From the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any aadLen length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
+;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "include/clear_regs.asm"
+%include "include/gcm_defines.asm"
+%include "include/gcm_keys_vaes_avx512.asm"
+%include "include/memcpy.asm"
+%include "include/aes_common.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_vaes_avx512.asm!"
+%endif
+%endif
+%endif
+
+;; Decide on AES-GCM key size to compile for; FN_NAME(x,y) pastes the exported symbol name, e.g. FN_NAME(enc,_update_) -> aes_gcm_enc_128_update_vaes_avx512
+%ifdef GCM128_MODE
+%define NROUNDS 9 ; NOTE(review): presumably the number of AES rounds excluding the final one for a 128-bit key - confirm where NROUNDS is used
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ vaes_avx512
+%endif
+
+%ifdef GCM192_MODE
+%define NROUNDS 11 ; 192-bit key variant (same convention as above)
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ vaes_avx512
+%endif
+
+%ifdef GCM256_MODE
+%define NROUNDS 13 ; 256-bit key variant (same convention as above)
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ vaes_avx512
+%endif
+
+section .text
+default rel
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Stack frame definition
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE (10*16) ; space for 10 XMM registers
+ %define GP_STORAGE ((9*8) + 24) ; space for 9 GP registers + 24 bytes for 64 byte alignment
+%else
+ %define XMM_STORAGE 0 ; no XMM save area is reserved outside win64
+ %define GP_STORAGE (8*8) ; space for 7 GP registers + 1 for alignment
+%endif
+%ifdef GCM_BIG_DATA
+%define LOCAL_STORAGE (128*16) ; space for up to 128 AES blocks
+%else
+%define LOCAL_STORAGE (48*16) ; space for up to 48 AES blocks
+%endif
+
+;;; sequence is (bottom-up): GP, XMM, local
+%define STACK_GP_OFFSET 0
+%define STACK_XMM_OFFSET (STACK_GP_OFFSET + GP_STORAGE) ; 96 on win64, 64 otherwise
+%define STACK_LOCAL_OFFSET (STACK_XMM_OFFSET + XMM_STORAGE) ; 256 on win64, 64 otherwise (both multiples of 64)
+%define STACK_FRAME_SIZE (STACK_LOCAL_OFFSET + LOCAL_STORAGE) ; 1024 (win64) / 832 bytes by default; 2304 / 2112 with GCM_BIG_DATA
+
+;; for compatibility with stack argument definitions in gcm_defines.asm
+%define STACK_OFFSET 0
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Horizontal XOR - 4 x 128bits xored together; the result is left in the low 128 bits of %%REG
+%macro VHPXORI4x128 2
+%define %%REG %1 ; [in/out] ZMM with 4x128bits to xor; 128bit output
+%define %%TMP %2 ; [clobbered] ZMM temporary register
+ vextracti64x4 YWORD(%%TMP), %%REG, 1 ; TMP[255:0] = REG[511:256] (lanes 2 and 3)
+ vpxorq YWORD(%%REG), YWORD(%%REG), YWORD(%%TMP) ; REG lanes {0,1} ^= lanes {2,3}
+ vextracti32x4 XWORD(%%TMP), YWORD(%%REG), 1 ; TMP[127:0] = REG[255:128]
+ vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP) ; XWORD(REG) = lane0 ^ lane1 ^ lane2 ^ lane3
+%endmacro ; VHPXORI4x128
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Horizontal XOR - 2 x 128bits xored together; the result is left in the low 128 bits of %%REG
+%macro VHPXORI2x128 2
+%define %%REG %1 ; [in/out] YMM/ZMM with 2x128bits to xor; 128bit output
+%define %%TMP %2 ; [clobbered] XMM/YMM/ZMM temporary register
+ vextracti32x4 XWORD(%%TMP), %%REG, 1 ; TMP[127:0] = REG[255:128] (lane 1)
+ vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP) ; XWORD(REG) = lane0 ^ lane1
+%endmacro ; VHPXORI2x128
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply - 1st step: carry-less multiply of %%HI by the hash key, producing high/medium/low partial products
+%macro VCLMUL_STEP1 6-7
+%define %%KP %1 ; [in] key pointer (only read when %%HKEY is not supplied)
+%define %%HI %2 ; [in] previous blocks 4 to 7
+%define %%TMP %3 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TH %4 ; [out] high product
+%define %%TM %5 ; [out] medium product
+%define %%TL %6 ; [out] low product
+%define %%HKEY %7 ; [in/optional] hash key for multiplication
+
+%if %0 == 6
+ vmovdqu64 %%TMP, [%%KP + HashKey_4] ; default key when no %%HKEY is passed
+%else
+ vmovdqa64 %%TMP, %%HKEY
+%endif
+ vpclmulqdq %%TH, %%HI, %%TMP, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%TL, %%HI, %%TMP, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%TM, %%HI, %%TMP, 0x01 ; %%TM = a1*b0
+ vpclmulqdq %%TMP, %%HI, %%TMP, 0x10 ; %%TMP = a0*b1
+ vpxorq %%TM, %%TM, %%TMP ; %%TM = a1*b0 xor a0*b1 -> [%%TH : %%TM : %%TL]
+%endmacro ; VCLMUL_STEP1
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply - 2nd step: multiplies %%LO by the hash key, adds the step 1 products and folds the middle term into HI:LO
+%macro VCLMUL_STEP2 9-11
+%define %%KP %1 ; [in] key pointer (only read when %%HKEY is not supplied)
+%define %%HI %2 ; [out] ghash high 128 bits
+%define %%LO %3 ; [in/out] cipher text blocks 0-3 (in); ghash low 128 bits (out)
+%define %%TMP0 %4 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TMP1 %5 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TMP2 %6 ; [clobbered] ZMM/YMM/XMM temporary (may be the same register as %%HKEY, which is copied to %%TMP0 before %%TMP2 is written)
+%define %%TH %7 ; [in/clobbered] high product
+%define %%TM %8 ; [in/clobbered] medium product
+%define %%TL %9 ; [in/clobbered] low product
+%define %%HKEY %10 ; [in/optional] hash key for multiplication
+%define %%HXOR %11 ; [in/optional] type of horizontal xor (4 - 4x128; 2 - 2x128; 1 - none)
+
+%if %0 == 9
+ vmovdqu64 %%TMP0, [%%KP + HashKey_8] ; default key when no %%HKEY is passed
+%else
+ vmovdqa64 %%TMP0, %%HKEY
+%endif
+ vpclmulqdq %%TMP1, %%LO, %%TMP0, 0x10 ; %%TMP1 = a0*b1
+ vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x11 ; %%TMP2 = a1*b1
+ vpxorq %%TH, %%TH, %%TMP2 ; accumulate into the high product from step 1
+ vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x00 ; %%TMP2 = a0*b0
+ vpxorq %%TL, %%TL, %%TMP2 ; accumulate into the low product from step 1
+ vpclmulqdq %%TMP0, %%LO, %%TMP0, 0x01 ; %%TMP0 = a1*b0
+ vpternlogq %%TM, %%TMP1, %%TMP0, 0x96 ; %%TM = TM xor TMP1 xor TMP0
+
+ ;; finish multiplications
+ vpsrldq %%TMP2, %%TM, 8 ; upper 64 bits of each middle term move into the high half
+ vpxorq %%HI, %%TH, %%TMP2
+ vpslldq %%TMP2, %%TM, 8 ; lower 64 bits of each middle term move into the low half
+ vpxorq %%LO, %%TL, %%TMP2
+
+ ;; xor 128bit words horizontally and compute [(X8*H1) + (X7*H2) + ... ((X1+Y0)*H8]
+ ;; note: (X1+Y0) handled elsewhere
+%if %0 < 11
+ VHPXORI4x128 %%HI, %%TMP2
+ VHPXORI4x128 %%LO, %%TMP1
+%else
+%if %%HXOR == 4
+ VHPXORI4x128 %%HI, %%TMP2
+ VHPXORI4x128 %%LO, %%TMP1
+%elif %%HXOR == 2
+ VHPXORI2x128 %%HI, %%TMP2
+ VHPXORI2x128 %%LO, %%TMP1
+%endif ; HXOR
+ ;; for HXOR == 1 there is nothing to be done
+%endif ; !(%0 < 11)
+ ;; HIx holds top 128 bits
+ ;; LOx holds low 128 bits
+ ;; - further reductions to follow
+%endmacro ; VCLMUL_STEP2
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; AVX512 reduction macro: folds the 256-bit value %%HI128:%%LO128 into a 128-bit result using %%POLY
+%macro VCLMUL_REDUCE 6
+%define %%OUT %1 ; [out] zmm/ymm/xmm: result (must not be %%TMP1 or %%HI128 - both are read after %%OUT is first written)
+%define %%POLY %2 ; [in] zmm/ymm/xmm: polynomial
+%define %%HI128 %3 ; [in] zmm/ymm/xmm: high 128b of hash to reduce
+%define %%LO128 %4 ; [in] zmm/ymm/xmm: low 128b of hash to reduce
+%define %%TMP0 %5 ; [clobbered] zmm/ymm/xmm: temporary register
+%define %%TMP1 %6 ; [clobbered] zmm/ymm/xmm: temporary register
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; first phase of the reduction
+ vpclmulqdq %%TMP0, %%POLY, %%LO128, 0x01
+ vpslldq %%TMP0, %%TMP0, 8 ; shift-L 2 DWs
+ vpxorq %%TMP0, %%LO128, %%TMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; second phase of the reduction
+ vpclmulqdq %%TMP1, %%POLY, %%TMP0, 0x00
+ vpsrldq %%TMP1, %%TMP1, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq %%OUT, %%POLY, %%TMP0, 0x10
+ vpslldq %%OUT, %%OUT, 4 ; shift-L 1-DW to obtain result with no shifts
+
+ vpternlogq %%OUT, %%TMP1, %%HI128, 0x96 ; OUT/GHASH = OUT xor TMP1 xor HI128
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endmacro
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply (1 to 8 blocks) - 1st step: handles blocks 4-7; for NBLOCKS <= 4 the products are simply zeroed
+%macro VCLMUL_1_TO_8_STEP1 8
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [in] ZMM ciphered blocks 4 to 7 (masked in place when NBLOCKS == 7)
+%define %%TMP1 %3 ; [clobbered] ZMM temporary
+%define %%TMP2 %4 ; [clobbered] ZMM temporary
+%define %%TH %5 ; [out] ZMM high product
+%define %%TM %6 ; [out] ZMM medium product
+%define %%TL %7 ; [out] ZMM low product
+%define %%NBLOCKS %8 ; [in] number of blocks to ghash (0 to 8)
+
+%if %%NBLOCKS == 8
+ VCLMUL_STEP1 %%KP, %%HI, %%TMP1, %%TH, %%TM, %%TL
+%elif %%NBLOCKS == 7
+ vmovdqu64 %%TMP2, [%%KP + HashKey_3]
+ vmovdqa64 %%TMP1, [rel mask_out_top_block] ; NOTE(review): presumably clears the top 128-bit lane - confirm in the constant's definition
+ vpandq %%TMP2, %%TMP1 ; apply the mask to the key ...
+ vpandq %%HI, %%TMP1 ; ... and to the input blocks
+ VCLMUL_STEP1 NULL, %%HI, %%TMP1, %%TH, %%TM, %%TL, %%TMP2 ; key pointer is not read when a key register is passed
+%elif %%NBLOCKS == 6
+ vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2]
+ VCLMUL_STEP1 NULL, YWORD(%%HI), YWORD(%%TMP1), \
+ YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2)
+%elif %%NBLOCKS == 5
+ vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1]
+ VCLMUL_STEP1 NULL, XWORD(%%HI), XWORD(%%TMP1), \
+ XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2)
+%else
+ vpxorq %%TH, %%TH ; 4 or fewer blocks: nothing in the upper register, start step 2 from zero sums
+ vpxorq %%TM, %%TM
+ vpxorq %%TL, %%TL
+%endif
+%endmacro ; VCLMUL_1_TO_8_STEP1
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply (1 to 8 blocks) - 2nd step: handles blocks 0-3 with the hash key selected by NBLOCKS
+%macro VCLMUL_1_TO_8_STEP2 10
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [out] ZMM ghash high 128bits
+%define %%LO %3 ; [in/out] ZMM ciphered blocks 0 to 3 (in); ghash low 128bits (out)
+%define %%TMP0 %4 ; [clobbered] ZMM temporary
+%define %%TMP1 %5 ; [clobbered] ZMM temporary
+%define %%TMP2 %6 ; [clobbered] ZMM temporary
+%define %%TH %7 ; [in/clobbered] ZMM high sum
+%define %%TM %8 ; [in/clobbered] ZMM medium sum
+%define %%TL %9 ; [in/clobbered] ZMM low sum
+%define %%NBLOCKS %10 ; [in] number of blocks to ghash (0 to 8)
+
+%if %%NBLOCKS == 8
+ VCLMUL_STEP2 %%KP, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL
+%elif %%NBLOCKS == 7
+ vmovdqu64 %%TMP2, [%%KP + HashKey_7] ; %%TMP2 doubles as the key argument below; VCLMUL_STEP2 copies it before reusing %%TMP2
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 6
+ vmovdqu64 %%TMP2, [%%KP + HashKey_6]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 5
+ vmovdqu64 %%TMP2, [%%KP + HashKey_5]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 4
+ vmovdqu64 %%TMP2, [%%KP + HashKey_4]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 3
+ vmovdqu64 %%TMP2, [%%KP + HashKey_3]
+ vmovdqa64 %%TMP1, [rel mask_out_top_block] ; NOTE(review): presumably clears the top 128-bit lane - confirm in the constant's definition
+ vpandq %%TMP2, %%TMP1 ; apply the mask to the key ...
+ vpandq %%LO, %%TMP1 ; ... and to the input blocks
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 2
+ vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2]
+ VCLMUL_STEP2 NULL, YWORD(%%HI), YWORD(%%LO), \
+ YWORD(%%TMP0), YWORD(%%TMP1), YWORD(%%TMP2), \
+ YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2), 2
+%elif %%NBLOCKS == 1
+ vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1]
+ VCLMUL_STEP2 NULL, XWORD(%%HI), XWORD(%%LO), \
+ XWORD(%%TMP0), XWORD(%%TMP1), XWORD(%%TMP2), \
+ XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2), 1
+%else
+ vpxorq %%HI, %%HI ; NBLOCKS == 0: no data, both halves of the result are zero
+ vpxorq %%LO, %%LO
+%endif
+%endmacro ; VCLMUL_1_TO_8_STEP2
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; GHASH 1 to 16 blocks of cipher text
+;;; - performs reduction at the end
+;;; - can take intermediate GHASH sums as input
+%macro GHASH_1_TO_16 20
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%GHASH %2 ; [out] ghash output (its XWORD is written; must not be %%T1 or %%T4, see VCLMUL_REDUCE at the end)
+%define %%T1 %3 ; [clobbered] temporary ZMM
+%define %%T2 %4 ; [clobbered] temporary ZMM
+%define %%T3 %5 ; [clobbered] temporary ZMM
+%define %%T4 %6 ; [clobbered] temporary ZMM
+%define %%T5 %7 ; [clobbered] temporary ZMM
+%define %%T6 %8 ; [clobbered] temporary ZMM
+%define %%T7 %9 ; [clobbered] temporary ZMM
+%define %%T8 %10 ; [clobbered] temporary ZMM
+%define %%T9 %11 ; [clobbered] temporary ZMM
+%define %%GH %12 ; [in/clobbered] ghash sum (high) or "no_zmm"
+%define %%GL %13 ; [in/clobbered] ghash sum (low) or "no_zmm"
+%define %%GM %14 ; [in/clobbered] ghash sum (medium) or "no_zmm"
+%define %%AAD_HASH_IN %15 ; [in] input hash value
+%define %%CIPHER_IN0 %16 ; [in/clobbered] ZMM with cipher text blocks 0-3 (the input hash is xored into it)
+%define %%CIPHER_IN1 %17 ; [in] ZMM with cipher text blocks 4-7
+%define %%CIPHER_IN2 %18 ; [in] ZMM with cipher text blocks 8-11
+%define %%CIPHER_IN3 %19 ; [in] ZMM with cipher text blocks 12-15
+%define %%NUM_BLOCKS %20 ; [in] numerical value, number of blocks
+
+%define %%T0H %%T1
+%define %%T0L %%T2
+%define %%T0M1 %%T3
+%define %%T0M2 %%T4
+
+%define %%T1H %%T5
+%define %%T1L %%T6
+%define %%T1M1 %%T7
+%define %%T1M2 %%T8
+
+%define %%HK %%T9
+;; hashk: byte offset of the first hash key to use; it advances by 64 per group of four blocks in the loop below
+%assign hashk HashKey_ %+ %%NUM_BLOCKS
+%assign reg_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+ vpxorq %%CIPHER_IN0, %%CIPHER_IN0, %%AAD_HASH_IN ; NOTE(review): full-width xor, so lanes 1-3 of the input hash are presumably expected to be zero
+
+%assign first_result 1
+;; the incoming sums are used only when all three of GH/GM/GL are real registers
+%ifnidn %%GH, no_zmm
+%ifnidn %%GM, no_zmm
+%ifnidn %%GL, no_zmm
+ ;; GHASH sums passed in to be updated and
+ ;; reduced at the end
+ vmovdqa64 %%T0H, %%GH
+ vmovdqa64 %%T0L, %%GL
+ vmovdqa64 %%T0M1, %%GM
+ vpxorq %%T0M2, %%T0M2 ; only one medium sum is passed in, so the second accumulator starts at zero
+%assign first_result 0
+%endif
+%endif
+%endif
+;; full groups of four blocks: one ZMM of cipher text against one ZMM of hash keys per iteration
+%rep (blocks_left / 4)
+%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx
+ vmovdqu64 %%HK, [%%KP + hashk]
+%if first_result == 1
+ vpclmulqdq %%T0H, %%REG_IN, %%HK, 0x11 ; H = a1*b1
+ vpclmulqdq %%T0L, %%REG_IN, %%HK, 0x00 ; L = a0*b0
+ vpclmulqdq %%T0M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0
+ vpclmulqdq %%T0M2, %%REG_IN, %%HK, 0x10 ; M2 = a0*b1
+%assign first_result 0
+%else
+ vpclmulqdq %%T1H, %%REG_IN, %%HK, 0x11 ; H = a1*b1
+ vpclmulqdq %%T1L, %%REG_IN, %%HK, 0x00 ; L = a0*b0
+ vpclmulqdq %%T1M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0
+ vpclmulqdq %%T1M2, %%REG_IN, %%HK, 0x10 ; M2 = a0*b1
+ vpxorq %%T0H, %%T0H, %%T1H ; accumulate this group's products into the running T0* sums
+ vpxorq %%T0L, %%T0L, %%T1L
+ vpxorq %%T0M1, %%T0M1, %%T1M1
+ vpxorq %%T0M2, %%T0M2, %%T1M2
+%endif
+%undef %%REG_IN
+%assign reg_idx (reg_idx + 1)
+%assign hashk (hashk + 64)
+%assign blocks_left (blocks_left - 4)
+%endrep
+
+%if blocks_left > 0
+;; There are 1, 2 or 3 blocks left to process.
+;; It may also be that they are the only blocks to process.
+
+%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx
+
+%if first_result == 1
+;; Case where %%NUM_BLOCKS = 1, 2 or 3
+%xdefine %%OUT_H %%T0H
+%xdefine %%OUT_L %%T0L
+%xdefine %%OUT_M1 %%T0M1
+%xdefine %%OUT_M2 %%T0M2
+%else
+%xdefine %%OUT_H %%T1H
+%xdefine %%OUT_L %%T1L
+%xdefine %%OUT_M1 %%T1M1
+%xdefine %%OUT_M2 %%T1M2
+%endif
+
+%if blocks_left == 1
+ vmovdqu64 XWORD(%%HK), [%%KP + hashk]
+ vpclmulqdq XWORD(%%OUT_H), XWORD(%%REG_IN), XWORD(%%HK), 0x11 ; %%TH = a1*b1
+ vpclmulqdq XWORD(%%OUT_L), XWORD(%%REG_IN), XWORD(%%HK), 0x00 ; %%TL = a0*b0
+ vpclmulqdq XWORD(%%OUT_M1), XWORD(%%REG_IN), XWORD(%%HK), 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq XWORD(%%OUT_M2), XWORD(%%REG_IN), XWORD(%%HK), 0x10 ; %%TM2 = a0*b1
+%elif blocks_left == 2
+ vmovdqu64 YWORD(%%HK), [%%KP + hashk]
+ vpclmulqdq YWORD(%%OUT_H), YWORD(%%REG_IN), YWORD(%%HK), 0x11 ; %%TH = a1*b1
+ vpclmulqdq YWORD(%%OUT_L), YWORD(%%REG_IN), YWORD(%%HK), 0x00 ; %%TL = a0*b0
+ vpclmulqdq YWORD(%%OUT_M1), YWORD(%%REG_IN), YWORD(%%HK), 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq YWORD(%%OUT_M2), YWORD(%%REG_IN), YWORD(%%HK), 0x10 ; %%TM2 = a0*b1
+%else ; blocks_left == 3
+ vmovdqu64 YWORD(%%HK), [%%KP + hashk] ; first two keys ...
+ vinserti64x2 %%HK, [%%KP + hashk + 32], 2 ; ... plus the third key in lane 2 (48 bytes of keys read in total)
+ vpclmulqdq %%OUT_H, %%REG_IN, %%HK, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%OUT_L, %%REG_IN, %%HK, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%OUT_M1, %%REG_IN, %%HK, 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq %%OUT_M2, %%REG_IN, %%HK, 0x10 ; %%TM2 = a0*b1
+%endif ; blocks_left
+
+%undef %%REG_IN
+%undef %%OUT_H
+%undef %%OUT_L
+%undef %%OUT_M1
+%undef %%OUT_M2
+
+%if first_result != 1
+ vpxorq %%T0H, %%T0H, %%T1H ; tail products were written to T1*, fold them into the running sums
+ vpxorq %%T0L, %%T0L, %%T1L
+ vpxorq %%T0M1, %%T0M1, %%T1M1
+ vpxorq %%T0M2, %%T0M2, %%T1M2
+%endif
+
+%endif ; blocks_left > 0
+
+ ;; integrate TM into TH and TL
+ vpxorq %%T0M1, %%T0M1, %%T0M2
+ vpsrldq %%T1M1, %%T0M1, 8 ; T1M1/T1M2 are reused as plain temporaries from here on
+ vpslldq %%T1M2, %%T0M1, 8
+ vpxorq %%T0H, %%T0H, %%T1M1
+ vpxorq %%T0L, %%T0L, %%T1M2
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%T0H, %%T1M1
+ VHPXORI4x128 %%T0L, %%T1M2
+
+ ;; reduction
+ vmovdqa64 XWORD(%%HK), [rel POLY2]
+ VCLMUL_REDUCE XWORD(%%GHASH), XWORD(%%HK), \
+ XWORD(%%T0H), XWORD(%%T0L), XWORD(%%T0M1), XWORD(%%T0M2)
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+;;; Input: A and B (128-bits each, bit-reflected)
+;;; Output: C = A*B*x mod poly, (i.e. >>1 )
+;;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+;;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes, [in/out] multiplicand, overwritten with the reduced product
+%define %%HK %2 ; 16 Bytes, [in] hash key, not modified
+%define %%T1 %3 ; [clobbered]
+%define %%T2 %4 ; [clobbered]
+%define %%T3 %5 ; [clobbered]
+%define %%T4 %6 ; accepted but not referenced by this macro body
+%define %%T5 %7 ; accepted but not referenced by this macro body
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxorq %%GH, %%GH, %%T3 ; %%GH = middle term (a0*b1 xor a1*b0)
+
+
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxorq %%T1, %%T1, %%T3 ; %%T1 = high 128 bits of the 256-bit product
+ vpxorq %%GH, %%GH, %%T2 ; %%GH = low 128 bits of the 256-bit product
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu64 %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxorq %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; Shift-L 1-DW to obtain result with no shifts
+
+ ; second phase of the reduction complete, the result is in %%GH
+ vpternlogq %%GH, %%T1, %%T2, 0x96 ; GH = GH xor T1 xor T2
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; PRECOMPUTE stores successive hash key powers HashKey_2 .. HashKey_<max_hkey_idx> into the key structure at %%GDATA.
+;;; NOTE(review): the original comment here referred to "Hashkey_i_k" stores kept for switching cpu architectures
+;;; between pre/init/update/finalize calls; this macro body writes no such values - confirm and drop if stale.
+%macro PRECOMPUTE 8
+%define %%GDATA %1 ; [in] base pointer that the HashKey_i offsets are added to
+%define %%HK %2 ; [in] hash key (HashKey<<1 mod poly, per the GHASH_MUL contract); not modified
+%define %%T1 %3 ; [clobbered] temporary
+%define %%T2 %4 ; temporary passed to GHASH_MUL in a slot that macro does not use
+%define %%T3 %5 ; [clobbered] temporary
+%define %%T4 %6 ; [clobbered] temporary
+%define %%T5 %7 ; [clobbered] running power of the hash key
+%define %%T6 %8 ; temporary passed to GHASH_MUL in a slot that macro does not use
+
+ vmovdqa %%T5, %%HK ; start from the first power
+
+ ;; GHASH keys 2 to 48 or 128
+%ifdef GCM_BIG_DATA
+%assign max_hkey_idx 128
+%else
+%assign max_hkey_idx 48
+%endif
+
+%assign i 2
+%rep (max_hkey_idx - 1)
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^i<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_ %+ i], %%T5 ; [HashKey_i] = %%T5
+%assign i (i + 1)
+%endrep
+
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; READ_SMALL_DATA_INPUT
+;;; Loads min(%%LENGTH, 16) bytes from %%INPUT into %%OUTPUT; bytes past %%LENGTH are zeroed by the {z} masked load,
+;;; so %%OUTPUT is all zero for length 0. %%LENGTH is an unsigned byte count held in a 64-bit GP register.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 5
+%define %%OUTPUT %1 ; [out] xmm register
+%define %%INPUT %2 ; [in] buffer pointer to read from
+%define %%LENGTH %3 ; [in] number of bytes to read (unsigned; not modified)
+%define %%TMP1 %4 ; [clobbered] GP register
+%define %%MASK %5 ; [out] k1 to k7 register to store the partial block mask
+
+ cmp %%LENGTH, 16
+ jae %%_read_small_data_ge16 ; unsigned compare: a length with bit 63 set must not index the mask table below
+ lea %%TMP1, [rel byte_len_to_mask_table] ; table of 16-bit masks indexed by length (2 bytes per entry)
+%ifidn __OUTPUT_FORMAT__, win64
+ add %%TMP1, %%LENGTH ; TMP1 += 2 * LENGTH, same address as the scaled-index form in the %else branch
+ add %%TMP1, %%LENGTH
+ kmovw %%MASK, [%%TMP1]
+%else
+ kmovw %%MASK, [%%TMP1 + %%LENGTH*2]
+%endif
+ vmovdqu8 %%OUTPUT{%%MASK}{z}, [%%INPUT] ; masked load: bytes whose mask bit is clear are zeroed and not read
+ jmp %%_read_small_data_end
+%%_read_small_data_ge16:
+ VX512LDR %%OUTPUT, [%%INPUT] ; full 16-byte load (VX512LDR is defined outside this macro)
+ mov %%TMP1, 0xffff ; all 16 byte lanes valid
+ kmovq %%MASK, %%TMP1
+%%_read_small_data_end:
+%endmacro ; READ_SMALL_DATA_INPUT
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 18
+%define %%A_IN %1 ; [in] AAD text pointer
+%define %%A_LEN %2 ; [in] AAD length
+%define %%AAD_HASH %3 ; [out] xmm ghash value
+%define %%GDATA_KEY %4 ; [in] pointer to keys
+%define %%ZT0 %5 ; [clobbered] ZMM register
+%define %%ZT1 %6 ; [clobbered] ZMM register
+%define %%ZT2 %7 ; [clobbered] ZMM register
+%define %%ZT3 %8 ; [clobbered] ZMM register
+%define %%ZT4 %9 ; [clobbered] ZMM register
+%define %%ZT5 %10 ; [clobbered] ZMM register
+%define %%ZT6 %11 ; [clobbered] ZMM register
+%define %%ZT7 %12 ; [clobbered] ZMM register
+%define %%ZT8 %13 ; [clobbered] ZMM register
+%define %%ZT9 %14 ; [clobbered] ZMM register
+%define %%T1 %15 ; [clobbered] GP register
+%define %%T2 %16 ; [clobbered] GP register
+%define %%T3 %17 ; [clobbered] GP register
+%define %%MASKREG %18 ; [clobbered] mask register
+
+%define %%SHFMSK %%ZT9
+%define %%POLY %%ZT8
+%define %%TH %%ZT7
+%define %%TM %%ZT6
+%define %%TL %%ZT5
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxorq %%AAD_HASH, %%AAD_HASH
+
+ vmovdqa64 %%SHFMSK, [rel SHUF_MASK]
+ vmovdqa64 %%POLY, [rel POLY2]
+
+%%_get_AAD_loop128:
+ cmp %%T2, 128
+ jl %%_exit_AAD_loop128
+
+ vmovdqu64 %%ZT2, [%%T1 + 64*0] ; LO blocks (0-3)
+ vmovdqu64 %%ZT1, [%%T1 + 64*1] ; HI blocks (4-7)
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+
+ VCLMUL_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%TH, %%TM, %%TL
+ VCLMUL_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, %%ZT0, %%ZT3, %%ZT4, %%TH, %%TM, %%TL
+
+ ;; result in %%ZT1(H):%%ZT2(L)
+ ;; reduce and put the result in AAD_HASH
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \
+ XWORD(%%ZT0), XWORD(%%ZT3)
+
+ sub %%T2, 128
+ je %%_CALC_AAD_done
+
+ add %%T1, 128
+ jmp %%_get_AAD_loop128
+
+%%_exit_AAD_loop128:
+ or %%T2, %%T2
+ jz %%_CALC_AAD_done
+
+ ;; prep mask source address
+ lea %%T3, [rel byte64_len_to_mask_table]
+ lea %%T3, [%%T3 + %%T2*8]
+
+ ;; calculate number of blocks to ghash (including partial bytes)
+ add %%T2, 15
+ and %%T2, -16 ; 1 to 8 blocks possible here
+ shr %%T2, 4
+ cmp %%T2, 7
+ je %%_AAD_blocks_7
+ cmp %%T2, 6
+ je %%_AAD_blocks_6
+ cmp %%T2, 5
+ je %%_AAD_blocks_5
+ cmp %%T2, 4
+ je %%_AAD_blocks_4
+ cmp %%T2, 3
+ je %%_AAD_blocks_3
+ cmp %%T2, 2
+ je %%_AAD_blocks_2
+ cmp %%T2, 1
+ je %%_AAD_blocks_1
+ ;; fall through for 8 blocks
+
+ ;; The flow of each of these cases is identical:
+ ;; - load blocks plain text
+ ;; - shuffle loaded blocks
+ ;; - xor in current hash value into block 0
+ ;; - perform up multiplications with ghash keys
+ ;; - jump to reduction code
+%%_AAD_blocks_8:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 8
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 8
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_7:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 7
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 7
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_6:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 YWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb YWORD(%%ZT1), YWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 6
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 6
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_5:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 XWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb XWORD(%%ZT1), XWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 5
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 5
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_4:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb %%ZT2, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 4
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 4
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_3:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb %%ZT2, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 3
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 3
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_2:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 YWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb YWORD(%%ZT2), YWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 2
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 2
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_1:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 XWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb XWORD(%%ZT2), XWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 1
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 1
+
+%%_AAD_blocks_done:
+ ;; Multiplications have been done. Do the reduction now
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \
+ XWORD(%%ZT0), XWORD(%%ZT3)
+%%_CALC_AAD_done:
+ ;; result in AAD_HASH
+
+%endmacro ; CALC_AAD_HASH
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; PARTIAL_BLOCK
+;;; Handles encryption/decryption and the tag partial blocks between
+;;; update calls.
+;;; Requires the input data be at least 1 byte long.
+;;; Output:
+;;; A cipher/plain of the first partial block (CYPH_PLAIN_OUT),
+;;; AAD_HASH and updated GDATA_CTX
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 22
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%DATA_OFFSET %6 ; [in/out] data offset (gets updated)
+%define %%AAD_HASH %7 ; [in/out] GHASH value (text is XOR'ed into it; also stored to the context)
+%define %%ENC_DEC %8 ; [in] cipher direction
+%define %%GPTMP0 %9 ; [clobbered] GP temporary register (used as %%LENGTH)
+%define %%GPTMP1 %10 ; [clobbered] GP temporary register (used as %%IA0)
+%define %%GPTMP2 %11 ; [clobbered] GP temporary register (used as %%IA1)
+%define %%ZTMP0 %12 ; [clobbered] ZMM temporary register
+%define %%ZTMP1 %13 ; [clobbered] ZMM temporary register
+%define %%ZTMP2 %14 ; [clobbered] ZMM temporary register
+%define %%ZTMP3 %15 ; [clobbered] ZMM temporary register
+%define %%ZTMP4 %16 ; [clobbered] ZMM temporary register
+%define %%ZTMP5 %17 ; [clobbered] ZMM temporary register
+%define %%ZTMP6 %18 ; [clobbered] ZMM temporary register
+%define %%ZTMP7 %19 ; [clobbered] ZMM temporary register
+%define %%ZTMP8 %20 ; [clobbered] ZMM temporary register
+%define %%ZTMP9 %21 ; [clobbered] ZMM temporary register
+%define %%MASKREG %22 ; [clobbered] mask temporary register
+
+%define %%XTMP0 XWORD(%%ZTMP0)
+%define %%XTMP1 XWORD(%%ZTMP1)
+%define %%XTMP2 XWORD(%%ZTMP2)
+%define %%XTMP3 XWORD(%%ZTMP3)
+%define %%XTMP4 XWORD(%%ZTMP4)
+%define %%XTMP5 XWORD(%%ZTMP5)
+%define %%XTMP6 XWORD(%%ZTMP6)
+%define %%XTMP7 XWORD(%%ZTMP7)
+%define %%XTMP8 XWORD(%%ZTMP8)
+%define %%XTMP9 XWORD(%%ZTMP9)
+
+%define %%LENGTH %%GPTMP0
+%define %%IA0 %%GPTMP1
+%define %%IA1 %%GPTMP2
+
+ mov %%LENGTH, [%%GDATA_CTX + PBlockLen] ; LENGTH = PBlockLen saved in the context
+ or %%LENGTH, %%LENGTH ; sets ZF when PBlockLen is zero
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ READ_SMALL_DATA_INPUT %%XTMP0, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%IA0, %%MASKREG ; XTMP0 = input text (helper defined elsewhere; presumably a masked read of up to 16 bytes - confirm)
+
+ ;; XTMP1 = my_ctx_data.partial_block_enc_key
+ vmovdqu64 %%XTMP1, [%%GDATA_CTX + PBlockEncKey]
+ vmovdqu64 %%XTMP2, [%%GDATA_KEY + HashKey] ; XTMP2 = hash key, used by GHASH_MUL below
+
+ ;; adjust the shuffle mask pointer to be able to shift right %%LENGTH bytes
+ ;; ((16 - %%LENGTH) is the number of bytes in plaintext mod 16)
+ lea %%IA0, [rel SHIFT_MASK]
+ add %%IA0, %%LENGTH ; IA0 = SHIFT_MASK + LENGTH
+ vmovdqu64 %%XTMP3, [%%IA0] ; shift right shuffle mask
+ vpshufb %%XTMP1, %%XTMP3
+
+%ifidn %%ENC_DEC, DEC
+ ;; keep copy of cipher text in %%XTMP4
+ vmovdqa64 %%XTMP4, %%XTMP0
+%endif
+ vpxorq %%XTMP1, %%XTMP0 ; Cyphertext XOR E(K, Yn)
+
+ ;; Set %%IA1 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ ;; Determine if partial block is not being filled and shift mask accordingly
+ mov %%IA1, %%PLAIN_CYPH_LEN
+ add %%IA1, %%LENGTH
+ sub %%IA1, 16 ; IA1 = PLAIN_CYPH_LEN + LENGTH - 16 (negative => block is still not full)
+ jge %%_no_extra_mask ; signed test on purpose: IA1 can be negative
+ sub %%IA0, %%IA1 ; IA1 is negative here, so the mask pointer moves forward by the shortfall
+%%_no_extra_mask:
+ ;; get the appropriate mask to mask out bottom %%LENGTH bytes of %%XTMP1
+ ;; - mask out bottom %%LENGTH bytes of %%XTMP1
+ vmovdqu64 %%XTMP0, [%%IA0 + ALL_F - SHIFT_MASK] ; same offset as IA0 has into SHIFT_MASK, applied to the ALL_F table
+ vpand %%XTMP1, %%XTMP0
+
+%ifidn %%ENC_DEC, DEC
+ vpand %%XTMP4, %%XTMP0 ; decrypt: hash the (masked) cipher text copy
+ vpshufb %%XTMP4, [rel SHUF_MASK]
+ vpshufb %%XTMP4, %%XTMP3
+ vpxorq %%AAD_HASH, %%XTMP4
+%else
+ vpshufb %%XTMP1, [rel SHUF_MASK] ; encrypt: hash the (masked) cipher text just produced
+ vpshufb %%XTMP1, %%XTMP3
+ vpxorq %%AAD_HASH, %%XTMP1
+%endif
+ cmp %%IA1, 0 ; re-test IA1: the flags of 'sub %%IA1, 16' may have been overwritten by 'sub %%IA0, %%IA1'
+ jl %%_partial_incomplete
+
+ ;; GHASH computation for the last <16 Byte block
+ GHASH_MUL %%AAD_HASH, %%XTMP2, %%XTMP5, %%XTMP6, %%XTMP7, %%XTMP8, %%XTMP9
+
+ mov qword [%%GDATA_CTX + PBlockLen], 0 ; block is complete, nothing is pending any more
+
+ ;; Set %%LENGTH to be the number of bytes to write out (16 - previous %%LENGTH)
+ mov %%IA0, %%LENGTH
+ mov %%LENGTH, 16
+ sub %%LENGTH, %%IA0
+ jmp %%_enc_dec_done
+
+%%_partial_incomplete:
+%ifidn __OUTPUT_FORMAT__, win64
+ mov %%IA0, %%PLAIN_CYPH_LEN ; win64: go through a register (presumably PLAIN_CYPH_LEN is a memory operand there - confirm with caller)
+ add [%%GDATA_CTX + PBlockLen], %%IA0
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN ; PBlockLen += PLAIN_CYPH_LEN
+%endif
+ mov %%LENGTH, %%PLAIN_CYPH_LEN ; number of bytes to write out = whole input
+
+%%_enc_dec_done:
+ ;; output encrypted Bytes
+
+ lea %%IA0, [rel byte_len_to_mask_table]
+ kmovw %%MASKREG, [%%IA0 + %%LENGTH*2] ; 2-byte table entries indexed by byte count
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH ; save the hash value in the context
+
+%ifidn %%ENC_DEC, ENC
+ ;; shuffle XTMP1 back to output as ciphertext
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpshufb %%XTMP1, %%XTMP3
+%endif
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{%%MASKREG}, %%XTMP1 ; masked store: only bytes selected by MASKREG are written
+ add %%DATA_OFFSET, %%LENGTH ; advance the offset by the bytes written
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+%macro GHASH_SINGLE_MUL 9
+%define %%KEY_BASE %1 ; [in] base address the hash key is loaded from
+%define %%KEY_OFFSET %2 ; [in] offset of the hash key relative to %%KEY_BASE
+%define %%BLOCK %3 ; [in] vector register with the block to multiply (a1:a0)
+%define %%ACC_HI %4 ; [in/out] sum of the a1*b1 products
+%define %%ACC_LO %5 ; [in/out] sum of the a0*b0 products
+%define %%ACC_MID %6 ; [in/out] sum of the a1*b0 and a0*b1 products
+%define %%KEY %7 ; [clobbered] receives the hash key (b1:b0)
+%define %%PROD %8 ; [clobbered] scratch for a single product
+%define %%MODE %9 ; [in] "first" starts new sums, anything else adds to them
+
+        ;; Carry-less multiply of %%BLOCK by the hash key at
+        ;; [%%KEY_BASE + %%KEY_OFFSET]. The partial products are kept
+        ;; in the three sums (high, low, middle); no reduction is done here.
+        vmovdqu         %%KEY, [%%KEY_BASE + %%KEY_OFFSET]
+
+%ifnidn %%MODE, first
+        ;; accumulate: XOR each product into the existing sums
+        vpclmulqdq      %%PROD, %%BLOCK, %%KEY, 0x11       ; a1*b1
+        vpxor           %%ACC_HI, %%ACC_HI, %%PROD
+
+        vpclmulqdq      %%PROD, %%BLOCK, %%KEY, 0x00       ; a0*b0
+        vpxor           %%ACC_LO, %%ACC_LO, %%PROD
+
+        vpclmulqdq      %%PROD, %%BLOCK, %%KEY, 0x01       ; a1*b0
+        vpxor           %%ACC_MID, %%ACC_MID, %%PROD
+%else
+        ;; first block: the products initialize the sums directly
+        vpclmulqdq      %%ACC_HI, %%BLOCK, %%KEY, 0x11     ; a1*b1
+        vpclmulqdq      %%ACC_LO, %%BLOCK, %%KEY, 0x00     ; a0*b0
+        vpclmulqdq      %%ACC_MID, %%BLOCK, %%KEY, 0x01    ; a1*b0
+%endif
+
+        ;; second middle product is folded in the same way in both modes
+        vpclmulqdq      %%PROD, %%BLOCK, %%KEY, 0x10       ; a0*b1
+        vpxor           %%ACC_MID, %%ACC_MID, %%PROD
+
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to "warm-up" pipeline for GHASH_8_ENCRYPT_8_PARALLEL
+;;; macro code. It is called only for data lengths 128 and above.
+;;; The flow is as follows:
+;;; - encrypt the initial %%num_initial_blocks blocks (can be 0)
+;;; - encrypt the next 8 blocks and stitch with
+;;; GHASH for the first %%num_initial_blocks
+;;; - the last 8th block can be partial (lengths between 129 and 239)
+;;; - partial block ciphering is handled within this macro
+;;; - top bytes of such block are cleared for
+;;; the subsequent GHASH calculations
+;;; - PBlockEncKey needs to be setup in case of multi-call
+;;; - top bytes of the block need to include encrypted counter block so that
+;;; when handling partial block case text is read and XOR'ed against it.
+;;; This needs to be in un-shuffled format.
+
+%macro INITIAL_BLOCKS 26-27
+%define %%GDATA_KEY %1 ; [in] pointer to GCM keys
+%define %%GDATA_CTX %2 ; [in] pointer to GCM context
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%LENGTH %5 ; [in/out] number of bytes to process
+%define %%DATA_OFFSET %6 ; [in/out] data offset
+%define %%num_initial_blocks %7 ; [in] can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%CTR %8 ; [in/out] XMM counter block
+%define %%AAD_HASH %9 ; [in/out] ZMM with AAD hash
+%define %%ZT1 %10 ; [out] ZMM cipher blocks 0-3 for GHASH
+%define %%ZT2 %11 ; [out] ZMM cipher blocks 4-7 for GHASH
+%define %%ZT3 %12 ; [clobbered] ZMM temporary
+%define %%ZT4 %13 ; [clobbered] ZMM temporary
+%define %%ZT5 %14 ; [clobbered] ZMM temporary
+%define %%ZT6 %15 ; [clobbered] ZMM temporary
+%define %%ZT7 %16 ; [clobbered] ZMM temporary
+%define %%ZT8 %17 ; [clobbered] ZMM temporary
+%define %%ZT9 %18 ; [clobbered] ZMM temporary
+%define %%ZT10 %19 ; [clobbered] ZMM temporary
+%define %%ZT11 %20 ; [clobbered] ZMM temporary
+%define %%ZT12 %21 ; [clobbered] ZMM temporary
+%define %%IA0 %22 ; [clobbered] GP temporary
+%define %%IA1 %23 ; [clobbered] GP temporary
+%define %%ENC_DEC %24 ; [in] ENC/DEC selector
+%define %%MASKREG %25 ; [clobbered] mask register
+%define %%SHUFMASK %26 ; [in] ZMM with BE/LE shuffle mask
+%define %%PARTIAL_PRESENT %27 ; [in] "no_partial_block" option can be passed here (if length is guaranteed to be > 15*16 bytes)
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T3 XWORD(%%ZT3)
+%define %%T4 XWORD(%%ZT4)
+%define %%T5 XWORD(%%ZT5)
+%define %%T6 XWORD(%%ZT6)
+%define %%T7 XWORD(%%ZT7)
+%define %%T8 XWORD(%%ZT8)
+%define %%T9 XWORD(%%ZT9)
+
+%define %%TH %%ZT10
+%define %%TM %%ZT11
+%define %%TL %%ZT12
+
+;; determine if partial block code needs to be added (assembly-time decision)
+%assign partial_block_possible 1
+%if %0 > 26
+%ifidn %%PARTIAL_PRESENT, no_partial_block
+%assign partial_block_possible 0
+%endif
+%endif
+
+%if %%num_initial_blocks > 0
+ ;; prepare AES counter blocks
+%if %%num_initial_blocks == 1
+ vpaddd %%T3, %%CTR, [rel ONE]
+%elif %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%ZT3), YWORD(%%CTR), YWORD(%%CTR), 0
+ vpaddd YWORD(%%ZT3), YWORD(%%ZT3), [rel ddq_add_1234]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 ; copy the counter block (lane 0) into all four 128-bit lanes
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234]
+ vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678]
+%endif
+
+ ;; extract new counter value (last prepared counter block) into %%CTR
+ ;; shuffle the counters for AES rounds
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%CTR, %%ZT3, (%%num_initial_blocks - 1)
+%else
+ vextracti32x4 %%CTR, %%ZT4, (%%num_initial_blocks - 5)
+%endif
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+
+ ;; load plain/cipher text
+ ZMM_LOAD_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)] ; ZT1 = 16-byte key j replicated into every 128-bit lane
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT1, j, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%num_initial_blocks, NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; write cipher/plain text back to output
+ ;; (unmasked store macro is used here, unlike the 8 block store further down)
+ ZMM_STORE_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm
+
+ ;; Shuffle the cipher text blocks for hashing part
+ ;; ZT5 and ZT6 are expected outputs with blocks for hashing
+%ifidn %%ENC_DEC, DEC
+ ;; Decrypt case
+ ;; - cipher blocks are in ZT5 & ZT6
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in ZT3 & ZT4
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; adjust data offset and length
+ sub %%LENGTH, (%%num_initial_blocks * 16)
+ add %%DATA_OFFSET, (%%num_initial_blocks * 16)
+
+ ;; At this stage
+ ;; - ZT5:ZT6 include cipher blocks to be GHASH'ed
+
+%endif ; %%num_initial_blocks > 0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; - cipher of %%num_initial_blocks is done
+ ;; - prepare counter blocks for the next 8 blocks (ZT3 & ZT4)
+ ;; - save the last block in %%CTR
+ ;; - shuffle the blocks for AES
+ ;; - stitch encryption of the new blocks with
+ ;; GHASHING the previous blocks
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 ; copy the counter block (lane 0) into all four 128-bit lanes
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234]
+ vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678]
+ vextracti32x4 %%CTR, %%ZT4, 3 ; CTR = last of the 8 new counter blocks
+
+ vpshufb %%ZT3, %%SHUFMASK
+ vpshufb %%ZT4, %%SHUFMASK
+
+%if partial_block_possible != 0
+ ;; get text load/store mask (assume full mask by default)
+ mov %%IA0, 0xffff_ffff_ffff_ffff
+%if %%num_initial_blocks > 0
+ ;; NOTE: this check is only assembled for %%num_initial_blocks > 0.
+ ;; This macro is executed for length 128 and up,
+ ;; zero length is checked in GCM_ENC_DEC.
+ ;; We know there is partial block if:
+ ;; LENGTH - 16*num_initial_blocks < 128
+ cmp %%LENGTH, 128 ; LENGTH was already reduced by 16*num_initial_blocks above
+ jge %%_initial_partial_block_continue
+ mov %%IA1, rcx ; preserve rcx, CL is needed as the variable shift count
+ mov rcx, 128
+ sub rcx, %%LENGTH ; rcx = 128 - LENGTH = number of bytes missing from the 8 blocks
+ shr %%IA0, cl ; clear that many bits at the top of the byte mask
+ mov rcx, %%IA1 ; restore rcx
+%%_initial_partial_block_continue:
+%endif
+ kmovq %%MASKREG, %%IA0
+ ;; load plain or cipher text (masked)
+ ZMM_LOAD_MASKED_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, %%MASKREG
+%else
+ ;; load plain or cipher text
+ ZMM_LOAD_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm
+%endif ;; partial_block_possible
+
+ ;; === AES ROUND 0
+%assign aes_round 0
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+
+ ;; === GHASH blocks 4-7
+%if (%%num_initial_blocks > 0)
+ ;; XOR the current hash value into the first cipher blocks (ZT5)
+ vpxorq %%ZT5, %%ZT5, %%AAD_HASH
+
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT6, %%ZT8, %%ZT9, \
+ %%TH, %%TM, %%TL, %%num_initial_blocks
+%endif
+
+ ;; === [1/3] of AES rounds
+
+%rep ((NROUNDS + 1) / 3)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; %rep ((NROUNDS + 1) / 3)
+
+ ;; === GHASH blocks 0-3 and gather
+%if (%%num_initial_blocks > 0)
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT6, %%ZT5, \
+ %%ZT7, %%ZT8, %%ZT9, \
+ %%TH, %%TM, %%TL, %%num_initial_blocks
+%endif
+
+ ;; === [2/3] of AES rounds
+
+%rep ((NROUNDS + 1) / 3)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; %rep ((NROUNDS + 1) / 3)
+
+ ;; === GHASH reduction
+
+%if (%%num_initial_blocks > 0)
+ ;; [out] AAD_HASH - hash output
+ ;; [in] T8 - polynomial
+ ;; [in] T6 - high, T5 - low
+ ;; [clobbered] T9, T7 - temporary
+ vmovdqu64 %%T8, [rel POLY2]
+ VCLMUL_REDUCE XWORD(%%AAD_HASH), %%T8, %%T6, %%T5, %%T7, %%T9
+%endif
+
+ ;; === [3/3] of AES rounds
+
+%rep (((NROUNDS + 1) / 3) + 2)
+%if aes_round < (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif
+%endrep ; %rep (((NROUNDS + 1) / 3) + 2), rounds past NROUNDS + 1 are skipped by the %if
+
+%if partial_block_possible != 0
+ ;; write cipher/plain text back to output and
+ ;; zero bytes outside the mask before hashing
+ ZMM_STORE_MASKED_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, %%MASKREG
+ ;; check if there is partial block
+ cmp %%LENGTH, 128
+ jl %%_initial_save_partial
+ ;; adjust offset and length
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+ jmp %%_initial_blocks_done
+%%_initial_save_partial:
+ ;; partial block case
+ ;; - save the partial block in unshuffled format
+ ;; - ZT4 is partially XOR'ed with data and top bytes contain
+ ;; encrypted counter block only
+ ;; - save number of bytes processed in the partial block
+ ;; - adjust offset and zero the length
+ ;; - clear top bytes of the partial block for subsequent GHASH calculations
+ vextracti32x4 [%%GDATA_CTX + PBlockEncKey], %%ZT4, 3 ; store the last (8th) block of ZT4 in the context
+ add %%DATA_OFFSET, %%LENGTH
+ sub %%LENGTH, (128 - 16) ; LENGTH = bytes beyond the 7 preceding blocks
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ xor %%LENGTH, %%LENGTH
+ vmovdqu8 %%ZT4{%%MASKREG}{z}, %%ZT4 ; zero the bytes of ZT4 that are not selected by MASKREG
+%%_initial_blocks_done:
+%else
+ ZMM_STORE_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+%endif ;; partial_block_possible
+
+ ;; Shuffle AES result for GHASH.
+%ifidn %%ENC_DEC, DEC
+ ;; Decrypt case
+ ;; - cipher blocks are in ZT1 & ZT2
+ vpshufb %%ZT1, %%SHUFMASK
+ vpshufb %%ZT2, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in ZT3 & ZT4
+ vpshufb %%ZT1, %%ZT3, %%SHUFMASK
+ vpshufb %%ZT2, %%ZT4, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; Current hash value is in AAD_HASH
+
+ ;; Combine GHASHed value with the corresponding ciphertext
+ vpxorq %%ZT1, %%ZT1, %%AAD_HASH
+
+%endmacro ; INITIAL_BLOCKS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
+;;; It may look similar to INITIAL_BLOCKS but its usage is different:
+;;; - first encrypts/decrypts required number of blocks and then
+;;; ghashes these blocks
+;;; - Small packets or left over data chunks (<256 bytes)
+;;; - single or multi call
+;;; - Remaining data chunks below 256 bytes (multi buffer code)
+;;;
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count.
+%macro INITIAL_BLOCKS_PARTIAL 41
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] text out pointer
+%define %%PLAIN_CYPH_IN %4 ; [in] text in pointer
+%define %%LENGTH %5 ; [in/clobbered] length in bytes
+%define %%DATA_OFFSET %6 ; [in/out] current data offset (updated)
+%define %%num_initial_blocks %7 ; [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
+%define %%CTR %8 ; [in/out] current counter value
+%define %%HASH_IN_OUT %9 ; [in/out] XMM ghash in/out value
+%define %%ENC_DEC %10 ; [in] cipher direction (ENC/DEC)
+%define %%INSTANCE_TYPE %11 ; [in] multi_call or single_call
+%define %%ZT0 %12 ; [clobbered] ZMM temporary
+%define %%ZT1 %13 ; [clobbered] ZMM temporary
+%define %%ZT2 %14 ; [clobbered] ZMM temporary
+%define %%ZT3 %15 ; [clobbered] ZMM temporary
+%define %%ZT4 %16 ; [clobbered] ZMM temporary
+%define %%ZT5 %17 ; [clobbered] ZMM temporary
+%define %%ZT6 %18 ; [clobbered] ZMM temporary
+%define %%ZT7 %19 ; [clobbered] ZMM temporary
+%define %%ZT8 %20 ; [clobbered] ZMM temporary
+%define %%ZT9 %21 ; [clobbered] ZMM temporary
+%define %%ZT10 %22 ; [clobbered] ZMM temporary
+%define %%ZT11 %23 ; [clobbered] ZMM temporary
+%define %%ZT12 %24 ; [clobbered] ZMM temporary
+%define %%ZT13 %25 ; [clobbered] ZMM temporary
+%define %%ZT14 %26 ; [clobbered] ZMM temporary
+%define %%ZT15 %27 ; [clobbered] ZMM temporary
+%define %%ZT16 %28 ; [clobbered] ZMM temporary
+%define %%ZT17 %29 ; [clobbered] ZMM temporary
+%define %%ZT18 %30 ; [clobbered] ZMM temporary
+%define %%ZT19 %31 ; [clobbered] ZMM temporary
+%define %%ZT20 %32 ; [clobbered] ZMM temporary
+%define %%ZT21 %33 ; [clobbered] ZMM temporary
+%define %%ZT22 %34 ; [clobbered] ZMM temporary
+%define %%GH %35 ; [in] ZMM ghash sum (high)
+%define %%GL %36 ; [in] ZMM ghash sum (low)
+%define %%GM %37 ; [in] ZMM ghash sum (middle)
+%define %%IA0 %38 ; [clobbered] GP temporary
+%define %%IA1 %39 ; [clobbered] GP temporary
+%define %%MASKREG %40 ; [clobbered] mask register
+%define %%SHUFMASK %41 ; [in] ZMM with BE/LE shuffle mask
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T7 XWORD(%%ZT7)
+
+%define %%CTR0 %%ZT3
+%define %%CTR1 %%ZT4
+%define %%CTR2 %%ZT8
+%define %%CTR3 %%ZT9
+
+%define %%DAT0 %%ZT5
+%define %%DAT1 %%ZT6
+%define %%DAT2 %%ZT10
+%define %%DAT3 %%ZT11
+
+%ifnidn %%GH, no_zmm
+%ifnidn %%GL, no_zmm
+%ifnidn %%GM, no_zmm
+ ;; when temporary sums are passed then zero HASH IN value
+ ;; - whatever it holds it is invalid in this case
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT
+%endif
+%endif
+%endif
+ ;; Copy ghash to temp reg
+ vmovdqa64 %%T2, %%HASH_IN_OUT
+
+ ;; prepare AES counter blocks
+%if %%num_initial_blocks == 1
+ vpaddd XWORD(%%CTR0), %%CTR, [rel ONE]
+%elif %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%CTR0), YWORD(%%CTR), YWORD(%%CTR), 0
+ vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_add_1234]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 ; copy the counter block (lane 0) into all four 128-bit lanes
+ vpaddd %%CTR0, ZWORD(%%CTR), [rel ddq_add_1234]
+%if %%num_initial_blocks > 4
+ vpaddd %%CTR1, ZWORD(%%CTR), [rel ddq_add_5678]
+%endif
+%if %%num_initial_blocks > 8
+ vpaddd %%CTR2, %%CTR0, [rel ddq_add_8888] ; third group derived from the first one (CTR0)
+%endif
+%if %%num_initial_blocks > 12
+ vpaddd %%CTR3, %%CTR1, [rel ddq_add_8888] ; fourth group derived from the second one (CTR1)
+%endif
+%endif
+
+ ;; get load/store mask
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+%if %%num_initial_blocks > 12
+ sub %%IA1, 3 * 64 ; IA1 = bytes that fall into the 4th ZMM register
+%elif %%num_initial_blocks > 8
+ sub %%IA1, 2 * 64 ; IA1 = bytes that fall into the 3rd ZMM register
+%elif %%num_initial_blocks > 4
+ sub %%IA1, 64 ; IA1 = bytes that fall into the 2nd ZMM register
+%endif
+ kmovq %%MASKREG, [%%IA0 + %%IA1*8] ; 8-byte table entries indexed by byte count
+
+ ;; extract new counter value
+ ;; shuffle the counters for AES rounds
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%CTR, %%CTR0, (%%num_initial_blocks - 1)
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%CTR, %%CTR1, (%%num_initial_blocks - 5)
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%CTR, %%CTR2, (%%num_initial_blocks - 9)
+%else
+ vextracti32x4 %%CTR, %%CTR3, (%%num_initial_blocks - 13)
+%endif
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+
+ ;; load plain/cipher text
+ ZMM_LOAD_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASKREG
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)] ; ZT1 = 16-byte key j replicated into every 128-bit lane
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%ZT1, j, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%num_initial_blocks, NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; retrieve the last cipher counter block (partially XOR'ed with text)
+ ;; - this is needed for partial block cases
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%T1, %%CTR0, (%%num_initial_blocks - 1)
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%T1, %%CTR1, (%%num_initial_blocks - 5)
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%T1, %%CTR2, (%%num_initial_blocks - 9)
+%else
+ vextracti32x4 %%T1, %%CTR3, (%%num_initial_blocks - 13)
+%endif
+
+ ;; write cipher/plain text back to output (masked store)
+ ZMM_STORE_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG
+
+ ;; zero bytes outside the mask before hashing (only the last used register is affected)
+%if %%num_initial_blocks <= 4
+ vmovdqu8 %%CTR0{%%MASKREG}{z}, %%CTR0
+%elif %%num_initial_blocks <= 8
+ vmovdqu8 %%CTR1{%%MASKREG}{z}, %%CTR1
+%elif %%num_initial_blocks <= 12
+ vmovdqu8 %%CTR2{%%MASKREG}{z}, %%CTR2
+%else
+ vmovdqu8 %%CTR3{%%MASKREG}{z}, %%CTR3
+%endif
+
+ ;; Shuffle the cipher text blocks for hashing part
+ ;; DAT0-DAT3 are expected outputs with blocks for hashing
+%ifidn %%ENC_DEC, DEC
+ ;; Decrypt case
+ ;; - cipher blocks are in DAT0-DAT3
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in CTR0-CTR3
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; Extract the last block for partials and multi_call cases
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%T7, %%DAT0, %%num_initial_blocks - 1
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%T7, %%DAT1, %%num_initial_blocks - 5
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%T7, %%DAT2, %%num_initial_blocks - 9
+%else
+ vextracti32x4 %%T7, %%DAT3, %%num_initial_blocks - 13
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Hash all but the last block of data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; update data offset
+%if %%num_initial_blocks > 1
+ ;; The final block of data may be <16B
+ add %%DATA_OFFSET, 16 * (%%num_initial_blocks - 1)
+ sub %%LENGTH, 16 * (%%num_initial_blocks - 1) ; LENGTH = bytes in the final block
+%endif
+
+%if %%num_initial_blocks < 16
+ ;; NOTE: for num_initial_blocks = 16 this check is not assembled and the partial block path is always used.
+ ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256.
+ cmp %%LENGTH, 16
+ jl %%_small_initial_partial_block
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle a full length final block - encrypt and hash all blocks
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ sub %%LENGTH, 16
+ add %%DATA_OFFSET, 16
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH ; length left after the full final block
+
+ ;; Hash all of the data
+
+ ;; ZT2 - incoming AAD hash (low 128bits)
+ ;; ZT12-ZT20 - temporary registers
+ GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \
+ %%ZT17, %%ZT18, %%ZT19, %%ZT20, \
+ %%GH, %%GL, %%GM, \
+ %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%num_initial_blocks
+
+ jmp %%_small_initial_compute_done
+%endif ; %if %%num_initial_blocks < 16
+
+%%_small_initial_partial_block:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;; Handle ghash for a <16B final block
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; In this case if it's a single call to encrypt we can
+ ;; hash all of the data but if it's an init / update / finalize
+ ;; series of call we need to leave the last block if it's
+ ;; less than a full block of data.
+
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH ; remember the size of the partial block
+ ;; %%T1 is ciphered counter block
+ vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%T1
+
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign k (%%num_initial_blocks - 1)
+%assign last_block_to_hash 1
+%else
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+%endif
+
+%if (%%num_initial_blocks > last_block_to_hash)
+
+ ;; ZT12-ZT20 - temporary registers
+ GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \
+ %%ZT17, %%ZT18, %%ZT19, %%ZT20, \
+ %%GH, %%GL, %%GM, \
+ %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, k
+
+ ;; just fall through no jmp needed
+%else
+ ;; Record that a reduction is not needed -
+ ;; In this case no hashes are computed because there
+ ;; is only one initial block and it is < 16B in length.
+ ;; We only need to check if a reduction is needed if
+ ;; initial_blocks == 1 and init/update/final is being used.
+ ;; In this case we may just have a partial block, and that
+ ;; gets hashed in finalize.
+
+%assign need_for_reduction 1
+%ifidn %%GH, no_zmm
+%ifidn %%GL, no_zmm
+%ifidn %%GM, no_zmm
+;; if %%GH, %%GL & %%GM not passed then reduction is not required
+%assign need_for_reduction 0
+%endif
+%endif
+%endif
+
+%if need_for_reduction == 0
+ ;; The hash should end up in HASH_IN_OUT.
+ ;; The only way we should get here is if there is
+ ;; a partial block of data, so xor that into the hash.
+ vpxorq %%HASH_IN_OUT, %%T2, %%T7
+%else
+ ;; right - here we have nothing to ghash in the small data but
+ ;; we have GHASH sums passed through that we need to gather and reduce
+
+ ;; integrate TM into TH and TL
+ vpsrldq %%ZT12, %%GM, 8
+ vpslldq %%ZT13, %%GM, 8
+ vpxorq %%GH, %%GH, %%ZT12
+ vpxorq %%GL, %%GL, %%ZT13
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%GH, %%ZT12
+ VHPXORI4x128 %%GL, %%ZT13
+
+ ;; reduction
+ vmovdqa64 XWORD(%%ZT12), [rel POLY2]
+ VCLMUL_REDUCE %%HASH_IN_OUT, XWORD(%%ZT12), \
+ XWORD(%%GH), XWORD(%%GL), XWORD(%%ZT13), XWORD(%%ZT14)
+
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7 ; xor the partial block into the reduced hash
+%endif
+ ;; The result is in %%HASH_IN_OUT
+ jmp %%_after_reduction
+%endif ; %%num_initial_blocks > last_block_to_hash
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; After GHASH reduction
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_small_initial_compute_done:
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If using init/update/finalize, we need to xor any partial block data
+ ;; into the hash.
+%if %%num_initial_blocks > 1
+ ;; NOTE: for %%num_initial_blocks = 1 the xor is not assembled
+%if %%num_initial_blocks != 16
+ ;; NOTE: for %%num_initial_blocks = 16, %%LENGTH, stored in [PBlockLen] is never zero
+ or %%LENGTH, %%LENGTH ; sets ZF when no partial block bytes are left
+ je %%_after_reduction
+%endif ; %%num_initial_blocks != 16
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7
+%endif ; %%num_initial_blocks > 1
+%endif ; %%INSTANCE_TYPE, multi_call
+
+%%_after_reduction:
+ ;; Final hash is now in HASH_IN_OUT
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main GCM macro stitching cipher with GHASH
+;;; - operates on single stream
+;;; - encrypts 8 blocks at a time
+;;; - ghash the 8 previously encrypted ciphertext blocks
+;;; - AES rounds are interleaved with the GHASH multiply and reduction steps
+;;; - on output %%GHASHIN_AESOUT_B03 / %%GHASHIN_AESOUT_B47 hold the byte
+;;; swapped cipher text blocks to be hashed by the next ghash round
+;;; For partial block case and multi_call, AES_PARTIAL_BLOCK on output
+;;; contains encrypted counter block.
+%macro GHASH_8_ENCRYPT_8_PARALLEL 34-37
+%define %%GDATA %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%CTR1 %5 ; [in] ZMM counter blocks 0 to 3 (only read by this macro)
+%define %%CTR2 %6 ; [in] ZMM counter blocks 4 to 7 (only read by this macro)
+%define %%GHASHIN_AESOUT_B03 %7 ; [in/out] ZMM ghash in / aes out blocks 0 to 3
+%define %%GHASHIN_AESOUT_B47 %8 ; [in/out] ZMM ghash in / aes out blocks 4 to 7
+%define %%AES_PARTIAL_BLOCK %9 ; [out] XMM partial block (AES)
+%define %%loop_idx %10 ; [in] counter block prep selection: "in_order" byte swaps copies of the counters with SHFMSK, any other value uses the counters as they are
+%define %%ENC_DEC %11 ; [in] cipher direction
+%define %%FULL_PARTIAL %12 ; [in] last block type selection "full" or "partial"
+%define %%IA0 %13 ; [clobbered] temporary GP register
+%define %%IA1 %14 ; [clobbered] temporary GP register
+%define %%LENGTH %15 ; [in] length
+%define %%INSTANCE_TYPE %16 ; [in] 'single_call' or 'multi_call' selection
+%define %%GH4KEY %17 ; [in] ZMM with GHASH keys 4 to 1
+%define %%GH8KEY %18 ; [in] ZMM with GHASH keys 8 to 5
+%define %%SHFMSK %19 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT1 %20 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT2 %21 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT3 %22 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT4 %23 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT5 %24 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT10 %25 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT11 %26 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT12 %27 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT13 %28 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT14 %29 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT15 %30 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT16 %31 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT17 %32 ; [clobbered] temporary ZMM (ghash)
+%define %%MASKREG %33 ; [clobbered] mask register for partial loads/stores
+%define %%DO_REDUCTION %34 ; [in] "do_reduction", "no_reduction", "final_reduction"
+%define %%TO_REDUCE_L %35 ; [in/out] ZMM for low 4x128-bit sum: accumulated for "no_reduction", consumed for "final_reduction" and then overwritten with the reduced GHASH value
+%define %%TO_REDUCE_H %36 ; [in/out] ZMM for hi 4x128-bit sum: accumulated for "no_reduction", read for "final_reduction"
+%define %%TO_REDUCE_M %37 ; [in/out] ZMM for medium 4x128-bit sum: accumulated for "no_reduction", read for "final_reduction"
+
+;; products of %%GHASHIN_AESOUT_B47 with %%GH4KEY (aliases of ghash temporaries)
+%define %%GH1H %%ZT10
+%define %%GH1L %%ZT11
+%define %%GH1M1 %%ZT12
+%define %%GH1M2 %%ZT13
+
+;; products of %%GHASHIN_AESOUT_B03 with %%GH8KEY (aliases of ghash temporaries)
+%define %%GH2H %%ZT14
+%define %%GH2L %%ZT15
+%define %%GH2M1 %%ZT16
+%define %%GH2M2 %%ZT17
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; populate counter blocks for cipher part
+%ifidn %%loop_idx, in_order
+ ;; %%CTR1 & %%CTR2 are shuffled outside the scope of this macro
+ ;; it has to be kept in unshuffled format
+ ;; (the byte swap is done into %%ZT1 / %%ZT2, the counters stay untouched)
+ vpshufb %%ZT1, %%CTR1, %%SHFMSK
+ vpshufb %%ZT2, %%CTR2, %%SHFMSK
+%else
+ vmovdqa64 %%ZT1, %%CTR1
+ vmovdqa64 %%ZT2, %%CTR2
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; stitch AES rounds with GHASH
+
+ ;; aes_round is an assembly time index of the next round key to apply
+%assign aes_round 0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 0 - ARK
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+
+ ;;==================================================
+ ;; GHASH 4 blocks (blocks 4 to 7 against GHASH keys 4 to 1)
+ vpclmulqdq %%GH1H, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x11 ; a1*b1
+ vpclmulqdq %%GH1L, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x00 ; a0*b0
+ vpclmulqdq %%GH1M1, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x01 ; a1*b0
+ vpclmulqdq %%GH1M2, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x10 ; a0*b1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 3 AES rounds
+%rep 3
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 3 x AES ROUND
+
+ ;; =================================================
+ ;; GHASH 4 blocks (blocks 0 to 3 against GHASH keys 8 to 5)
+ vpclmulqdq %%GH2M1, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x10 ; a0*b1
+ vpclmulqdq %%GH2M2, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 3 AES rounds
+%rep 3
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 3 x AES ROUND
+
+ ;; =================================================
+ ;; gather GHASH in GH1L (low) and GH1H (high)
+ ;; - "no_reduction": only accumulate the products into TO_REDUCE_x
+ ;; - "do_reduction" / "final_reduction": sum the mid products and split
+ ;; them into a part shifted right by 8 bytes (GH2M1) and a part
+ ;; shifted left by 8 bytes (GH1M1)
+%ifidn %%DO_REDUCTION, no_reduction
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpternlogq %%TO_REDUCE_M, %%GH1M1, %%GH2M2, 0x96 ; TM: TO_REDUCE_M ^= GH1M1 ^ GH2M2
+ vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH: TO_REDUCE_H ^= GH1H ^ GH2H
+ vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL: TO_REDUCE_L ^= GH1L ^ GH2L
+%endif
+%ifidn %%DO_REDUCTION, do_reduction
+ ;; phase 1: add mid products together
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpxorq %%GH1M1, %%GH1M1, %%GH2M2
+
+ vpsrldq %%GH2M1, %%GH1M1, 8
+ vpslldq %%GH1M1, %%GH1M1, 8
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; phase 1: add mid products together
+ ;; (including the mid sum accumulated earlier in TO_REDUCE_M)
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpternlogq %%GH1M1, %%TO_REDUCE_M, %%GH2M2, 0x96 ; TM: GH1M1 ^= TO_REDUCE_M ^ GH2M2
+
+ vpsrldq %%GH2M1, %%GH1M1, 8
+ vpslldq %%GH1M1, %%GH1M1, 8
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 2 x AES ROUND
+
+ ;; =================================================
+ ;; Add mid product to high and low then
+ ;; horizontal xor of low and high 4x128
+%ifidn %%DO_REDUCTION, final_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpxorq %%GH1H, %%TO_REDUCE_H
+ vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64
+ vpxorq %%GH1L, %%TO_REDUCE_L
+%endif
+%ifidn %%DO_REDUCTION, do_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64
+%endif
+%ifnidn %%DO_REDUCTION, no_reduction
+ ;; GH2H / GH2L are passed as the second argument of the horizontal xor helper
+ VHPXORI4x128 %%GH1H, %%GH2H
+ VHPXORI4x128 %%GH1L, %%GH2L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+ ;; (from here on a round is emitted only while aes_round <= NROUNDS,
+ ;; the final round is handled separately further down)
+%rep 2
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;; =================================================
+ ;; first phase of reduction
+ ;; - GH2M2 (ZT17) is reused to hold the POLY2 constant
+ ;; - ZT15 (alias of GH2L) is reused as the first phase result
+%ifnidn %%DO_REDUCTION, no_reduction
+ vmovdqu64 XWORD(%%GH2M2), [rel POLY2]
+ vpclmulqdq XWORD(%%ZT15), XWORD(%%GH2M2), XWORD(%%GH1L), 0x01
+ vpslldq XWORD(%%ZT15), XWORD(%%ZT15), 8 ; shift-L 2 DWs
+ vpxorq XWORD(%%ZT15), XWORD(%%GH1L), XWORD(%%ZT15) ; first phase of the reduct
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;; =================================================
+ ;; second phase of the reduction
+ ;; - ZT16 (alias of GH2M1) and ZT13 (alias of GH1M2) are reused as temporaries
+ ;; - the reduced GHASH value ends up in the low 128 bits of ZT13
+%ifnidn %%DO_REDUCTION, no_reduction
+ vpclmulqdq XWORD(%%ZT16), XWORD(%%GH2M2), XWORD(%%ZT15), 0x00
+ vpsrldq XWORD(%%ZT16), XWORD(%%ZT16), 4 ; shift-R 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq XWORD(%%ZT13), XWORD(%%GH2M2), XWORD(%%ZT15), 0x10
+ vpslldq XWORD(%%ZT13), XWORD(%%ZT13), 4 ; shift-L 1-DW for result without shifts
+ ;; ZT13 = ZT13 xor ZT16 xor GH1H
+ vpternlogq XWORD(%%ZT13), XWORD(%%ZT16), XWORD(%%GH1H), 0x96
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; all remaining AES rounds but the last
+ ;; (the %rep count is only an upper bound, the %if guard stops emitting
+ ;; rounds once aes_round reaches NROUNDS + 1)
+%rep (NROUNDS + 2)
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; load/store mask (partial case) and load the text data
+%ifidn %%FULL_PARTIAL, full
+ VX512LDR %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ VX512LDR %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+%else
+ ;; the first 64 bytes are always complete, the mask covers the
+ ;; remaining (%%LENGTH - 64) bytes of the second 64 byte chunk
+ ;; NOTE(review): presumably entry n of byte64_len_to_mask_table is a
+ ;; 64-bit mask with the n lowest bits set - confirm against the table
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+ sub %%IA1, 64
+ kmovq %%MASKREG, [%%IA0 + 8*%%IA1]
+ VX512LDR %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT5{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; the last AES round (NROUNDS + 1) and XOR against plain/cipher text
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; store the cipher/plain text data
+ ;; (in the partial case the second 64 byte chunk is stored under the mask)
+%ifidn %%FULL_PARTIAL, full
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2
+%else
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64]{%%MASKREG}, %%ZT2
+%endif
+
+ ;; =================================================
+ ;; prep cipher text blocks for the next ghash round
+
+%ifnidn %%FULL_PARTIAL, full
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; for partial block & multi_call we need encrypted counter block
+ ;; NOTE(review): relies on the last round above having XORed %%ZT5 into
+ ;; %%ZT2, so that XORing %%ZT5 again gives back the encrypted counters.
+ ;; The block returned is 128-bit lane 3 of that result.
+ vpxorq %%ZT3, %%ZT2, %%ZT5
+ vextracti32x4 %%AES_PARTIAL_BLOCK, %%ZT3, 3
+%endif
+ ;; for GHASH computation purpose clear the top bytes of the partial block
+ ;; (the register holding cipher text is %%ZT2 for ENC and %%ZT5 otherwise)
+%ifidn %%ENC_DEC, ENC
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, %%ZT2
+%else
+ vmovdqu8 %%ZT5{%%MASKREG}{z}, %%ZT5
+%endif
+%endif ; %ifnidn %%FULL_PARTIAL, full
+
+ ;; =================================================
+ ;; shuffle cipher text blocks for GHASH computation
+ ;; (ENC hashes the cipher output, otherwise the loaded input text is hashed)
+%ifidn %%ENC_DEC, ENC
+ vpshufb %%GHASHIN_AESOUT_B03, %%ZT1, %%SHFMSK
+ vpshufb %%GHASHIN_AESOUT_B47, %%ZT2, %%SHFMSK
+%else
+ vpshufb %%GHASHIN_AESOUT_B03, %%ZT4, %%SHFMSK
+ vpshufb %%GHASHIN_AESOUT_B47, %%ZT5, %%SHFMSK
+%endif
+
+%ifidn %%DO_REDUCTION, do_reduction
+ ;; =================================================
+ ;; XOR current GHASH value (ZT13) into block 0
+ vpxorq %%GHASHIN_AESOUT_B03, %%ZT13
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; =================================================
+ ;; Return GHASH value (ZT13) in TO_REDUCE_L
+ vmovdqa64 %%TO_REDUCE_L, %%ZT13
+%endif
+
+%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main GCM macro stitching cipher with GHASH
+;;; - operates on single stream
+;;; - encrypts 16 blocks at a time
+;;; - ghash the 16 previously encrypted ciphertext blocks
+;;; (read from the stack frame at rsp + %%GHASHIN_BLK_OFFSET)
+;;; - stores the byte swapped cipher text of this call on the stack frame
+;;; at rsp + %%AESOUT_BLK_OFFSET for a later ghash
+;;; - no partial block or multi_call handling here
+%macro GHASH_16_ENCRYPT_16_PARALLEL 42
+%define %%GDATA %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%CTR_BE %5 ; [in/out] ZMM counter blocks (last 4) in big-endian
+%define %%CTR_CHECK %6 ; [in/out] GP with 8-bit counter for overflow check
+%define %%HASHKEY_OFFSET %7 ; [in] numerical offset for the highest hash key
+%define %%AESOUT_BLK_OFFSET %8 ; [in] numerical offset for AES-CTR out
+%define %%GHASHIN_BLK_OFFSET %9 ; [in] numerical offset for GHASH blocks in
+%define %%SHFMSK %10 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT1 %11 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT2 %12 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT3 %13 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT4 %14 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT5 %15 ; [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
+%define %%ZT6 %16 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT7 %17 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT8 %18 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT9 %19 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT10 %20 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT11 %21 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT12 %22 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT13 %23 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT14 %24 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT15 %25 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT16 %26 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT17 %27 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT18 %28 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT19 %29 ; [clobbered] temporary ZMM
+%define %%ZT20 %30 ; [clobbered] temporary ZMM
+%define %%ZT21 %31 ; [clobbered] temporary ZMM
+%define %%ZT22 %32 ; [clobbered] temporary ZMM
+%define %%ZT23 %33 ; [clobbered] temporary ZMM
+%define %%ADDBE_4x4 %34 ; [in] ZMM with 4x128bits 4 in big-endian
+%define %%ADDBE_1234 %35 ; [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
+%define %%TO_REDUCE_L %36 ; [in/out] ZMM for low 4x128-bit GHASH sum
+%define %%TO_REDUCE_H %37 ; [in/out] ZMM for hi 4x128-bit GHASH sum
+%define %%TO_REDUCE_M %38 ; [in/out] ZMM for medium 4x128-bit GHASH sum
+%define %%DO_REDUCTION %39 ; [in] "no_reduction" (accumulate into TO_REDUCE_x), "final_reduction" (fold TO_REDUCE_x in and reduce), "first_time" (initialize TO_REDUCE_x)
+%define %%ENC_DEC %40 ; [in] cipher direction
+%define %%DATA_DISPL %41 ; [in] fixed numerical data displacement/offset
+%define %%GHASH_IN %42 ; [in] current GHASH value or "no_ghash_in"
+
+;; the 16 counter / cipher blocks
+%define %%B00_03 %%ZT1
+%define %%B04_07 %%ZT2
+%define %%B08_11 %%ZT3
+%define %%B12_15 %%ZT4
+
+;; GHASH products: H = high, L = low, M and T = the two mid products
+%define %%GH1H %%ZT5 ; @note: do not change this mapping
+%define %%GH1L %%ZT6
+%define %%GH1M %%ZT7
+%define %%GH1T %%ZT8
+
+%define %%GH2H %%ZT9
+%define %%GH2L %%ZT10
+%define %%GH2M %%ZT11
+%define %%GH2T %%ZT12
+
+;; reduction temporaries recycle the GH2x registers
+%define %%RED_POLY %%GH2T
+%define %%RED_P1 %%GH2L
+%define %%RED_T1 %%GH2H
+%define %%RED_T2 %%GH2M
+
+%define %%GH3H %%ZT13
+%define %%GH3L %%ZT14
+%define %%GH3M %%ZT15
+%define %%GH3T %%ZT16
+
+;; text data shares registers with GH3x, it is loaded only after the GH3x
+;; products have been gathered into GH1x
+%define %%DATA1 %%ZT13
+%define %%DATA2 %%ZT14
+%define %%DATA3 %%ZT15
+%define %%DATA4 %%ZT16
+
+;; two AES round key registers used alternately, so that the key of a later
+;; round is loaded while the current round is applied
+%define %%AESKEY1 %%ZT17
+%define %%AESKEY2 %%ZT18
+
+;; two sets of GHASH key / data registers, also used alternately
+%define %%GHKEY1 %%ZT19
+%define %%GHKEY2 %%ZT20
+%define %%GHDAT1 %%ZT21
+%define %%GHDAT2 %%ZT22
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; prepare counter blocks
+ ;; - fast path adds the big-endian constants directly, which is taken
+ ;; only when the 8-bit check counter is below 256 - 16
+ ;; - overflow path byte swaps the counter, adds ddq_add_1234 / ddq_add_4444
+ ;; and byte swaps the results back
+
+ cmp BYTE(%%CTR_CHECK), (256 - 16)
+ jae %%_16_blocks_overflow
+ vpaddd %%B00_03, %%CTR_BE, %%ADDBE_1234
+ vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4
+ vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4
+ vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4
+ jmp %%_16_blocks_ok
+%%_16_blocks_overflow:
+ vpshufb %%CTR_BE, %%CTR_BE, %%SHFMSK
+ vmovdqa64 %%B12_15, [rel ddq_add_4444]
+ vpaddd %%B00_03, %%CTR_BE, [rel ddq_add_1234]
+ vpaddd %%B04_07, %%B00_03, %%B12_15
+ vpaddd %%B08_11, %%B04_07, %%B12_15
+ vpaddd %%B12_15, %%B08_11, %%B12_15
+ vpshufb %%B00_03, %%SHFMSK
+ vpshufb %%B04_07, %%SHFMSK
+ vpshufb %%B08_11, %%SHFMSK
+ vpshufb %%B12_15, %%SHFMSK
+%%_16_blocks_ok:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; pre-load constants
+ ;; (the current GHASH value, if given, is XORed into the first 4 GHASH
+ ;; input blocks while they are loaded)
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 0)]
+%ifnidn %%GHASH_IN, no_ghash_in
+ vpxorq %%GHDAT1, %%GHASH_IN, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)]
+%else
+ vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)]
+%endif
+ vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (0*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; save counter for the next round
+ ;; increment counter overflow check register
+ ;; (128-bit lane 3 of %%B12_15, i.e. the last counter block, is copied to
+ ;; all 4 lanes; this has to happen before round 0 modifies %%B12_15)
+ vshufi64x2 %%CTR_BE, %%B12_15, %%B12_15, 1111_1111b
+ add BYTE(%%CTR_CHECK), 16
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; pre-load constants
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 1)]
+ vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (1*64)]
+ vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (1*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; stitch AES rounds with GHASH
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 0 - ARK
+
+ vpxorq %%B00_03, %%AESKEY1
+ vpxorq %%B04_07, %%AESKEY1
+ vpxorq %%B08_11, %%AESKEY1
+ vpxorq %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 2)]
+
+ ;;==================================================
+ ;; GHASH 4 blocks (15 to 12)
+ ;; (data and hash keys loaded from their offsets + 0*64)
+ vpclmulqdq %%GH1H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1
+ vpclmulqdq %%GH1L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0
+ vpclmulqdq %%GH1M, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0
+ vpclmulqdq %%GH1T, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1
+
+ vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (2*64)]
+ vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (2*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 1
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 3)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (11 to 8)
+ ;; (data and hash keys loaded from their offsets + 1*64)
+ vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1
+ vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0
+
+ vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (3*64)]
+ vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (3*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 2
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 4)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (7 to 4)
+ ;; (data and hash keys loaded from their offsets + 2*64)
+ vpclmulqdq %%GH3M, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1
+ vpclmulqdq %%GH3T, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0
+ vpclmulqdq %%GH3H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1
+ vpclmulqdq %%GH3L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 3
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 5)]
+
+ ;; =================================================
+ ;; Gather (XOR) GHASH for 12 blocks
+ ;; (GH2x and GH3x are free for reuse after this point)
+ vpternlogq %%GH1H, %%GH2H, %%GH3H, 0x96
+ vpternlogq %%GH1L, %%GH2L, %%GH3L, 0x96
+ vpternlogq %%GH1T, %%GH2T, %%GH3T, 0x96
+ vpternlogq %%GH1M, %%GH2M, %%GH3M, 0x96
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 4
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 6)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; load plain/cipher text (recycle GH3xx registers)
+ VX512LDR %%DATA1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)]
+ VX512LDR %%DATA2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)]
+ VX512LDR %%DATA3, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)]
+ VX512LDR %%DATA4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 5
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 7)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (3 to 0)
+ ;; (data and hash keys loaded from their offsets + 3*64)
+ vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1
+ vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 6
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 8)]
+
+ ;; =================================================
+ ;; gather GHASH in GH1L (low) and GH1H (high)
+ ;; - "first_time": TO_REDUCE_x are written without reading their old value
+ ;; - "no_reduction": the new products are XORed into TO_REDUCE_x
+ ;; - "final_reduction": TO_REDUCE_M is folded in and the mid sum is split
+ ;; into a part shifted right by 8 bytes (GH2M) and a part shifted left
+ ;; by 8 bytes (GH1M)
+%ifidn %%DO_REDUCTION, first_time
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpxorq %%TO_REDUCE_M, %%GH1M, %%GH2M ; TM
+ vpxorq %%TO_REDUCE_H, %%GH1H, %%GH2H ; TH
+ vpxorq %%TO_REDUCE_L, %%GH1L, %%GH2L ; TL
+%endif
+%ifidn %%DO_REDUCTION, no_reduction
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpternlogq %%TO_REDUCE_M, %%GH1M, %%GH2M, 0x96 ; TM
+ vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH
+ vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; phase 1: add mid products together
+ ;; also load polynomial constant for reduction
+ ;; (RED_POLY aliases GH2T, which was consumed by the first instruction)
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpternlogq %%GH1M, %%TO_REDUCE_M, %%GH2M, 0x96
+
+ vpsrldq %%GH2M, %%GH1M, 8
+ vpslldq %%GH1M, %%GH1M, 8
+
+ vmovdqa64 XWORD(%%RED_POLY), [rel POLY2]
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 7
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 9)]
+
+ ;; =================================================
+ ;; Add mid product to high and low
+ ;; (together with the sums accumulated in TO_REDUCE_H / TO_REDUCE_L)
+%ifidn %%DO_REDUCTION, final_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpxorq %%GH1H, %%TO_REDUCE_H
+ vpternlogq %%GH1L, %%GH2L, %%GH1M, 0x96 ; TL = TL1 + TL2 + TM<<64
+ vpxorq %%GH1L, %%TO_REDUCE_L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 8
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 10)]
+
+ ;; =================================================
+ ;; horizontal xor of low and high 4x128
+%ifidn %%DO_REDUCTION, final_reduction
+ VHPXORI4x128 %%GH1H, %%GH2H
+ VHPXORI4x128 %%GH1L, %%GH2L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 9
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%if (NROUNDS >= 11)
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 11)]
+%endif
+ ;; =================================================
+ ;; first phase of reduction
+%ifidn %%DO_REDUCTION, final_reduction
+ vpclmulqdq XWORD(%%RED_P1), XWORD(%%RED_POLY), XWORD(%%GH1L), 0x01
+ vpslldq XWORD(%%RED_P1), XWORD(%%RED_P1), 8 ; shift-L 2 DWs
+ vpxorq XWORD(%%RED_P1), XWORD(%%GH1L), XWORD(%%RED_P1) ; first phase of the reduct
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES rounds up to 11 (AES192) or 13 (AES256)
+ ;; AES128 is done
+ ;; (in every case the key for the last round ends up in %%AESKEY1)
+%if (NROUNDS >= 11)
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 12)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%if (NROUNDS == 13)
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 13)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 14)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%endif ; GCM256 / NROUNDS = 13 (15 including the first and the last)
+%endif ; GCM192 / NROUNDS = 11 (13 including the first and the last)
+
+ ;; =================================================
+ ;; second phase of the reduction
+%ifidn %%DO_REDUCTION, final_reduction
+ vpclmulqdq XWORD(%%RED_T1), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x00
+ vpsrldq XWORD(%%RED_T1), XWORD(%%RED_T1), 4 ; shift-R 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq XWORD(%%RED_T2), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x10
+ vpslldq XWORD(%%RED_T2), XWORD(%%RED_T2), 4 ; shift-L 1-DW for result without shifts
+ ;; GH1H = GH1H xor RED_T1 xor RED_T2
+ vpternlogq XWORD(%%GH1H), XWORD(%%RED_T2), XWORD(%%RED_T1), 0x96
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; the last AES round
+ vaesenclast %%B00_03, %%B00_03, %%AESKEY1
+ vaesenclast %%B04_07, %%B04_07, %%AESKEY1
+ vaesenclast %%B08_11, %%B08_11, %%AESKEY1
+ vaesenclast %%B12_15, %%B12_15, %%AESKEY1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; XOR against plain/cipher text
+ vpxorq %%B00_03, %%B00_03, %%DATA1
+ vpxorq %%B04_07, %%B04_07, %%DATA2
+ vpxorq %%B08_11, %%B08_11, %%DATA3
+ vpxorq %%B12_15, %%B12_15, %%DATA4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; store cipher/plain text
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)], %%B00_03
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)], %%B04_07
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)], %%B08_11
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)], %%B12_15
+
+ ;; =================================================
+ ;; shuffle cipher text blocks for GHASH computation
+ ;; (ENC hashes the cipher output, otherwise the loaded input text is hashed)
+%ifidn %%ENC_DEC, ENC
+ vpshufb %%B00_03, %%B00_03, %%SHFMSK
+ vpshufb %%B04_07, %%B04_07, %%SHFMSK
+ vpshufb %%B08_11, %%B08_11, %%SHFMSK
+ vpshufb %%B12_15, %%B12_15, %%SHFMSK
+%else
+ vpshufb %%B00_03, %%DATA1, %%SHFMSK
+ vpshufb %%B04_07, %%DATA2, %%SHFMSK
+ vpshufb %%B08_11, %%DATA3, %%SHFMSK
+ vpshufb %%B12_15, %%DATA4, %%SHFMSK
+%endif
+
+ ;; =================================================
+ ;; store shuffled cipher text for ghashing
+ ;; (aligned stores, the stack area has to be 64 byte aligned)
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (0*64)], %%B00_03
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (1*64)], %%B04_07
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (2*64)], %%B08_11
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (3*64)], %%B12_15
+
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; =================================================
+ ;; Return GHASH value through %%GH1H
+ ;; (no instruction needed: the second reduction phase already left the
+ ;; result in %%GH1H, which is the caller provided %%ZT5)
+%endif
+
+%endmacro ; GHASH_16_ENCRYPT_16_PARALLEL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH the last 8 ciphertext blocks.
+;;; - runs the two carry-less multiply steps followed by the POLY2 reduction
+;;; - when arguments 11 to 13 are given, these GHASH product sums are
+;;; XORed into the running sums between the two multiply steps
+%macro GHASH_LAST_8 10-13
+%define %%KEY_PTR %1 ; [in] key pointer
+%define %%BLK_47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7
+%define %%BLK_03 %3 ; [in/clobbered] ZMM AES blocks 0 to 3
+%define %%SUM_H %4 ; [clobbered] ZMM temporary (high product sum)
+%define %%SUM_M %5 ; [clobbered] ZMM temporary (mid product sum)
+%define %%SUM_L %6 ; [clobbered] ZMM temporary (low product sum)
+%define %%TMP1 %7 ; [clobbered] ZMM temporary
+%define %%TMP2 %8 ; [clobbered] ZMM temporary
+%define %%TMP3 %9 ; [clobbered] ZMM temporary, also holds POLY2 for the reduction
+%define %%HASH_OUT %10 ; [out] XMM hash value
+%define %%IN_H %11 ; [in/optional] ZMM with GHASH high product sum
+%define %%IN_L %12 ; [in/optional] ZMM with GHASH low product sum
+%define %%IN_M %13 ; [in/optional] ZMM with GHASH mid product sum
+
+ ;; step 1: multiply blocks 4 to 7
+ VCLMUL_STEP1 %%KEY_PTR, %%BLK_47, %%TMP1, %%SUM_H, %%SUM_M, %%SUM_L
+
+%if %0 >= 11
+ ;; fold the caller provided sums in before step 2
+ vpxorq %%SUM_M, %%IN_M
+ vpxorq %%SUM_L, %%IN_L
+ vpxorq %%SUM_H, %%IN_H
+%endif
+
+ ;; step 2: multiply blocks 0 to 3 and combine with the step 1 sums
+ VCLMUL_STEP2 %%KEY_PTR, %%BLK_47, %%BLK_03, %%TMP1, %%TMP2, %%TMP3, %%SUM_H, %%SUM_M, %%SUM_L
+
+ ;; reduction, POLY2 is loaded only now because TMP3 is a scratch of step 2
+ vmovdqa64 XWORD(%%TMP3), [rel POLY2]
+ VCLMUL_REDUCE %%HASH_OUT, XWORD(%%TMP3), XWORD(%%BLK_47), XWORD(%%BLK_03), \
+ XWORD(%%TMP1), XWORD(%%TMP2)
+%endmacro ; GHASH_LAST_8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH the last 7 cipher text blocks.
+;;; - same flow as GHASH_LAST_8, built on the "1 to 8" variants of the
+;;; multiply macros with a block count of 7
+;;; - GHASH keys are loaded per data block, so that:
+;;; - blocks 4, 5 and 6 use GHASH keys 3, 2, 1 respectively
+;;; - the unused block 7 and its GHASH key are zeroed
+;;; (the clmul product is zero then and does not affect the result)
+;;; - blocks 0, 1, 2 and 3 use GHASH keys 7, 6, 5 and 4 respectively
+;;; - optionally accepts GHASH product sums as input (arguments 14 to 16)
+;;; - arguments 12 and 13 are part of the interface but not referenced
+;;; in the body of this macro
+%macro GHASH_LAST_7 13-16
+%define %%KEY_PTR %1 ; [in] key pointer
+%define %%BLK_47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7
+%define %%BLK_03 %3 ; [in/clobbered] ZMM AES blocks 0 to 3
+%define %%SUM_H %4 ; [clobbered] ZMM temporary (high product sum)
+%define %%SUM_M %5 ; [clobbered] ZMM temporary (mid product sum)
+%define %%SUM_L %6 ; [clobbered] ZMM temporary (low product sum)
+%define %%TMP1 %7 ; [clobbered] ZMM temporary
+%define %%TMP2 %8 ; [clobbered] ZMM temporary
+%define %%TMP3 %9 ; [clobbered] ZMM temporary
+%define %%POLY %10 ; [clobbered] ZMM temporary, holds POLY2 for the reduction
+%define %%HASH_OUT %11 ; [out] XMM hash value
+%define %%KMASK %12 ; [clobbered] mask register to use for loads
+%define %%GP_TMP %13 ; [clobbered] GP temporary register
+%define %%IN_H %14 ; [in/optional] ZMM with GHASH high product sum
+%define %%IN_L %15 ; [in/optional] ZMM with GHASH low product sum
+%define %%IN_M %16 ; [in/optional] ZMM with GHASH mid product sum
+
+ ;; reduction constant is fetched up front into its own register
+ vmovdqa64 XWORD(%%POLY), [rel POLY2]
+
+ ;; step 1: multiply blocks 4 to 6
+ VCLMUL_1_TO_8_STEP1 %%KEY_PTR, %%BLK_47, %%TMP1, %%TMP2, %%SUM_H, %%SUM_M, %%SUM_L, 7
+
+%if %0 >= 14
+ ;; fold the caller provided sums in before step 2
+ vpxorq %%SUM_M, %%IN_M
+ vpxorq %%SUM_L, %%IN_L
+ vpxorq %%SUM_H, %%IN_H
+%endif
+
+ ;; step 2: multiply blocks 0 to 3 and combine with the step 1 sums
+ VCLMUL_1_TO_8_STEP2 %%KEY_PTR, %%BLK_47, %%BLK_03, \
+ %%TMP1, %%TMP2, %%TMP3, \
+ %%SUM_H, %%SUM_M, %%SUM_L, 7
+
+ VCLMUL_REDUCE %%HASH_OUT, XWORD(%%POLY), XWORD(%%BLK_47), XWORD(%%BLK_03), \
+ XWORD(%%TMP1), XWORD(%%TMP2)
+%endmacro ; GHASH_LAST_7
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Encryption of a single block
+;;; - round keys are read as consecutive 16 byte entries starting at the
+;;; key pointer: entry 0 is XORed in, entries 1 to NROUNDS go through
+;;; vaesenc and entry NROUNDS + 1 through vaesenclast
+;;; - the assembly time variable "i" is left at NROUNDS + 1
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%KEY_PTR %1 ; [in] key pointer
+%define %%BLOCK %2 ; [in/out] XMM block, encrypted in place
+
+ ;; round 0 - add round key
+ vpxorq %%BLOCK, %%BLOCK, [%%KEY_PTR + (16 * 0)]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%BLOCK, %%BLOCK, [%%KEY_PTR + (16 * i)]
+%assign i (i+1)
+%endrep
+ ;; final round
+ vaesenclast %%BLOCK, %%BLOCK, [%%KEY_PTR + (16 * i)]
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Save register content for the caller
+;;; - allocates STACK_FRAME_SIZE bytes and aligns rsp down to 64 bytes
+;;; - stores r12-r15, rbp, rbx and the original rsp in the GP save area
+;;; (plus rdi, rsi and xmm6-xmm15 for win64)
+;;; - on exit rax and r14 both hold the original stack pointer
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ;; carve out the frame, rax remembers where the stack was
+ mov rax, rsp
+ sub rsp, STACK_FRAME_SIZE
+ and rsp, ~63
+
+ ;; original stack pointer goes into slot 4
+ mov [rsp + STACK_GP_OFFSET + (4 * 8)], rax
+
+ ;; callee saved GP registers
+ mov [rsp + STACK_GP_OFFSET + (0 * 8)], r12
+ mov [rsp + STACK_GP_OFFSET + (1 * 8)], r13
+ mov [rsp + STACK_GP_OFFSET + (2 * 8)], r14
+ mov [rsp + STACK_GP_OFFSET + (3 * 8)], r15
+ mov [rsp + STACK_GP_OFFSET + (5 * 8)], rbp
+ mov [rsp + STACK_GP_OFFSET + (6 * 8)], rbx
+
+ ;; r14 is saved by now, it is used to retrieve stack args
+ mov r14, rax
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ;; rdi, rsi and xmm6:xmm15 need to be maintained for Windows
+ mov [rsp + STACK_GP_OFFSET + (7 * 8)], rdi
+ mov [rsp + STACK_GP_OFFSET + (8 * 8)], rsi
+
+ vmovdqu [rsp + STACK_XMM_OFFSET + (0 * 16)], xmm6
+ vmovdqu [rsp + STACK_XMM_OFFSET + (1 * 16)], xmm7
+ vmovdqu [rsp + STACK_XMM_OFFSET + (2 * 16)], xmm8
+ vmovdqu [rsp + STACK_XMM_OFFSET + (3 * 16)], xmm9
+ vmovdqu [rsp + STACK_XMM_OFFSET + (4 * 16)], xmm10
+ vmovdqu [rsp + STACK_XMM_OFFSET + (5 * 16)], xmm11
+ vmovdqu [rsp + STACK_XMM_OFFSET + (6 * 16)], xmm12
+ vmovdqu [rsp + STACK_XMM_OFFSET + (7 * 16)], xmm13
+ vmovdqu [rsp + STACK_XMM_OFFSET + (8 * 16)], xmm14
+ vmovdqu [rsp + STACK_XMM_OFFSET + (9 * 16)], xmm15
+%endif
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Restore register content for the caller
+%macro FUNC_RESTORE 0
+
+ ;; Scrub scratch registers (SAFE_DATA builds) or just reset the upper
+ ;; vector state before any saved register is reloaded.
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%else
+ vzeroupper
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ;; Windows only: reload xmm6:xmm15 and rdi/rsi from the frame
+ vmovdqu xmm6, [rsp + STACK_XMM_OFFSET + 0*16]
+ vmovdqu xmm7, [rsp + STACK_XMM_OFFSET + 1*16]
+ vmovdqu xmm8, [rsp + STACK_XMM_OFFSET + 2*16]
+ vmovdqu xmm9, [rsp + STACK_XMM_OFFSET + 3*16]
+ vmovdqu xmm10, [rsp + STACK_XMM_OFFSET + 4*16]
+ vmovdqu xmm11, [rsp + STACK_XMM_OFFSET + 5*16]
+ vmovdqu xmm12, [rsp + STACK_XMM_OFFSET + 6*16]
+ vmovdqu xmm13, [rsp + STACK_XMM_OFFSET + 7*16]
+ vmovdqu xmm14, [rsp + STACK_XMM_OFFSET + 8*16]
+ vmovdqu xmm15, [rsp + STACK_XMM_OFFSET + 9*16]
+
+ mov rdi, [rsp + STACK_GP_OFFSET + 7*8]
+ mov rsi, [rsp + STACK_GP_OFFSET + 8*8]
+%endif
+
+ ;; GP registers, in frame slot order
+ mov r12, [rsp + STACK_GP_OFFSET + 0*8]
+ mov r13, [rsp + STACK_GP_OFFSET + 1*8]
+ mov r14, [rsp + STACK_GP_OFFSET + 2*8]
+ mov r15, [rsp + STACK_GP_OFFSET + 3*8]
+ mov rbp, [rsp + STACK_GP_OFFSET + 5*8]
+ mov rbx, [rsp + STACK_GP_OFFSET + 6*8]
+
+ ;; rsp goes last - every load above is addressed relative to the frame
+ mov rsp, [rsp + STACK_GP_OFFSET + 4*8] ; stack
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+;;; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+;;; Additional Authentication data (A_IN), Additional Data length (A_LEN).
+;;; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 21
+%define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer
+%define %%GDATA_CTX %2 ; [in] GCM context pointer
+%define %%IV %3 ; [in] IV pointer
+%define %%A_IN %4 ; [in] AAD pointer
+%define %%A_LEN %5 ; [in] AAD length in bytes
+%define %%GPR1 %6 ; [clobbered] GP register
+%define %%GPR2 %7 ; [clobbered] GP register
+%define %%GPR3 %8 ; [clobbered] GP register
+%define %%MASKREG %9 ; [clobbered] mask register
+%define %%AAD_HASH %10 ; [out] XMM for AAD_HASH value (xmm14)
+%define %%CUR_COUNT %11 ; [out] XMM with current counter (xmm2)
+%define %%ZT0 %12 ; [clobbered] ZMM register
+%define %%ZT1 %13 ; [clobbered] ZMM register
+%define %%ZT2 %14 ; [clobbered] ZMM register
+%define %%ZT3 %15 ; [clobbered] ZMM register
+%define %%ZT4 %16 ; [clobbered] ZMM register
+%define %%ZT5 %17 ; [clobbered] ZMM register
+%define %%ZT6 %18 ; [clobbered] ZMM register
+%define %%ZT7 %19 ; [clobbered] ZMM register
+%define %%ZT8 %20 ; [clobbered] ZMM register
+%define %%ZT9 %21 ; [clobbered] ZMM register
+
+ ;; Step 1: hash the AAD, result lands in %%AAD_HASH
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, \
+ %%GPR1, %%GPR2, %%GPR3, %%MASKREG
+
+ ;; Step 2: record AAD hash and AAD length in the context
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx.aad hash = aad_hash
+ mov %%GPR1, %%A_LEN
+ mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx.aad_length = aad_length
+
+ ;; Step 3: no payload processed yet
+ xor %%GPR1, %%GPR1
+ mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx.partial_block_length = 0
+ mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx.in_length = 0
+
+ ;; Step 4: build the initial counter block
+ ;; - byte mask 0xfff selects the 12 IV bytes
+ ;; - the 4 bytes not covered by the mask keep the value loaded from ONEf
+ mov %%GPR1, 0x0000_0000_0000_0fff
+ kmovq %%MASKREG, %%GPR1
+ mov %%GPR2, %%IV
+ vmovdqu8 %%CUR_COUNT, [rel ONEf]
+ vmovdqu8 %%CUR_COUNT{%%MASKREG}, [%%GPR2] ; ctr = IV | 0x1
+
+ vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv
+
+ ;; Step 5: byte-shuffle the counter with SHUF_MASK (LE format) and store it
+ vpshufb %%CUR_COUNT, %%CUR_COUNT, [rel SHUF_MASK]
+ vmovdqu [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Cipher and ghash of payloads shorter than 256 bytes
+;;; - number of blocks in the message comes as argument
+;;; - depending on the number of blocks an optimized variant of
+;;; INITIAL_BLOCKS_PARTIAL is invoked
+%macro GCM_ENC_DEC_SMALL 42
+;;; Dispatches on the run-time block count (%%NUM_BLOCKS) to one of 16
+;;; assembly-time specialised expansions of INITIAL_BLOCKS_PARTIAL.
+;;; Every expansion receives the same arguments, only the constant
+;;; 'num_blocks' differs.
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%ENC_DEC %6 ; [in] cipher direction
+%define %%DATA_OFFSET %7 ; [in] data offset
+%define %%LENGTH %8 ; [in] data length
+%define %%NUM_BLOCKS %9 ; [in] number of blocks to process 1 to 16
+%define %%CTR %10 ; [in/out] XMM counter block
+%define %%HASH_IN_OUT %11 ; [in/out] XMM GHASH value
+%define %%INSTANCE_TYPE %12 ; [in] single or multi call
+%define %%ZTMP0 %13 ; [clobbered] ZMM register
+%define %%ZTMP1 %14 ; [clobbered] ZMM register
+%define %%ZTMP2 %15 ; [clobbered] ZMM register
+%define %%ZTMP3 %16 ; [clobbered] ZMM register
+%define %%ZTMP4 %17 ; [clobbered] ZMM register
+%define %%ZTMP5 %18 ; [clobbered] ZMM register
+%define %%ZTMP6 %19 ; [clobbered] ZMM register
+%define %%ZTMP7 %20 ; [clobbered] ZMM register
+%define %%ZTMP8 %21 ; [clobbered] ZMM register
+%define %%ZTMP9 %22 ; [clobbered] ZMM register
+%define %%ZTMP10 %23 ; [clobbered] ZMM register
+%define %%ZTMP11 %24 ; [clobbered] ZMM register
+%define %%ZTMP12 %25 ; [clobbered] ZMM register
+%define %%ZTMP13 %26 ; [clobbered] ZMM register
+%define %%ZTMP14 %27 ; [clobbered] ZMM register
+%define %%ZTMP15 %28 ; [clobbered] ZMM register
+%define %%ZTMP16 %29 ; [clobbered] ZMM register
+%define %%ZTMP17 %30 ; [clobbered] ZMM register
+%define %%ZTMP18 %31 ; [clobbered] ZMM register
+%define %%ZTMP19 %32 ; [clobbered] ZMM register
+%define %%ZTMP20 %33 ; [clobbered] ZMM register
+%define %%ZTMP21 %34 ; [clobbered] ZMM register
+%define %%ZTMP22 %35 ; [clobbered] ZMM register
+%define %%GH %36 ; [in] ZMM ghash sum (high)
+%define %%GL %37 ; [in] ZMM ghash sum (low)
+%define %%GM %38 ; [in] ZMM ghash sum (middle)
+%define %%IA0 %39 ; [clobbered] GP register
+%define %%IA1 %40 ; [clobbered] GP register
+%define %%MASKREG %41 ; [clobbered] mask register
+%define %%SHUFMASK %42 ; [in] ZMM with BE/LE shuffle mask
+
+ ;; Decision tree on %%NUM_BLOCKS: split at 8, then at 12 or 4,
+ ;; then resolve the remaining (up to three) candidates one by one.
+ ;; Each 'cmp' feeds both the 'je' and the 'jl' that follow it.
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ jl %%_small_initial_num_blocks_is_7_1
+
+
+ ;; more than 8 blocks: split at 12
+ cmp %%NUM_BLOCKS, 12
+ je %%_small_initial_num_blocks_is_12
+ jl %%_small_initial_num_blocks_is_11_9
+
+ ;; 16, 15, 14 or 13
+ cmp %%NUM_BLOCKS, 16
+ je %%_small_initial_num_blocks_is_16
+ cmp %%NUM_BLOCKS, 15
+ je %%_small_initial_num_blocks_is_15
+ cmp %%NUM_BLOCKS, 14
+ je %%_small_initial_num_blocks_is_14
+ jmp %%_small_initial_num_blocks_is_13
+
+%%_small_initial_num_blocks_is_11_9:
+ ;; 11, 10 or 9
+ cmp %%NUM_BLOCKS, 11
+ je %%_small_initial_num_blocks_is_11
+ cmp %%NUM_BLOCKS, 10
+ je %%_small_initial_num_blocks_is_10
+ jmp %%_small_initial_num_blocks_is_9
+
+%%_small_initial_num_blocks_is_7_1:
+ ;; fewer than 8 blocks: split at 4
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ jl %%_small_initial_num_blocks_is_3_1
+ ;; 7, 6 or 5
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ jmp %%_small_initial_num_blocks_is_5
+
+%%_small_initial_num_blocks_is_3_1:
+ ;; 3, 2 or 1
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+
+ ;; for %%NUM_BLOCKS == 1, just fall through and no 'jmp' needed
+ ;; - any value not matched above also ends up in the 1 block variant
+
+ ;; Use rep to generate different block size variants
+ ;; - one block size has to be the first one
+ ;; - the 1 block variant is emitted first so that the fall through
+ ;; above reaches it
+%assign num_blocks 1
+%rep 16
+%%_small_initial_num_blocks_is_ %+ num_blocks :
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, num_blocks, \
+ %%CTR, %%HASH_IN_OUT, %%ENC_DEC, %%INSTANCE_TYPE, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \
+ %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \
+ %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFMASK
+ ;; the last (16 block) variant falls straight through to the exit label
+%if num_blocks != 16
+ jmp %%_small_initial_blocks_encrypted
+%endif
+%assign num_blocks (num_blocks + 1)
+%endrep
+
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; GCM_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
+; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and zmm0-zmm31, k1
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 7
+;;; Top level cipher + GHASH flow. Depending on the amount of data left it
+;;; runs through the Nx16 block pipeline (128 blocks with GCM_BIG_DATA,
+;;; 48 blocks otherwise), the small message code (less than 256 bytes) or
+;;; INITIAL_BLOCKS followed by the by8 loop. On exit the counter block and
+;;; the GHASH value are written back to the context.
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer pointer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer pointer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%ENC_DEC %6 ; [in] cipher direction
+%define %%INSTANCE_TYPE %7 ; [in] 'single_call' or 'multi_call' selection
+
+;;; GP scratch registers
+%define %%IA0 r10
+%define %%IA1 r12
+%define %%IA2 r13
+%define %%IA3 r15
+%define %%IA4 r11
+%define %%IA5 rax
+
+;;; Named roles layered on top of the GP scratch registers
+;;; - %%LENGTH shares a register with %%IA2
+;;; - %%CTR_CHECK shares a register with %%IA3
+%define %%LENGTH %%IA2
+%define %%CTR_CHECK %%IA3
+%define %%DATA_OFFSET %%IA4
+
+%define %%HASHK_PTR %%IA5
+
+%define %%GCM_INIT_CTR_BLOCK xmm2 ; hardcoded in GCM_INIT for now
+
+%define %%AES_PARTIAL_BLOCK xmm8
+%define %%CTR_BLOCK2z zmm18
+%define %%CTR_BLOCKz zmm9
+%define %%CTR_BLOCKx xmm9
+%define %%AAD_HASHz zmm14
+%define %%AAD_HASHx xmm14
+
+;;; ZTMP0 - ZTMP12 - used in by8 code, by128/48 code and GCM_ENC_DEC_SMALL
+%define %%ZTMP0 zmm0
+%define %%ZTMP1 zmm3
+%define %%ZTMP2 zmm4
+%define %%ZTMP3 zmm5
+%define %%ZTMP4 zmm6
+%define %%ZTMP5 zmm7
+%define %%ZTMP6 zmm10
+%define %%ZTMP7 zmm11
+%define %%ZTMP8 zmm12
+%define %%ZTMP9 zmm13
+%define %%ZTMP10 zmm15
+%define %%ZTMP11 zmm16
+%define %%ZTMP12 zmm17
+
+;;; ZTMP13 - ZTMP22 - used in by128/48 code and GCM_ENC_DEC_SMALL
+;;; - some used by8 code as well through TMPxy names
+;;; - ZTMP19 (zmm2) overlaps %%GCM_INIT_CTR_BLOCK (xmm2)
+;;; - ZTMP20 (zmm8) overlaps %%AES_PARTIAL_BLOCK (xmm8)
+%define %%ZTMP13 zmm19
+%define %%ZTMP14 zmm20
+%define %%ZTMP15 zmm21
+%define %%ZTMP16 zmm30 ; can be used in very/big_loop part
+%define %%ZTMP17 zmm31 ; can be used in very/big_loop part
+%define %%ZTMP18 zmm1
+%define %%ZTMP19 zmm2
+%define %%ZTMP20 zmm8
+%define %%ZTMP21 zmm22
+%define %%ZTMP22 zmm23
+
+;;; Free to use: zmm24 - zmm29
+;;; - used by by128/48 and by8
+%define %%GH zmm24
+%define %%GL zmm25
+%define %%GM zmm26
+%define %%SHUF_MASK zmm29
+%define %%CTR_BLOCK_SAVE zmm28
+
+;;; - used by by128/48 code only
+%define %%ADDBE_4x4 zmm27
+%define %%ADDBE_1234 zmm28 ; conflicts with CTR_BLOCK_SAVE
+
+;; used by8 code only
+;; - %%ADD8BE shares zmm27 with %%ADDBE_4x4
+%define %%GH4KEY %%ZTMP17
+%define %%GH8KEY %%ZTMP16
+%define %%BLK0 %%ZTMP18
+%define %%BLK1 %%ZTMP19
+%define %%ADD8BE zmm27
+%define %%ADD8LE %%ZTMP13
+
+%define %%MASKREG k1
+
+%ifdef GCM_BIG_DATA
+;; reduction every 128 blocks, depth 32 blocks
+;; @note 128 blocks is the maximum capacity of the stack frame when
+;; GCM_BIG_DATA is defined
+%assign very_big_loop_nblocks 128
+%assign very_big_loop_depth 32
+%endif
+
+;; reduction every 48 blocks, depth 32 blocks
+;; @note 48 blocks is the maximum capacity of the stack frame when
+;; GCM_BIG_DATA is not defined
+%assign big_loop_nblocks 48
+%assign big_loop_depth 32
+
+;;; Macro flow:
+;;; - for message size bigger than very_big_loop_nblocks process data
+;;; with "very_big_loop" parameters
+;;; - for message size bigger than big_loop_nblocks process data
+;;; with "big_loop" parameters
+;;; - calculate the number of 16byte blocks in the message
+;;; - process (number of 16byte blocks) mod 8
+;;; '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+;;; - process 8 16 byte blocks at a time until all are done in %%_encrypt_by_8_new
+
+ ;; Nothing to do for an empty buffer.
+ ;; - the win64 variant uses 'cmp' with an immediate rather than
+ ;; 'or' of the operand with itself
+%ifidn __OUTPUT_FORMAT__, win64
+ cmp %%PLAIN_CYPH_LEN, 0
+%else
+ or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN
+%endif
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+
+ ;; Update length of data processed
+%ifidn __OUTPUT_FORMAT__, win64
+ mov %%IA0, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + InLen], %%IA0
+%else
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
+%endif
+ ;; load the current GHASH value from the context
+ vmovdqu64 %%AAD_HASHx, [%%GDATA_CTX + AadHash]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: partial block processing makes only sense for multi_call here.
+ ;; Used for the update flow - if there was a previous partial
+ ;; block fill the remaining bytes here.
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%PLAIN_CYPH_LEN, %%DATA_OFFSET, %%AAD_HASHx, %%ENC_DEC, \
+ %%IA0, %%IA1, %%IA2, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%MASKREG
+%endif
+
+ ;; lift counter block from GCM_INIT to here
+ ;; - single_call: still live in %%GCM_INIT_CTR_BLOCK
+ ;; - multi_call: read back from the context
+%ifidn %%INSTANCE_TYPE, single_call
+ vmovdqu64 %%CTR_BLOCKx, %%GCM_INIT_CTR_BLOCK
+%else
+ vmovdqu64 %%CTR_BLOCKx, [%%GDATA_CTX + CurCount]
+%endif
+
+ ;; Save the amount of data left to process in %%LENGTH
+ mov %%LENGTH, %%PLAIN_CYPH_LEN
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%DATA_OFFSET is zero in single_call case.
+ ;; Consequently PLAIN_CYPH_LEN will never be zero after
+ ;; %%DATA_OFFSET subtraction below.
+ ;; There may be no more data if it was consumed in the partial block.
+ sub %%LENGTH, %%DATA_OFFSET
+ je %%_enc_dec_done
+%endif ; %%INSTANCE_TYPE, multi_call
+
+ ;; constants used by all the paths below
+ vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK]
+ vmovdqa64 %%ADDBE_4x4, [rel ddq_addbe_4444]
+
+%ifdef GCM_BIG_DATA
+ vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234]
+
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jl %%_message_below_very_big_nblocks
+
+ ;; lead in: the first very_big_loop_nblocks blocks
+ INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%SHUF_MASK, %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth
+
+ sub %%LENGTH, (very_big_loop_nblocks * 16)
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jl %%_no_more_very_big_nblocks
+
+ ;; steady state: loop for as long as another full chunk of
+ ;; very_big_loop_nblocks blocks is available
+%%_encrypt_very_big_nblocks:
+ GHASH_ENCRYPT_Nx16_PARALLEL \
+ %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%CTR_BLOCKz, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \
+ %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth, %%CTR_CHECK
+
+ sub %%LENGTH, (very_big_loop_nblocks * 16)
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jge %%_encrypt_very_big_nblocks
+
+%%_no_more_very_big_nblocks:
+ ;; byte-shuffle the low 128-bit counter block and keep a copy
+ ;; - this write goes to zmm28, so %%ADDBE_1234 is lost from here on
+ vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK)
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%GH, %%GL, %%GM, very_big_loop_nblocks, very_big_loop_depth
+
+ ;; finished if the message was a multiple of the chunk size
+ or %%LENGTH, %%LENGTH
+ jz %%_ghash_done
+
+%%_message_below_very_big_nblocks:
+%endif ; GCM_BIG_DATA
+
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jl %%_message_below_big_nblocks
+
+ ;; overwritten above by CTR_BLOCK_SAVE
+ vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234]
+
+ ;; lead in: the first big_loop_nblocks blocks
+ INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%SHUF_MASK, %%ENC_DEC, big_loop_nblocks, big_loop_depth
+
+ sub %%LENGTH, (big_loop_nblocks * 16)
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jl %%_no_more_big_nblocks
+
+ ;; steady state: loop for as long as another full chunk of
+ ;; big_loop_nblocks blocks is available
+%%_encrypt_big_nblocks:
+ GHASH_ENCRYPT_Nx16_PARALLEL \
+ %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%CTR_BLOCKz, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \
+ %%ENC_DEC, big_loop_nblocks, big_loop_depth, %%CTR_CHECK
+
+ sub %%LENGTH, (big_loop_nblocks * 16)
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jge %%_encrypt_big_nblocks
+
+%%_no_more_big_nblocks:
+ ;; byte-shuffle the low 128-bit counter block and keep a copy
+ ;; (the copy overwrites %%ADDBE_1234, same register)
+ vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK)
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%GH, %%GL, %%GM, big_loop_nblocks, big_loop_depth
+
+ ;; finished if the message was a multiple of the chunk size
+ or %%LENGTH, %%LENGTH
+ jz %%_ghash_done
+
+%%_message_below_big_nblocks:
+
+ ;; Less than 256 bytes will be handled by the small message code, which
+ ;; can process up to 16 x blocks (16 bytes each)
+ cmp %%LENGTH, (16 * 16)
+ jge %%_large_message_path
+
+ ;; Determine how many blocks to process
+ ;; - process one additional block if there is a partial block
+ ;; - %%IA1 = (%%LENGTH + 15) / 16
+ mov %%IA1, %%LENGTH
+ add %%IA1, 15
+ shr %%IA1, 4
+ ;; %%IA1 can be in the range from 0 to 16
+ ;; (every zero length case has already branched away above)
+
+ ;; no ghash sums are passed in (no_zmm for GH/GL/GM)
+ GCM_ENC_DEC_SMALL \
+ %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, \
+ %%LENGTH, %%IA1, %%CTR_BLOCKx, %%AAD_HASHx, %%INSTANCE_TYPE, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ no_zmm, no_zmm, no_zmm, \
+ %%IA0, %%IA3, %%MASKREG, %%SHUF_MASK
+
+ ;; %%_ghash_done stores %%CTR_BLOCK_SAVE into the context
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ jmp %%_ghash_done
+
+%%_large_message_path:
+ ;; Determine how many blocks to process in INITIAL
+ ;; - process one additional block in INITIAL if there is a partial block
+ ;; - %%IA1 = (((%%LENGTH & 0xff) + 15) / 16) & 7
+ mov %%IA1, %%LENGTH
+ and %%IA1, 0xff
+ add %%IA1, 15
+ shr %%IA1, 4
+ ;; Don't allow 8 INITIAL blocks since this will
+ ;; be handled by the x8 partial loop.
+ ;; - the 'and' below sets ZF for the 'je' that follows it
+ and %%IA1, 7
+ je %%_initial_num_blocks_is_0
+ cmp %%IA1, 1
+ je %%_initial_num_blocks_is_1
+ cmp %%IA1, 2
+ je %%_initial_num_blocks_is_2
+ cmp %%IA1, 3
+ je %%_initial_num_blocks_is_3
+ cmp %%IA1, 4
+ je %%_initial_num_blocks_is_4
+ cmp %%IA1, 5
+ je %%_initial_num_blocks_is_5
+ cmp %%IA1, 6
+ je %%_initial_num_blocks_is_6
+
+ ;; 7 blocks: fall through into the first generated variant.
+ ;; Variants are emitted from 7 down to 0; the last one (0) falls
+ ;; through to %%_initial_blocks_encrypted without a 'jmp'.
+%assign number_of_blocks 7
+%rep 8
+%%_initial_num_blocks_is_ %+ number_of_blocks:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, number_of_blocks, %%CTR_BLOCKx, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%IA0, %%IA1, %%ENC_DEC, %%MASKREG, %%SHUF_MASK, no_partial_block
+%if number_of_blocks != 0
+ jmp %%_initial_blocks_encrypted
+%endif
+%assign number_of_blocks (number_of_blocks - 1)
+%endrep
+
+%%_initial_blocks_encrypted:
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ ;; move cipher blocks from initial blocks to input of by8 macro
+ ;; and for GHASH_LAST_8/7
+ ;; - ghash value already xor'ed into block 0
+ vmovdqa64 %%BLK0, %%ZTMP0
+ vmovdqa64 %%BLK1, %%ZTMP1
+
+ ;; The entire message cannot get processed in INITIAL_BLOCKS
+ ;; - GCM_ENC_DEC_SMALL handles up to 16 blocks
+ ;; - INITIAL_BLOCKS processes up to 15 blocks
+ ;; - no need to check for zero length at this stage
+
+ ;; In order to have only one reduction at the end
+ ;; start HASH KEY pointer needs to be determined based on length and
+ ;; call type.
+ ;; - note that 8 blocks are already ciphered in INITIAL_BLOCKS and
+ ;; subtracted from LENGTH
+ ;; - %%IA1 = (%%LENGTH + 8*16 + 15) & 0x3f0
+ lea %%IA1, [%%LENGTH + (8 * 16)]
+ add %%IA1, 15
+ and %%IA1, 0x3f0
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; if partial block and multi_call then change hash key start by one
+ ;; - %%IA0 = 16 if (%%LENGTH & 15) != 0, otherwise 0
+ mov %%IA0, %%LENGTH
+ and %%IA0, 15
+ add %%IA0, 15
+ and %%IA0, 16
+ sub %%IA1, %%IA0
+%endif
+ lea %%HASHK_PTR, [%%GDATA_KEY + HashKey + 16]
+ sub %%HASHK_PTR, %%IA1
+ ;; HASHK_PTR
+ ;; - points at the first hash key to start GHASH with
+ ;; - needs to be updated as the message is processed (incremented)
+
+ ;; pre-load constants
+ ;; - %%ADD8BE reuses zmm27 (%%ADDBE_4x4 is not used by the by8 code)
+ ;; - GH/GL/GM accumulate the GHASH partial sums across by8 iterations
+ vmovdqa64 %%ADD8BE, [rel ddq_addbe_8888]
+ vmovdqa64 %%ADD8LE, [rel ddq_add_8888]
+ vpxorq %%GH, %%GH
+ vpxorq %%GL, %%GL
+ vpxorq %%GM, %%GM
+
+ ;; prepare counter 8 blocks
+ ;; - broadcast the low 128-bit counter block to all four lanes
+ ;; - add 1..4 / 5..8 and byte-shuffle the results with %%SHUF_MASK
+ vshufi64x2 %%CTR_BLOCKz, %%CTR_BLOCKz, %%CTR_BLOCKz, 0
+ vpaddd %%CTR_BLOCK2z, %%CTR_BLOCKz, [rel ddq_add_5678]
+ vpaddd %%CTR_BLOCKz, %%CTR_BLOCKz, [rel ddq_add_1234]
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+
+ ;; Process 7 full blocks plus a partial block
+ cmp %%LENGTH, 128
+ jl %%_encrypt_by_8_partial
+
+%%_encrypt_by_8_parallel:
+ ;; in_order vs. out_order is an optimization to increment the counter
+ ;; without shuffling it back into little endian.
+ ;; %%CTR_CHECK keeps track of when we need to increment in order so
+ ;; that the carry is handled correctly.
+
+ vmovq %%CTR_CHECK, XWORD(%%CTR_BLOCK_SAVE)
+
+%%_encrypt_by_8_new:
+ ;; keep only the low counter byte and account for the next 8 blocks
+ and WORD(%%CTR_CHECK), 255
+ add WORD(%%CTR_CHECK), 8
+
+ vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)]
+ vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)]
+
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z,\
+ %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \
+ out_order, %%ENC_DEC, full, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \
+ %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \
+ %%MASKREG, no_reduction, %%GL, %%GH, %%GM
+
+ ;; advance by 8 hash keys / 8 blocks (128 bytes) of data
+ add %%HASHK_PTR, (8 * 16)
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+ jz %%_encrypt_done
+
+ ;; low counter byte at or above 256 - 8: the addition below could carry
+ ;; out of the low byte, take the shuffle/add/shuffle path instead
+ cmp WORD(%%CTR_CHECK), (256 - 8)
+ jae %%_encrypt_by_8
+
+ ;; fast path: bump the counters without changing their byte order
+ vpaddd %%CTR_BLOCKz, %%ADD8BE
+ vpaddd %%CTR_BLOCK2z, %%ADD8BE
+
+ cmp %%LENGTH, 128
+ jl %%_encrypt_by_8_partial
+
+ jmp %%_encrypt_by_8_new
+
+%%_encrypt_by_8:
+ ;; slow path: shuffle, add 8 (ddq_add_8888), shuffle back
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+ vpaddd %%CTR_BLOCKz, %%ADD8LE
+ vpaddd %%CTR_BLOCK2z, %%ADD8LE
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+
+ cmp %%LENGTH, 128
+ jge %%_encrypt_by_8_new
+
+%%_encrypt_by_8_partial:
+ ;; Test to see if we need a by 8 with partial block. At this point
+ ;; bytes remaining should be either zero or between 113-127.
+ ;; 'in_order' shuffle needed to align key for partial block xor.
+ ;; 'out_order' is a little faster because it avoids extra shuffles.
+ ;; - counter blocks for the next 8 blocks are prepared and in BE format
+ ;; - we can go ahead with out_order scenario
+
+ vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)]
+ vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)]
+
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z, \
+ %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \
+ out_order, %%ENC_DEC, partial, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \
+ %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \
+ %%MASKREG, no_reduction, %%GL, %%GH, %%GM
+
+ ;; account for the 7 full blocks; what is left in %%LENGTH is the
+ ;; size of the partial block
+ add %%HASHK_PTR, (8 * 16)
+ add %%DATA_OFFSET, (128 - 16)
+ sub %%LENGTH, (128 - 16)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; record the partial block state in the context
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%AES_PARTIAL_BLOCK
+%endif
+
+%%_encrypt_done:
+ ;; Extract the last counter block in LE format
+ vextracti32x4 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCK2z, 3
+ vpshufb XWORD(%%CTR_BLOCK_SAVE), XWORD(%%SHUF_MASK)
+
+ ;; GHASH last cipher text blocks held in %%BLK0 / %%BLK1
+ ;; - if block 8th is partial in a multi-call path then skip the block
+%ifidn %%INSTANCE_TYPE, multi_call
+ cmp qword [%%GDATA_CTX + PBlockLen], 0
+ jz %%_hash_last_8
+
+ ;; save the 8th partial block as GHASH_LAST_7 will clobber %%BLK1
+ vextracti32x4 XWORD(%%ZTMP7), %%BLK1, 3
+
+ GHASH_LAST_7 %%GDATA_KEY, %%BLK1, %%BLK0, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%AAD_HASHx, %%MASKREG, %%IA0, %%GH, %%GL, %%GM
+
+ ;; XOR the partial word into the hash
+ vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP7)
+ jmp %%_ghash_done
+%%_hash_last_8:
+%endif
+ GHASH_LAST_8 %%GDATA_KEY, %%BLK1, %%BLK0, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%AAD_HASHx, \
+ %%GH, %%GL, %%GM
+%%_ghash_done:
+ ;; write the counter block and the GHASH value back to the context
+ vmovdqu64 [%%GDATA_CTX + CurCount], XWORD(%%CTR_BLOCK_SAVE)
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASHx
+%%_enc_dec_done:
+
+%endmacro ; GCM_ENC_DEC
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt/decrypt the initial 16 blocks
+%macro INITIAL_BLOCKS_16 22
+;;; Ciphers 16 blocks (4 x ZMM) in counter mode, writes the result to %%OUT
+;;; and stores the byte-shuffled cipher text blocks in the stack frame at
+;;; %%BLK_OFFSET. No GHASH multiplication is done here.
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%KP %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits) or 'no_ghash'
+%define %%CTR %6 ; [in/out] ZMM with CTR BE blocks 4x128 bits; out - last counter block in all 4 lanes
+%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check
+%define %%ADDBE_4x4 %8 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %9 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%T0 %10 ; [clobbered] temporary ZMM register
+%define %%T1 %11 ; [clobbered] temporary ZMM register
+%define %%T2 %12 ; [clobbered] temporary ZMM register
+%define %%T3 %13 ; [clobbered] temporary ZMM register
+%define %%T4 %14 ; [clobbered] temporary ZMM register
+%define %%T5 %15 ; [clobbered] temporary ZMM register
+%define %%T6 %16 ; [clobbered] temporary ZMM register
+%define %%T7 %17 ; [clobbered] temporary ZMM register
+%define %%T8 %18 ; [clobbered] temporary ZMM register
+%define %%SHUF_MASK %19 ; [in] ZMM with BE/LE shuffle mask
+%define %%ENC_DEC %20 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%BLK_OFFSET %21 ; [in] stack frame offset to ciphered blocks
+%define %%DATA_DISPL %22 ; [in] fixed numerical data displacement/offset
+
+;;; counter / cipher blocks: 4 registers with 4 blocks each
+%define %%B00_03 %%T5
+%define %%B04_07 %%T6
+%define %%B08_11 %%T7
+%define %%B12_15 %%T8
+
+%assign stack_offset (%%BLK_OFFSET)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; prepare counter blocks
+ ;; - low byte of %%CTR_CHECK at or above 256 - 16: take the overflow
+ ;; path, otherwise add directly to the BE counter blocks
+
+ cmp BYTE(%%CTR_CHECK), (256 - 16)
+ jae %%_next_16_overflow
+ vpaddd %%B00_03, %%CTR, %%ADDBE_1234
+ vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4
+ vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4
+ vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4
+ jmp %%_next_16_ok
+%%_next_16_overflow:
+ ;; shuffle the counter, add ddq_add_1234 / ddq_add_4444 and
+ ;; shuffle the results back
+ ;; - %%B12_15 temporarily holds the ddq_add_4444 constant
+ vpshufb %%CTR, %%CTR, %%SHUF_MASK
+ vmovdqa64 %%B12_15, [rel ddq_add_4444]
+ vpaddd %%B00_03, %%CTR, [rel ddq_add_1234]
+ vpaddd %%B04_07, %%B00_03, %%B12_15
+ vpaddd %%B08_11, %%B04_07, %%B12_15
+ vpaddd %%B12_15, %%B08_11, %%B12_15
+ vpshufb %%B00_03, %%SHUF_MASK
+ vpshufb %%B04_07, %%SHUF_MASK
+ vpshufb %%B08_11, %%SHUF_MASK
+ vpshufb %%B12_15, %%SHUF_MASK
+%%_next_16_ok:
+ ;; broadcast the last counter block (lane 3 of %%B12_15) into %%CTR
+ vshufi64x2 %%CTR, %%B12_15, %%B12_15, 1111_1111b
+ add BYTE(%%CTR_CHECK), 16
+
+ ;; === load 16 blocks of data
+ VX512LDR %%T0, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*0)]
+ VX512LDR %%T1, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*1)]
+ VX512LDR %%T2, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*2)]
+ VX512LDR %%T3, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*3)]
+
+ ;; move to AES encryption rounds
+ ;; - each 16-byte round key is broadcast to all four lanes of %%T4
+ ;; - round 0 is a plain XOR with the first round key
+%assign i 0
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vpxorq %%B00_03, %%B00_03, %%T4
+ vpxorq %%B04_07, %%B04_07, %%T4
+ vpxorq %%B08_11, %%B08_11, %%T4
+ vpxorq %%B12_15, %%B12_15, %%T4
+%assign i (i + 1)
+
+ ;; rounds 1 .. NROUNDS
+%rep NROUNDS
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vaesenc %%B00_03, %%B00_03, %%T4
+ vaesenc %%B04_07, %%B04_07, %%T4
+ vaesenc %%B08_11, %%B08_11, %%T4
+ vaesenc %%B12_15, %%B12_15, %%T4
+%assign i (i + 1)
+%endrep
+
+ ;; last round (key index NROUNDS + 1)
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vaesenclast %%B00_03, %%B00_03, %%T4
+ vaesenclast %%B04_07, %%B04_07, %%T4
+ vaesenclast %%B08_11, %%B08_11, %%T4
+ vaesenclast %%B12_15, %%B12_15, %%T4
+
+ ;; xor against text
+ vpxorq %%B00_03, %%B00_03, %%T0
+ vpxorq %%B04_07, %%B04_07, %%T1
+ vpxorq %%B08_11, %%B08_11, %%T2
+ vpxorq %%B12_15, %%B12_15, %%T3
+
+ ;; store
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*0)], %%B00_03
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*1)], %%B04_07
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*2)], %%B08_11
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*3)], %%B12_15
+
+ ;; byte-shuffle the cipher text for GHASH
+ ;; - DEC: cipher text is the input (%%T0 - %%T3)
+ ;; - ENC: cipher text is the output just produced
+%ifidn %%ENC_DEC, DEC
+ ;; decryption - cipher text needs to go to GHASH phase
+ vpshufb %%B00_03, %%T0, %%SHUF_MASK
+ vpshufb %%B04_07, %%T1, %%SHUF_MASK
+ vpshufb %%B08_11, %%T2, %%SHUF_MASK
+ vpshufb %%B12_15, %%T3, %%SHUF_MASK
+%else
+ ;; encryption
+ vpshufb %%B00_03, %%B00_03, %%SHUF_MASK
+ vpshufb %%B04_07, %%B04_07, %%SHUF_MASK
+ vpshufb %%B08_11, %%B08_11, %%SHUF_MASK
+ vpshufb %%B12_15, %%B12_15, %%SHUF_MASK
+%endif
+
+%ifnidn %%GHASH, no_ghash
+ ;; === xor cipher block 0 with GHASH for the next GHASH round
+ vpxorq %%B00_03, %%B00_03, %%GHASH
+%endif
+
+ ;; park the 16 shuffled blocks in the stack frame
+ ;; - aligned stores: rsp + stack_offset has to be 64-byte aligned
+ vmovdqa64 [rsp + stack_offset + (0 * 64)], %%B00_03
+ vmovdqa64 [rsp + stack_offset + (1 * 64)], %%B04_07
+ vmovdqa64 [rsp + stack_offset + (2 * 64)], %%B08_11
+ vmovdqa64 [rsp + stack_offset + (3 * 64)], %%B12_15
+%endmacro ;INITIAL_BLOCKS_16
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt/decrypt the initial N x 16 blocks (N = NBLOCKS / 16)
+;;; - A x 16 blocks are encrypted/decrypted first (A = DEPTH_BLK / 16, pipeline depth)
+;;; - B x 16 blocks are encrypted/decrypted and previous A x 16 are ghashed
+;;; - A + B = N
+%macro INITIAL_BLOCKS_Nx16 39
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%KP %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in/out] data offset
+%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits)
+%define %%CTR %6 ; [in/out] ZMM with CTR: in - LE & 128b; out - BE & 4x128b
+%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check
+%define %%T0 %8 ; [clobbered] temporary ZMM register
+%define %%T1 %9 ; [clobbered] temporary ZMM register
+%define %%T2 %10 ; [clobbered] temporary ZMM register
+%define %%T3 %11 ; [clobbered] temporary ZMM register
+%define %%T4 %12 ; [clobbered] temporary ZMM register
+%define %%T5 %13 ; [clobbered] temporary ZMM register
+%define %%T6 %14 ; [clobbered] temporary ZMM register
+%define %%T7 %15 ; [clobbered] temporary ZMM register
+%define %%T8 %16 ; [clobbered] temporary ZMM register
+%define %%T9 %17 ; [clobbered] temporary ZMM register
+%define %%T10 %18 ; [clobbered] temporary ZMM register
+%define %%T11 %19 ; [clobbered] temporary ZMM register
+%define %%T12 %20 ; [clobbered] temporary ZMM register
+%define %%T13 %21 ; [clobbered] temporary ZMM register
+%define %%T14 %22 ; [clobbered] temporary ZMM register
+%define %%T15 %23 ; [clobbered] temporary ZMM register
+%define %%T16 %24 ; [clobbered] temporary ZMM register
+%define %%T17 %25 ; [clobbered] temporary ZMM register
+%define %%T18 %26 ; [clobbered] temporary ZMM register
+%define %%T19 %27 ; [clobbered] temporary ZMM register
+%define %%T20 %28 ; [clobbered] temporary ZMM register
+%define %%T21 %29 ; [clobbered] temporary ZMM register
+%define %%T22 %30 ; [clobbered] temporary ZMM register
+%define %%GH %31 ; [out] ZMM ghash sum (high)
+%define %%GL %32 ; [out] ZMM ghash sum (low)
+%define %%GM %33 ; [out] ZMM ghash sum (middle)
+%define %%ADDBE_4x4 %34 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %35 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%SHUF_MASK %36 ; [in] ZMM with BE/LE shuffle mask
+%define %%ENC_DEC %37 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%NBLOCKS %38 ; [in] number of blocks: multiple of 16
+%define %%DEPTH_BLK %39 ; [in] pipeline depth, number of blocks (multiple of 16)
+
+%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign hkey_offset HashKey_ %+ %%NBLOCKS
+%assign data_in_out_offset 0
+
+ ;; set up CTR_CHECK: keep only the low byte of the counter's low dword
+ vmovd DWORD(%%CTR_CHECK), XWORD(%%CTR)
+ and DWORD(%%CTR_CHECK), 255
+
+ ;; in LE format after init: replicate the low 128 bits to all 4 lanes, then convert to BE
+ vshufi64x2 %%CTR, %%CTR, %%CTR, 0
+ vpshufb %%CTR, %%CTR, %%SHUF_MASK
+
+ ;; ==== AES lead in
+
+ ;; first 16 blocks - just cipher (the only lead-in call that is given %%GHASH)
+ INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \
+ %%GHASH, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \
+ %%T0, %%T1, %%T2, %%T3, %%T4, \
+ %%T5, %%T6, %%T7, %%T8, \
+ %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+
+%if (%%DEPTH_BLK > 16)
+%rep ((%%DEPTH_BLK - 16) / 16)
+ INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \
+ no_ghash, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \
+ %%T0, %%T1, %%T2, %%T3, %%T4, \
+ %%T5, %%T6, %%T7, %%T8, \
+ %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%endrep
+%endif
+
+ ;; ==== GHASH + AES follows
+
+ ;; first 16 blocks stitched (GHASH input starts at stack offset 0, hash keys at HashKey_NBLOCKS)
+ GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \
+ %%T0, %%T1, %%T2, %%T3, \
+ %%T4, %%T5, %%T6, %%T7, \
+ %%T8, %%T9, %%T10, %%T11,\
+ %%T12, %%T13, %%T14, %%T15,\
+ %%T16, %%T17, %%T18, %%T19, \
+ %%T20, %%T21, %%T22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GL, %%GH, %%GM, \
+ first_time, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%if ((%%NBLOCKS - %%DEPTH_BLK) > 16)
+%rep ((%%NBLOCKS - %%DEPTH_BLK - 16) / 16)
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+
+ ;; mid 16 blocks - stitched (no_reduction; all four offsets advance by 16 blocks per pass)
+ GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \
+ %%T0, %%T1, %%T2, %%T3, \
+ %%T4, %%T5, %%T6, %%T7, \
+ %%T8, %%T9, %%T10, %%T11,\
+ %%T12, %%T13, %%T14, %%T15,\
+ %%T16, %%T17, %%T18, %%T19, \
+ %%T20, %%T21, %%T22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GL, %%GH, %%GM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+%endrep
+%endif
+ add %%DATA_OFFSET, (%%NBLOCKS * 16) ; advance the data offset by NBLOCKS 16-byte blocks
+
+%endmacro ;INITIAL_BLOCKS_Nx16
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; GHASH the last DEPTH_BLK blocks of cipher text kept on the stack (last part of by 32/64/128 code)
+%macro GHASH_LAST_Nx16 23
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%GHASH %2 ; [out] ghash output
+%define %%T1 %3 ; [clobbered] temporary ZMM
+%define %%T2 %4 ; [clobbered] temporary ZMM
+%define %%T3 %5 ; [clobbered] temporary ZMM
+%define %%T4 %6 ; [clobbered] temporary ZMM
+%define %%T5 %7 ; [clobbered] temporary ZMM
+%define %%T6 %8 ; [clobbered] temporary ZMM
+%define %%T7 %9 ; [clobbered] temporary ZMM
+%define %%T8 %10 ; [clobbered] temporary ZMM
+%define %%T9 %11 ; [clobbered] temporary ZMM
+%define %%T10 %12 ; [clobbered] temporary ZMM
+%define %%T11 %13 ; [clobbered] temporary ZMM
+%define %%T12 %14 ; [clobbered] temporary ZMM
+%define %%T13 %15 ; [clobbered] temporary ZMM
+%define %%T14 %16 ; [clobbered] temporary ZMM
+%define %%T15 %17 ; [clobbered] temporary ZMM
+%define %%T16 %18 ; [clobbered] temporary ZMM
+%define %%GH %19 ; [in/clobbered] ghash sum (high)
+%define %%GL %20 ; [in/clobbered] ghash sum (low)
+%define %%GM %21 ; [in/clobbered] ghash sum (medium)
+%define %%LOOP_BLK %22 ; [in] numerical number of blocks handled by the loop
+%define %%DEPTH_BLK %23 ; [in] numerical number, pipeline depth (ghash vs aes)
+
+%define %%T0H %%T1
+%define %%T0L %%T2
+%define %%T0M1 %%T3
+%define %%T0M2 %%T4
+
+%define %%T1H %%T5
+%define %%T1L %%T6
+%define %%T1M1 %%T7
+%define %%T1M2 %%T8
+
+%define %%T2H %%T9
+%define %%T2L %%T10
+%define %%T2M1 %%T11
+%define %%T2M2 %%T12
+
+%define %%BLK1 %%T13
+%define %%BLK2 %%T14
+
+%define %%HK1 %%T15
+%define %%HK2 %%T16
+
+%assign hashk HashKey_ %+ %%DEPTH_BLK
+%assign cipher_blk (STACK_LOCAL_OFFSET + ((%%LOOP_BLK - %%DEPTH_BLK) * 16))
+
+ ;; load the first 8 cipher blocks (from the stack) and matching ghash keys
+ vmovdqa64 %%BLK1, [rsp + cipher_blk]
+ vmovdqa64 %%BLK2, [rsp + cipher_blk + 64]
+ vmovdqu64 %%HK1, [%%KP + hashk]
+ vmovdqu64 %%HK2, [%%KP + hashk + 64]
+ ;; ghash blocks 0-3
+ vpclmulqdq %%T0H, %%BLK1, %%HK1, 0x11 ; T0H = a1*b1
+ vpclmulqdq %%T0L, %%BLK1, %%HK1, 0x00 ; T0L = a0*b0
+ vpclmulqdq %%T0M1, %%BLK1, %%HK1, 0x01 ; T0M1 = a1*b0
+ vpclmulqdq %%T0M2, %%BLK1, %%HK1, 0x10 ; T0M2 = a0*b1
+ ;; ghash blocks 4-7
+ vpclmulqdq %%T1H, %%BLK2, %%HK2, 0x11 ; T1H = a1*b1
+ vpclmulqdq %%T1L, %%BLK2, %%HK2, 0x00 ; T1L = a0*b0
+ vpclmulqdq %%T1M1, %%BLK2, %%HK2, 0x01 ; T1M1 = a1*b0
+ vpclmulqdq %%T1M2, %%BLK2, %%HK2, 0x10 ; T1M2 = a0*b1
+ vpternlogq %%T0H, %%T1H, %%GH, 0x96 ; T0H = T0H + T1H + GH (0x96 = 3-way xor)
+ vpternlogq %%T0L, %%T1L, %%GL, 0x96 ; T0L = T0L + T1L + GL
+ vpternlogq %%T0M1, %%T1M1, %%GM, 0x96 ; T0M1 = T0M1 + T1M1 + GM
+ vpxorq %%T0M2, %%T0M2, %%T1M2 ; T0M2 = T0M2 + T1M2
+
+%rep ((%%DEPTH_BLK - 8) / 8)
+%assign hashk (hashk + 128)
+%assign cipher_blk (cipher_blk + 128)
+
+ ;; remaining blocks
+ ;; load next 8 cipher blocks and corresponding ghash keys
+ vmovdqa64 %%BLK1, [rsp + cipher_blk]
+ vmovdqa64 %%BLK2, [rsp + cipher_blk + 64]
+ vmovdqu64 %%HK1, [%%KP + hashk]
+ vmovdqu64 %%HK2, [%%KP + hashk + 64]
+ ;; ghash blocks 0-3 (of this group of 8)
+ vpclmulqdq %%T1H, %%BLK1, %%HK1, 0x11 ; T1H = a1*b1
+ vpclmulqdq %%T1L, %%BLK1, %%HK1, 0x00 ; T1L = a0*b0
+ vpclmulqdq %%T1M1, %%BLK1, %%HK1, 0x01 ; T1M1 = a1*b0
+ vpclmulqdq %%T1M2, %%BLK1, %%HK1, 0x10 ; T1M2 = a0*b1
+ ;; ghash blocks 4-7 (of this group of 8)
+ vpclmulqdq %%T2H, %%BLK2, %%HK2, 0x11 ; T2H = a1*b1
+ vpclmulqdq %%T2L, %%BLK2, %%HK2, 0x00 ; T2L = a0*b0
+ vpclmulqdq %%T2M1, %%BLK2, %%HK2, 0x01 ; T2M1 = a1*b0
+ vpclmulqdq %%T2M2, %%BLK2, %%HK2, 0x10 ; T2M2 = a0*b1
+ ;; update sums
+ vpternlogq %%T0H, %%T1H, %%T2H, 0x96 ; T0H = T0H + T1H + T2H
+ vpternlogq %%T0L, %%T1L, %%T2L, 0x96 ; T0L = T0L + T1L + T2L
+ vpternlogq %%T0M1, %%T1M1, %%T2M1, 0x96 ; T0M1 = T0M1 + T1M1 + T2M1
+ vpternlogq %%T0M2, %%T1M2, %%T2M2, 0x96 ; T0M2 = T0M2 + T1M2 + T2M2
+%endrep
+
+ ;; integrate TM into TH and TL (T1M1/T1M2 are reused as scratch here)
+ vpxorq %%T0M1, %%T0M1, %%T0M2
+ vpsrldq %%T1M1, %%T0M1, 8
+ vpslldq %%T1M2, %%T0M1, 8
+ vpxorq %%T0H, %%T0H, %%T1M1
+ vpxorq %%T0L, %%T0L, %%T1M2
+
+ ;; add TH and TL 128-bit words horizontally (T2M1/T2M2 passed as temporaries)
+ VHPXORI4x128 %%T0H, %%T2M1
+ VHPXORI4x128 %%T0L, %%T2M2
+
+ ;; reduction (HK1 is reused to hold POLY2)
+ vmovdqa64 %%HK1, [rel POLY2]
+ VCLMUL_REDUCE %%GHASH, %%HK1, %%T0H, %%T0L, %%T0M1, %%T0M2
+%endmacro ;GHASH_LAST_Nx16
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt & ghash multiples of 16 blocks (NUM_BLOCKS per invocation)
+;;; GHASH input starts (NUM_BLOCKS - DEPTH_BLK) blocks into the stack area and restarts at 0 after the reduction
+%macro GHASH_ENCRYPT_Nx16_PARALLEL 39
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%GDATA_KEY %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in/out] data offset
+%define %%CTR_BE %5 ; [in/out] ZMM last counter block
+%define %%SHFMSK %6 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT0 %7 ; [clobbered] temporary ZMM register
+%define %%ZT1 %8 ; [clobbered] temporary ZMM register
+%define %%ZT2 %9 ; [clobbered] temporary ZMM register
+%define %%ZT3 %10 ; [clobbered] temporary ZMM register
+%define %%ZT4 %11 ; [clobbered] temporary ZMM register
+%define %%ZT5 %12 ; [clobbered] temporary ZMM register
+%define %%ZT6 %13 ; [clobbered] temporary ZMM register
+%define %%ZT7 %14 ; [clobbered] temporary ZMM register
+%define %%ZT8 %15 ; [clobbered] temporary ZMM register
+%define %%ZT9 %16 ; [clobbered] temporary ZMM register
+%define %%ZT10 %17 ; [clobbered] temporary ZMM register
+%define %%ZT11 %18 ; [clobbered] temporary ZMM register
+%define %%ZT12 %19 ; [clobbered] temporary ZMM register
+%define %%ZT13 %20 ; [clobbered] temporary ZMM register
+%define %%ZT14 %21 ; [clobbered] temporary ZMM register
+%define %%ZT15 %22 ; [clobbered] temporary ZMM register
+%define %%ZT16 %23 ; [clobbered] temporary ZMM register
+%define %%ZT17 %24 ; [clobbered] temporary ZMM register
+%define %%ZT18 %25 ; [clobbered] temporary ZMM register
+%define %%ZT19 %26 ; [clobbered] temporary ZMM register
+%define %%ZT20 %27 ; [clobbered] temporary ZMM register
+%define %%ZT21 %28 ; [clobbered] temporary ZMM register
+%define %%ZT22 %29 ; [clobbered] temporary ZMM register
+%define %%GTH %30 ; [in/out] ZMM GHASH sum (high)
+%define %%GTL %31 ; [in/out] ZMM GHASH sum (low)
+%define %%GTM %32 ; [in/out] ZMM GHASH sum (medium)
+%define %%ADDBE_4x4 %33 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %34 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%GHASH %35 ; [clobbered] ZMM with intermediate GHASH value
+%define %%ENC_DEC %36 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%NUM_BLOCKS %37 ; [in] number of blocks to process in the loop
+%define %%DEPTH_BLK %38 ; [in] pipeline depth in blocks
+%define %%CTR_CHECK %39 ; [in/out] counter to check byte overflow
+
+%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + ((%%NUM_BLOCKS - %%DEPTH_BLK) * 16))
+%assign hkey_offset HashKey_ %+ %%DEPTH_BLK
+%assign data_in_out_offset 0
+
+ ;; mid 16 blocks (GHASH sums accumulated, no_reduction)
+%if (%%DEPTH_BLK > 16)
+%rep ((%%DEPTH_BLK - 16) / 16)
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%endrep
+%endif
+
+ ;; 16 blocks with reduction (final_reduction; this call always uses HashKey_16)
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ HashKey_16, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ final_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign hkey_offset HashKey_ %+ %%NUM_BLOCKS
+
+ ;; === xor cipher block 0 with GHASH (ZT4): copy ZT4 here, the next call takes %%GHASH as ghash-in
+ vmovdqa64 %%GHASH, %%ZT4
+
+ ;; start the pipeline again (first_time; GHASH input back at offset 0, keys at HashKey_NUM_BLOCKS)
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ first_time, %%ENC_DEC, data_in_out_offset, %%GHASH
+
+%if ((%%NUM_BLOCKS - %%DEPTH_BLK) > 16)
+%rep ((%%NUM_BLOCKS - %%DEPTH_BLK - 16 ) / 16)
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+%endrep
+%endif
+
+ add %%DATA_OFFSET, (%%NUM_BLOCKS * 16) ; advance the data offset by NUM_BLOCKS 16-byte blocks
+
+%endmacro ;GHASH_ENCRYPT_Nx16_PARALLEL
+;;; ===========================================================================
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag written to AUTH_TAG (AUTH_TAG_LEN bytes)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9-xmm11, xmm13-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 6
+%define %%GDATA_KEY %1 ; [in] pointer to gcm_key_data (expanded keys, HashKey)
+%define %%GDATA_CTX %2 ; [in/out] pointer to gcm_context_data
+%define %%AUTH_TAG %3 ; [in] pointer to the tag output buffer
+%define %%AUTH_TAG_LEN %4 ; [in] tag length in bytes
+%define %%ENC_DEC %5 ; [in] ENC or DEC selector (not referenced in this macro)
+%define %%INSTANCE_TYPE %6 ; [in] multi_call or single_call
+%define %%PLAIN_CYPH_LEN rax ; scratch GPR holding the message length
+
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ ;; Start AES as early as possible
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; In multi_call mode the running hash has to be reloaded from the context.
+ ;; In single_call mode xmm14 is expected to still hold it, which removes
+ ;; a write to read dependency on AadHash.
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+
+ ;; Hash the final partial block (if any). If we did this as a single call then
+ ;; the partial block was handled in the main GCM_ENC_DEC macro.
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+%endif
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovq xmm15, r12 ; len(A) in xmm15, all 64 bits (vmovd r12d truncated AAD >= 512 MiB)
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (* 8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15 ; fold the length block into the running hash
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
+ vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap
+
+ vpxor xmm9, xmm9, xmm14 ; tag = E(K, Y0) xor final hash
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16 ; 16, 12 and 8 byte tags have dedicated store paths
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ cmp r11, 8
+ je %%_T_8
+
+ simd_store_avx_15 r10, xmm9, r11, r12, rax ; any other length: generic partial store
+ jmp %%_return_T_done
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9 ; first 8 bytes
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9 ; next 4 bytes
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+
+%ifdef SAFE_DATA
+ ;; Clear sensitive data from context structure
+ vpxor xmm0, xmm0
+ vmovdqu [%%GDATA_CTX + AadHash], xmm0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
+%endif
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_vaes_avx512 / aes_gcm_precomp_192_vaes_avx512 /
+; aes_gcm_precomp_256_vaes_avx512 (struct gcm_key_data *key_data)
+; Encrypts an all-zero block to get the hash key, stores HashKey<<1 mod poly
+; at key_data + HashKey and then runs PRECOMPUTE on that value.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(precomp,_),function,)
+FN_NAME(precomp,_):
+;; Parameter is passed through register, so the NULL check runs before FUNC_SAVE
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_precomp
+%endif
+
+ FUNC_SAVE
+
+ vpxor xmm6, xmm6 ; xmm6 = all-zero block
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [rel SHUF_MASK] ; byte swap using SHUF_MASK
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1 ; shift each qword left by 1
+ vpsrlq xmm2, xmm2, 63 ; xmm2 = bit shifted out of each qword
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8 ; carry of the low qword moves into the high qword
+ vpsrldq xmm1, xmm1, 8 ; xmm1 = carry out of the high qword (bit 127)
+ vpor xmm6, xmm6, xmm2 ; xmm6 = HashKey << 1 as a 128-bit value
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b ; spread the carry dword for the compare below
+ vpcmpeqd xmm2, [rel TWOONE] ; per-dword mask (TWOONE is defined elsewhere - presumably the carry pattern)
+ vpand xmm2, xmm2, [rel POLY] ; keep the POLY bits selected by the mask
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+ FUNC_RESTORE
+exit_precomp:
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_vaes_avx512 / aes_gcm_init_192_vaes_avx512 / aes_gcm_init_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv, const u8 *aad, u64 aad_len);
+; Runs GCM_INIT on the context with the given IV and AAD.
+; With SAFE_PARAM: returns early if key_data, context_data or iv is NULL, or aad is NULL while aad_len != 0.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(init,_),function,)
+FN_NAME(init,_):
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_init
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_init
+
+ ;; Check IV != NULL
+ cmp arg3, 0
+ jz exit_init
+
+ ;; Check if aad_len == 0 (aad is then allowed to be NULL)
+ cmp arg5, 0
+ jz skip_aad_check_init
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg4, 0
+ jz exit_init
+
+skip_aad_check_init:
+%endif
+ GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+
+exit_init:
+
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_vaes_avx512 / aes_gcm_enc_192_update_vaes_avx512 /
+; aes_gcm_enc_256_update_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out, const u8 *in, u64 plaintext_len);
+; Multi-call update step: runs GCM_ENC_DEC in ENC mode (multi_call).
+; With SAFE_PARAM: returns early if key_data/context_data is NULL, or out/in is NULL while plaintext_len != 0.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_update_),function,)
+FN_NAME(enc,_update_):
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_enc
+
+ ;; Check if plaintext_len == 0 (out/in are then allowed to be NULL)
+ cmp arg5, 0
+ jz skip_in_out_check_update_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_enc
+
+skip_in_out_check_update_enc:
+%endif
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
+
+exit_update_enc:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_vaes_avx512 / aes_gcm_dec_192_update_vaes_avx512 /
+; aes_gcm_dec_256_update_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out, const u8 *in, u64 plaintext_len);
+; Multi-call update step: runs GCM_ENC_DEC in DEC mode (multi_call).
+; With SAFE_PARAM: returns early if key_data/context_data is NULL, or out/in is NULL while plaintext_len != 0.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_update_),function,)
+FN_NAME(dec,_update_):
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_dec
+
+ ;; Check if plaintext_len == 0 (out/in are then allowed to be NULL)
+ cmp arg5, 0
+ jz skip_in_out_check_update_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_dec
+
+skip_in_out_check_update_dec:
+%endif
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
+
+exit_update_dec:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_vaes_avx512 / aes_gcm_enc_192_finalize_vaes_avx512 /
+; aes_gcm_enc_256_finalize_vaes_avx512
+; (const struct gcm_key_data *key_data, struct gcm_context_data *context_data,
+; u8 *auth_tag, u64 auth_tag_len);
+; Multi-call final step: runs GCM_COMPLETE (ENC, multi_call) to write the tag to auth_tag.
+; With SAFE_PARAM: returns before FUNC_SAVE if a pointer is NULL or auth_tag_len is 0 or > 16.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_finalize_),function,)
+FN_NAME(enc,_finalize_):
+
+;; All parameters are passed through registers, so they can be checked before FUNC_SAVE
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_enc_fin
+
+ cmp arg4, 16
+ ja exit_enc_fin
+%endif
+
+ FUNC_SAVE
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
+
+ FUNC_RESTORE
+
+exit_enc_fin:
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_vaes_avx512 / aes_gcm_dec_192_finalize_vaes_avx512 /
+; aes_gcm_dec_256_finalize_vaes_avx512
+; (const struct gcm_key_data *key_data, struct gcm_context_data *context_data,
+; u8 *auth_tag, u64 auth_tag_len);
+; Multi-call final step: runs GCM_COMPLETE (DEC, multi_call) to write the tag to auth_tag.
+; With SAFE_PARAM: returns before FUNC_SAVE if a pointer is NULL or auth_tag_len is 0 or > 16.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_finalize_),function,)
+FN_NAME(dec,_finalize_):
+
+;; All parameters are passed through registers, so they can be checked before FUNC_SAVE
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_dec_fin
+
+ cmp arg4, 16
+ ja exit_dec_fin
+%endif
+
+ FUNC_SAVE
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
+
+ FUNC_RESTORE
+
+exit_dec_fin:
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_vaes_avx512 / aes_gcm_enc_192_vaes_avx512 / aes_gcm_enc_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad, u64 aad_len,
+; u8 *auth_tag, u64 auth_tag_len);
+; Single-call API: GCM_INIT, GCM_ENC_DEC (ENC) and GCM_COMPLETE run back to back (single_call).
+; With SAFE_PARAM: returns early on a NULL pointer that is required, or if auth_tag_len is 0 or > 16.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(enc,_),function,)
+FN_NAME(enc,_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_enc
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_enc
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_enc
+
+ cmp arg10, 16
+ ja exit_enc
+
+ ;; Check if plaintext_len == 0 (out/in are then allowed to be NULL)
+ cmp arg5, 0
+ jz skip_in_out_check_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_enc
+
+skip_in_out_check_enc:
+ ;; Check if aad_len == 0 (aad is then allowed to be NULL)
+ cmp arg8, 0
+ jz skip_aad_check_enc
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_enc
+
+skip_aad_check_enc:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
+
+exit_enc:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_vaes_avx512 / aes_gcm_dec_192_vaes_avx512 / aes_gcm_dec_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad, u64 aad_len,
+; u8 *auth_tag, u64 auth_tag_len);
+; Single-call API: GCM_INIT, GCM_ENC_DEC (DEC) and GCM_COMPLETE run back to back (single_call).
+; With SAFE_PARAM: returns early on a NULL pointer that is required, or if auth_tag_len is 0 or > 16.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+MKGLOBAL(FN_NAME(dec,_),function,)
+FN_NAME(dec,_):
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_dec
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_dec
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_dec
+
+ cmp arg10, 16
+ ja exit_dec
+
+ ;; Check if plaintext_len == 0 (out/in are then allowed to be NULL)
+ cmp arg5, 0
+ jz skip_in_out_check_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_dec
+
+skip_in_out_check_dec:
+ ;; Check if aad_len == 0 (aad is then allowed to be NULL)
+ cmp arg8, 0
+ jz skip_aad_check_dec
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_dec
+
+skip_aad_check_dec:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
+
+exit_dec:
+ FUNC_RESTORE
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits ; empty note section: this object does not need an executable stack
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_flush_avx512.asm
new file mode 100644
index 000000000..449229531
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_flush_avx512.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X16 aes_cbc_enc_192_vaes_avx512 ; AES-192 CBC encrypt routine name handed to the included file
+%define FLUSH_JOB_AES_ENC flush_job_aes192_enc_vaes_avx512 ; AES-192 name for the flush entry point
+%define NUM_KEYS 13 ; presumably the AES-192 round key count (12 rounds + 1) - confirm in the included file
+%include "avx512/mb_mgr_aes_flush_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_submit_avx512.asm
new file mode 100644
index 000000000..3bbb30158
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes192_submit_avx512.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X16 aes_cbc_enc_192_vaes_avx512 ; AES-192 CBC encrypt routine name handed to the included file
+%define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_vaes_avx512 ; AES-192 name for the submit entry point
+%define NUM_KEYS 13 ; presumably the AES-192 round key count (12 rounds + 1) - confirm in the included file
+%include "avx512/mb_mgr_aes_submit_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_flush_avx512.asm
new file mode 100644
index 000000000..2ff448393
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_flush_avx512.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define AES_CBC_ENC_X16 aes_cbc_enc_256_vaes_avx512 ; AES-256 CBC encrypt routine name handed to the included file
+%define FLUSH_JOB_AES_ENC flush_job_aes256_enc_vaes_avx512 ; AES-256 name for the flush entry point
+%define NUM_KEYS 15 ; presumably the AES-256 round key count (14 rounds + 1) - confirm in the included file
+%include "avx512/mb_mgr_aes_flush_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_submit_avx512.asm
new file mode 100644
index 000000000..4db4629e2
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes256_submit_avx512.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+; AES-256 instance of the shared submit template: define the three names it reads, then include it.
+%define AES_CBC_ENC_X16 aes_cbc_enc_256_vaes_avx512
+%define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_vaes_avx512
+%define NUM_KEYS 15 ; selects the 15-key path of the template's INSERT_KEYS macro
+%include "avx512/mb_mgr_aes_submit_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_flush_avx512.asm
new file mode 100644
index 000000000..4a52ed1e6
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_flush_avx512.asm
@@ -0,0 +1,320 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "constants.asm"
+%include "include/reg_sizes.asm"
+; Build-time parameters and register aliases for the flush routine defined further down.
+%ifndef AES_CBC_ENC_X16
+%define AES_CBC_ENC_X16 aes_cbc_enc_128_vaes_avx512
+%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_vaes_avx512
+%define NUM_KEYS 11
+%endif
+; (the three names above are only defaults: a file that defines AES_CBC_ENC_X16 before including this one overrides them)
+; void AES_CBC_ENC_X16(AES_ARGS *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X16
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2 ; same register as job; carries the byte count handed to AES_CBC_ENC_X16
+
+%define job_rax rax
+
+%if 1
+%define unused_lanes rbx
+%define tmp1 rbx ; shares rbx with unused_lanes
+
+%define good_lane rdx
+%define iv rdx
+
+%define tmp2 rax ; shares rax with job_rax
+
+; idx needs to be in rbp
+%define tmp rbp
+%define idx rbp ; shares rbp with tmp; its value is read again after the AES_CBC_ENC_X16 call
+
+%define tmp3 r8
+%define tmp4 r9
+%endif
+
+; copy the IV of one good lane into every NULL lane selected by the mask
+%macro COPY_IV_TO_NULL_LANES 4
+%define %%IDX %1 ; [in] GP with good lane idx (scaled x16)
+%define %%NULL_MASK %2 ; [clobbered] GP to store NULL lane mask
+%define %%XTMP %3 ; [clobbered] temp XMM reg
+%define %%MASK_REG %4 ; [in] mask register
+; Also reads the `state` alias; bt modifies the flags. Expands to 16 unrolled test-and-store steps.
+ vmovdqa64 %%XTMP, [state + _aes_args_IV + %%IDX] ; IV of the good lane
+ kmovw DWORD(%%NULL_MASK), %%MASK_REG
+%assign i 0
+%rep 16
+ bt %%NULL_MASK, i
+ jnc %%_skip_copy %+ i ; bit i clear: lane i is not a NULL lane, leave its IV alone
+ vmovdqa64 [state + _aes_args_IV + (i*16)], %%XTMP
+%%_skip_copy %+ i:
+%assign i (i + 1)
+%endrep
+
+%endmacro
+
+; clear (zero) the IV of every lane selected by the mask
+%macro CLEAR_IV_IN_NULL_LANES 3
+%define %%NULL_MASK %1 ; [clobbered] GP to store NULL lane mask
+%define %%XTMP %2 ; [clobbered] temp XMM reg
+%define %%MASK_REG %3 ; [in] mask register
+; Also reads the `state` alias; bt modifies the flags. Expands to 16 unrolled test-and-store steps.
+ vpxorq %%XTMP, %%XTMP ; all-zero value stored into the selected lanes
+ kmovw DWORD(%%NULL_MASK), %%MASK_REG
+%assign i 0
+%rep 16
+ bt %%NULL_MASK, i
+ jnc %%_skip_clear %+ i ; bit i clear: lane i is not selected, keep its IV
+ vmovdqa64 [state + _aes_args_IV + (i*16)], %%XTMP
+%%_skip_clear %+ i:
+%assign i (i + 1)
+%endrep
+
+%endmacro
+
+; copy the round keys of one good lane into every NULL lane selected by the mask
+%macro COPY_KEYS_TO_NULL_LANES 5
+%define %%IDX           %1 ; [in] GP with good lane idx (scaled x16)
+%define %%NULL_MASK     %2 ; [clobbered] GP to store NULL lane mask
+%define %%KEY_TAB       %3 ; [clobbered] GP to store key table pointer
+%define %%XTMP          %4 ; [clobbered] temp XMM reg
+%define %%MASK_REG      %5 ; [in] mask register
+; Also reads NUM_KEYS (rows to copy) and the `state` alias; bt modifies the flags.
+        lea             %%KEY_TAB, [state + _aes_args_key_tab]
+        kmovw           DWORD(%%NULL_MASK), %%MASK_REG
+%assign j 0 ; outer loop: byte offset of the current round-key row
+%rep NUM_KEYS ; only the round keys this key size defines, matching CLEAR_KEYS_IN_NULL_LANES
+        vmovdqa64       %%XTMP, [%%KEY_TAB + j + %%IDX] ; good lane's key in this row
+%assign k 0 ; inner loop to iterate through lanes
+%rep 16
+        bt              %%NULL_MASK, k
+        jnc             %%_skip_copy %+ j %+ _ %+ k ; lane k is not a NULL lane: keep its key
+        vmovdqa64       [%%KEY_TAB + j + (k*16)], %%XTMP
+%%_skip_copy %+ j %+ _ %+ k:
+%assign k (k + 1)
+%endrep
+
+%assign j (j + 256) ; next row: 16 lanes x 16 bytes
+%endrep
+
+%endmacro
+
+; clear (zero) the round keys of every lane selected by the mask
+%macro CLEAR_KEYS_IN_NULL_LANES 3
+%define %%NULL_MASK     %1 ; [clobbered] GP to store NULL lane mask
+%define %%XTMP          %2 ; [clobbered] temp XMM reg
+%define %%MASK_REG      %3 ; [in] mask register
+; Also reads NUM_KEYS (rows to clear) and the `state` alias; bt modifies the flags.
+        vpxorq          %%XTMP, %%XTMP
+        kmovw           DWORD(%%NULL_MASK), %%MASK_REG
+%assign k 0 ; outer loop to iterate through lanes
+%rep 16
+        bt              %%NULL_MASK, k
+        jnc             %%_skip_clear %+ k ; lane k not selected: keep its keys
+%assign j 0 ; inner loop to iterate through round keys
+%rep NUM_KEYS
+        vmovdqa64       [state + _aes_args_key_tab + j + (k*16)], %%XTMP ; state-relative offset, same symbol as the copy macro
+%assign j (j + 256)
+%endrep
+%%_skip_clear %+ k:
+%assign k (k + 1)
+%endrep
+
+%endmacro
+
+; STACK_SPACE needs to be an odd multiple of 8 (this frame is 9 qwords)
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15, plus rsi and rdi when LINUX is not defined
+_rsp_save: resq 1 ; rsp value on entry, reloaded just before ret
+endstruc
+
+; JOB_AES_HMAC *FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state)
+; arg 1 : state (arg 2 is not an input; its register is reused as len2)
+; Returns rax = job of the lane with the least data left, flagged
+; STS_COMPLETED_AES, or NULL when no lane is in use.
+MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal)
+FLUSH_JOB_AES_ENC:
+        mov     rax, rsp
+        sub     rsp, STACK_size
+        and     rsp, -16
+
+        mov     [rsp + _gpr_save + 8*0], rbx
+        mov     [rsp + _gpr_save + 8*1], rbp
+        mov     [rsp + _gpr_save + 8*2], r12
+        mov     [rsp + _gpr_save + 8*3], r13
+        mov     [rsp + _gpr_save + 8*4], r14
+        mov     [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+        mov     [rsp + _gpr_save + 8*6], rsi
+        mov     [rsp + _gpr_save + 8*7], rdi
+%endif
+        mov     [rsp + _rsp_save], rax  ; original SP
+
+        ; check for empty
+        cmp     qword [state + _aes_lanes_in_use], 0
+        je      return_null
+
+        ; find a lane with a non-null job
+        vpxord          zmm0, zmm0, zmm0
+        vmovdqu64       zmm1, [state + _aes_job_in_lane + (0*PTR_SZ)]
+        vmovdqu64       zmm2, [state + _aes_job_in_lane + (8*PTR_SZ)]
+        vpcmpq          k1, zmm1, zmm0, 4 ; NEQ
+        vpcmpq          k2, zmm2, zmm0, 4 ; NEQ
+        kmovw           DWORD(tmp), k1
+        kmovw           DWORD(tmp1), k2
+        mov             DWORD(tmp2), DWORD(tmp1)
+        shl             DWORD(tmp2), 8
+        or              DWORD(tmp2), DWORD(tmp) ; mask of non-null jobs in tmp2
+        not             BYTE(tmp)
+        kmovw           k4, DWORD(tmp)          ; k4 = NULL lanes among 0-7
+        not             BYTE(tmp1)
+        kmovw           k5, DWORD(tmp1)         ; k5 = NULL lanes among 8-15
+        mov             DWORD(tmp), DWORD(tmp2)
+        not             WORD(tmp)
+        kmovw           k6, DWORD(tmp)          ; mask of NULL jobs in k4, k5 and k6
+        mov             DWORD(tmp), DWORD(tmp2)
+        xor             tmp2, tmp2
+        bsf             WORD(tmp2), WORD(tmp)   ; index of the 1st set bit in tmp2
+
+        ;; copy good lane data into NULL lanes
+        mov             tmp, [state + _aes_args_in + tmp2*8]
+        vpbroadcastq    zmm1, tmp
+        vmovdqa64       [state + _aes_args_in + (0*PTR_SZ)]{k4}, zmm1
+        vmovdqa64       [state + _aes_args_in + (8*PTR_SZ)]{k5}, zmm1
+        ;; - out pointer
+        mov             tmp, [state + _aes_args_out + tmp2*8]
+        vpbroadcastq    zmm1, tmp
+        vmovdqa64       [state + _aes_args_out + (0*PTR_SZ)]{k4}, zmm1
+        vmovdqa64       [state + _aes_args_out + (8*PTR_SZ)]{k5}, zmm1
+
+        ;; - set len to UINT16_MAX
+        mov             WORD(tmp), 0xffff
+        vpbroadcastw    ymm3, WORD(tmp)
+        vmovdqa64       ymm0, [state + _aes_lens]
+        vmovdqu16       ymm0{k6}, ymm3
+        vmovdqa64       [state + _aes_lens], ymm0
+
+        ;; Find min length for lanes 0-7
+        vphminposuw     xmm2, xmm0
+
+        ;; scale up good lane idx before copying IV and keys
+        shl             tmp2, 4
+        ;; - copy IV to null lanes
+        COPY_IV_TO_NULL_LANES tmp2, tmp1, xmm4, k6
+
+        ; extract min length of lanes 0-7
+        vpextrw         DWORD(len2), xmm2, 0    ; min value
+        vpextrw         DWORD(idx), xmm2, 1     ; min index
+
+        ;; - copy round keys to null lanes
+        COPY_KEYS_TO_NULL_LANES tmp2, tmp1, tmp3, xmm4, k6
+
+        ;; Update lens and find min for lanes 8-15
+        vextracti128    xmm1, ymm0, 1
+        vphminposuw     xmm2, xmm1
+        vpextrw         DWORD(tmp3), xmm2, 0    ; min value
+        cmp             DWORD(len2), DWORD(tmp3)
+        jbe             use_min                 ; lengths are unsigned 16-bit values
+        vpextrw         DWORD(idx), xmm2, 1     ; min index
+        add             DWORD(idx), 8           ; but index +8
+        mov             len2, tmp3              ; min len
+use_min:
+        test            len2, len2              ; shortest lane has nothing left?
+        jz              len_is_0                ; then skip the cipher call, as submit does
+        vpbroadcastw    ymm3, WORD(len2)
+        vpsubw          ymm0, ymm0, ymm3
+        vmovdqa         [state + _aes_lens], ymm0
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call    AES_CBC_ENC_X16
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        mov     job_rax, [state + _aes_job_in_lane + idx*8]
+        mov     unused_lanes, [state + _aes_unused_lanes]
+        mov     qword [state + _aes_job_in_lane + idx*8], 0
+        or      dword [job_rax + _status], STS_COMPLETED_AES
+        shl     unused_lanes, 4                 ; push idx onto the 4-bit-per-entry free list
+        or      unused_lanes, idx
+        mov     [state + _aes_unused_lanes], unused_lanes
+        sub     qword [state + _aes_lanes_in_use], 1
+
+%ifdef SAFE_DATA
+        ; Set bit of lane of returned job
+        xor     DWORD(tmp3), DWORD(tmp3)
+        bts     DWORD(tmp3), DWORD(idx)
+        kmovw   k1, DWORD(tmp3)
+        korw    k6, k1, k6
+
+        ;; Clear IV and expanded keys of returned job and "NULL lanes"
+        ;; (k6 contains the mask of the jobs)
+        CLEAR_IV_IN_NULL_LANES tmp1, xmm0, k6
+        CLEAR_KEYS_IN_NULL_LANES tmp1, xmm0, k6
+%endif
+
+return:
+        mov     rbx, [rsp + _gpr_save + 8*0]
+        mov     rbp, [rsp + _gpr_save + 8*1]
+        mov     r12, [rsp + _gpr_save + 8*2]
+        mov     r13, [rsp + _gpr_save + 8*3]
+        mov     r14, [rsp + _gpr_save + 8*4]
+        mov     r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+        mov     rsi, [rsp + _gpr_save + 8*6]
+        mov     rdi, [rsp + _gpr_save + 8*7]
+%endif
+        mov     rsp, [rsp + _rsp_save]  ; original SP
+        ret
+
+return_null:
+        xor     job_rax, job_rax
+        jmp     return
+; LINUX builds only: emit an empty .note.GNU-stack section (by GNU toolchain convention this marks the object as not needing an executable stack).
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_submit_avx512.asm
new file mode 100644
index 000000000..f79d15f68
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_aes_submit_avx512.asm
@@ -0,0 +1,280 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+%ifndef AES_CBC_ENC_X16
+%define AES_CBC_ENC_X16 aes_cbc_enc_128_vaes_avx512
+%define NUM_KEYS 11
+%define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_vaes_avx512
+%endif
+; (the three names above are only defaults: a file that defines AES_CBC_ENC_X16 before including this one overrides them)
+; void AES_CBC_ENC_X16(AES_ARGS_x16 *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X16
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2 ; same register as job; only written after the last read of a job field
+
+%define job_rax rax
+
+%if 1
+; idx needs to be in rbp
+%define len rbp
+%define idx rbp ; shares rbp with len; its value is read again after the AES_CBC_ENC_X16 call
+%define tmp r10
+%define tmp2 r11
+%define tmp3 r12
+
+%define lane r8
+
+%define iv r9
+
+%define unused_lanes rbx
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8 (this frame is 9 qwords)
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15, plus rsi and rdi when LINUX is not defined
+_rsp_save: resq 1 ; rsp value on entry, reloaded just before ret
+endstruc
+
+; Copy %%NKEYS consecutive 16-byte round keys from [%%KP] into lane %%LANE of the key table at [state + _aes_args_key_tab].
+%macro INSERT_KEYS 6
+%define %%KP %1 ; [in] GP reg with pointer to expanded keys
+%define %%LANE %2 ; [in] GP reg with lane number
+%define %%NKEYS %3 ; [in] number of round keys (numerical value)
+%define %%COL %4 ; [clobbered] GP reg
+%define %%ZTMP %5 ; [clobbered] ZMM reg
+%define %%IA0 %6 ; [clobbered] GP reg
+; Also overwrites k1 for each handled %%NKEYS value (11, 13, 15) and reads the `state` alias.
+
+%assign ROW (16*16) ; distance between consecutive round keys of one lane: 16 lanes x 16 bytes
+
+ mov %%COL, %%LANE
+ shl %%COL, 4 ; lane number -> byte offset of the lane's 16-byte slot
+ lea %%IA0, [state + _aes_args_key_tab]
+ add %%COL, %%IA0 ; %%COL = address of this lane's slot for round key 0
+
+ vmovdqu64 %%ZTMP, [%%KP] ; round keys 0-3
+ vextracti64x2 [%%COL + ROW*0], %%ZTMP, 0
+ vextracti64x2 [%%COL + ROW*1], %%ZTMP, 1
+ vextracti64x2 [%%COL + ROW*2], %%ZTMP, 2
+ vextracti64x2 [%%COL + ROW*3], %%ZTMP, 3
+
+ vmovdqu64 %%ZTMP, [%%KP + 64] ; round keys 4-7
+ vextracti64x2 [%%COL + ROW*4], %%ZTMP, 0
+ vextracti64x2 [%%COL + ROW*5], %%ZTMP, 1
+ vextracti64x2 [%%COL + ROW*6], %%ZTMP, 2
+ vextracti64x2 [%%COL + ROW*7], %%ZTMP, 3
+
+%if %%NKEYS > 11 ; 192 or 256 - copy 4 more keys
+ vmovdqu64 %%ZTMP, [%%KP + 128]
+ vextracti64x2 [%%COL + ROW*11], %%ZTMP, 3
+%else ; 128 - copy 3 more keys
+ mov %%IA0, 0x3f
+ kmovq k1, %%IA0
+ vmovdqu64 %%ZTMP{k1}{z}, [%%KP + 128] ; masked load: only 6 qwords (keys 8-10) are read, the rest is zeroed
+%endif
+ vextracti64x2 [%%COL + ROW*8], %%ZTMP, 0
+ vextracti64x2 [%%COL + ROW*9], %%ZTMP, 1
+ vextracti64x2 [%%COL + ROW*10], %%ZTMP, 2
+
+%if %%NKEYS == 15 ; 256 - 3 more keys
+ mov %%IA0, 0x3f
+ kmovq k1, %%IA0
+ vmovdqu64 %%ZTMP{k1}{z}, [%%KP + 192] ; masked load: only 6 qwords (keys 12-14) are read
+ vextracti64x2 [%%COL + ROW*12], %%ZTMP, 0
+ vextracti64x2 [%%COL + ROW*13], %%ZTMP, 1
+ vextracti64x2 [%%COL + ROW*14], %%ZTMP, 2
+%elif %%NKEYS == 13 ; 192 - 1 more key
+ mov %%IA0, 0x3
+ kmovq k1, %%IA0
+ vmovdqu64 %%ZTMP{k1}{z}, [%%KP + 192] ; masked load: only 2 qwords (key 12) are read
+ vextracti64x2 [%%COL + ROW*12], %%ZTMP, 0
+%endif
+%endmacro
+
+; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+; Places the job in a free lane. Returns NULL in rax unless all 16 lanes are
+; then in use; in that case the shortest lane's job is finished and returned.
+MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal)
+SUBMIT_JOB_AES_ENC:
+        mov     rax, rsp
+        sub     rsp, STACK_size
+        and     rsp, -16
+
+        mov     [rsp + _gpr_save + 8*0], rbx
+        mov     [rsp + _gpr_save + 8*1], rbp
+        mov     [rsp + _gpr_save + 8*2], r12
+        mov     [rsp + _gpr_save + 8*3], r13
+        mov     [rsp + _gpr_save + 8*4], r14
+        mov     [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+        mov     [rsp + _gpr_save + 8*6], rsi
+        mov     [rsp + _gpr_save + 8*7], rdi
+%endif
+        mov     [rsp + _rsp_save], rax  ; original SP
+
+        mov     unused_lanes, [state + _aes_unused_lanes]
+        mov     lane, unused_lanes
+        and     lane, 0xF               ; lowest nibble = next free lane
+        shr     unused_lanes, 4         ; pop it from the free list
+        mov     len, [job + _msg_len_to_cipher_in_bytes]
+        and     len, -16                ; DOCSIS may pass size unaligned to block size
+        mov     iv, [job + _iv]
+        mov     [state + _aes_unused_lanes], unused_lanes
+        add     qword [state + _aes_lanes_in_use], 1
+
+        mov     [state + _aes_job_in_lane + lane*8], job
+
+        ;; Update lane len
+        vmovdqa64       ymm0, [state + _aes_lens]
+        mov             tmp2, rcx       ; save rcx
+        mov             rcx, lane
+        mov             tmp, 1
+        shl             tmp, cl         ; tmp = 1 << lane
+        mov             rcx, tmp2       ; restore rcx
+        kmovq           k1, tmp
+
+        vpbroadcastw    ymm1, WORD(len) ; only the low 16 bits of len are kept
+        vmovdqu16       ymm0{k1}, ymm1
+        vmovdqa64       [state + _aes_lens], ymm0
+
+        ;; Find min length for lanes 0-7
+        vphminposuw     xmm2, xmm0
+
+        ;; Update input pointer
+        mov     tmp, [job + _src]
+        add     tmp, [job + _cipher_start_src_offset_in_bytes]
+        vmovdqu xmm1, [iv]
+        mov     [state + _aes_args_in + lane*8], tmp
+
+        ;; Insert expanded keys
+        mov     tmp, [job + _aes_enc_key_expanded]
+        INSERT_KEYS tmp, lane, NUM_KEYS, tmp2, zmm4, tmp3
+
+        ;; Update output pointer
+        mov     tmp, [job + _dst]
+        mov     [state + _aes_args_out + lane*8], tmp
+        shl     lane, 4                 ; multiply by 16
+        vmovdqa [state + _aes_args_IV + lane], xmm1
+
+        cmp     qword [state + _aes_lanes_in_use], 16
+        jne     return_null
+
+        ; Find min length for lanes 8-15
+        vpextrw         DWORD(len2), xmm2, 0    ; min value
+        vpextrw         DWORD(idx), xmm2, 1     ; min index
+        vextracti128    xmm1, ymm0, 1
+        vphminposuw     xmm2, xmm1
+        vpextrw         DWORD(tmp), xmm2, 0     ; min value
+        cmp             DWORD(len2), DWORD(tmp)
+        jbe             use_min                 ; lengths are unsigned 16-bit values
+        vpextrw         DWORD(idx), xmm2, 1     ; min index
+        add             DWORD(idx), 8           ; but index +8
+        mov             len2, tmp               ; min len
+use_min:
+        test            len2, len2              ; shortest lane has nothing left?
+        jz              len_is_0                ; then skip the cipher call
+
+        vpbroadcastw    ymm3, WORD(len2)
+        vpsubw          ymm0, ymm0, ymm3
+        vmovdqa         [state + _aes_lens], ymm0
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call    AES_CBC_ENC_X16
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        mov     job_rax, [state + _aes_job_in_lane + idx*8]
+
+        mov     unused_lanes, [state + _aes_unused_lanes]
+        mov     qword [state + _aes_job_in_lane + idx*8], 0
+        or      dword [job_rax + _status], STS_COMPLETED_AES
+        shl     unused_lanes, 4                 ; push idx onto the 4-bit-per-entry free list
+        or      unused_lanes, idx
+
+        mov     [state + _aes_unused_lanes], unused_lanes
+        sub     qword [state + _aes_lanes_in_use], 1
+
+%ifdef SAFE_DATA
+        ;; Clear IV
+        vpxorq  xmm0, xmm0
+        shl     idx, 4                  ; multiply by 16
+        vmovdqa [state + _aes_args_IV + idx], xmm0
+
+        ;; Clear expanded keys
+%assign round 0
+%rep NUM_KEYS
+        vmovdqa [state + _aes_args_key_tab + round * (16*16) + idx], xmm0 ; state-relative offset, same symbol as INSERT_KEYS
+%assign round (round + 1)
+%endrep
+
+%endif
+
+return:
+        mov     rbx, [rsp + _gpr_save + 8*0]
+        mov     rbp, [rsp + _gpr_save + 8*1]
+        mov     r12, [rsp + _gpr_save + 8*2]
+        mov     r13, [rsp + _gpr_save + 8*3]
+        mov     r14, [rsp + _gpr_save + 8*4]
+        mov     r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+        mov     rsi, [rsp + _gpr_save + 8*6]
+        mov     rdi, [rsp + _gpr_save + 8*7]
+%endif
+        mov     rsp, [rsp + _rsp_save]  ; original SP
+
+        ret
+
+return_null:
+        xor     job_rax, job_rax
+        jmp     return
+; LINUX builds only: emit an empty .note.GNU-stack section (by GNU toolchain convention this marks the object as not needing an executable stack).
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_avx512.c b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_avx512.c
new file mode 100644
index 000000000..bd1aaef63
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_avx512.c
@@ -0,0 +1,1066 @@
+/*******************************************************************************
+ Copyright (c) 2012-2019, Intel Corporation
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define AVX512
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_zmms
+
+#include "intel-ipsec-mb.h"
+#include "include/kasumi_internal.h"
+#include "include/zuc_internal.h"
+#include "include/snow3g.h"
+
+#include "save_xmms.h"
+#include "asm.h"
+#include "des.h"
+#include "gcm.h"
+#include "cpu_feature.h"
+#include "noaesni.h"
+/* Submit/flush entry points of the cipher managers; flush variants take only the manager state. */
+JOB_AES_HMAC *submit_job_aes128_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes128_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes192_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes192_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes256_enc_avx(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes256_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state);
+/* VAES CBC-encrypt managers: the aes128/aes256 symbols are the names emitted by the avx512/mb_mgr_aes_{submit,flush}_avx512.asm templates. */
+JOB_AES_HMAC *submit_job_aes128_enc_vaes_avx512(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes128_enc_vaes_avx512(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes192_enc_vaes_avx512(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes192_enc_vaes_avx512(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes256_enc_vaes_avx512(MB_MGR_AES_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes256_enc_vaes_avx512(MB_MGR_AES_OOO *state);
+/* DES, 3DES and DOCSIS-DES managers (all take MB_MGR_DES_OOO). */
+JOB_AES_HMAC *submit_job_des_cbc_enc_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_des_cbc_enc_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_des_cbc_dec_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_des_cbc_dec_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_3des_cbc_enc_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_3des_cbc_enc_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_3des_cbc_dec_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_3des_cbc_dec_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_docsis_des_enc_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_docsis_des_enc_avx512(MB_MGR_DES_OOO *state);
+
+JOB_AES_HMAC *submit_job_docsis_des_dec_avx512(MB_MGR_DES_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_docsis_des_dec_avx512(MB_MGR_DES_OOO *state);
+/* Counter-mode submit routines take the job only (no manager state). */
+JOB_AES_HMAC *submit_job_aes_cntr_avx(JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *submit_job_aes_cntr_bit_avx(JOB_AES_HMAC *job);
+/* Name bindings: generic upper-case identifiers are mapped to the symbols chosen for this AVX512 build; several point at _avx routines. */
+#define SAVE_XMMS save_xmms_avx
+#define RESTORE_XMMS restore_xmms_avx
+/* NOTE(review): the *_avx512 AES CBC/CNTR names below have no prototype in the part of the file shown above - presumably defined further down; confirm. */
+#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_avx512
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx512
+#define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_avx512
+
+#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_avx512
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx512
+#define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_avx512
+
+#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_avx512
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx512
+#define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_avx512
+
+#define SUBMIT_JOB_AES_ECB_128_ENC submit_job_aes_ecb_128_enc_avx
+#define SUBMIT_JOB_AES_ECB_128_DEC submit_job_aes_ecb_128_dec_avx
+#define SUBMIT_JOB_AES_ECB_192_ENC submit_job_aes_ecb_192_enc_avx
+#define SUBMIT_JOB_AES_ECB_192_DEC submit_job_aes_ecb_192_dec_avx
+#define SUBMIT_JOB_AES_ECB_256_ENC submit_job_aes_ecb_256_enc_avx
+#define SUBMIT_JOB_AES_ECB_256_DEC submit_job_aes_ecb_256_dec_avx
+
+#define SUBMIT_JOB_AES_CNTR submit_job_aes_cntr_avx512
+#define SUBMIT_JOB_AES_CNTR_BIT submit_job_aes_cntr_bit_avx512
+
+#define AES_CBC_DEC_128 aes_cbc_dec_128_avx512
+#define AES_CBC_DEC_192 aes_cbc_dec_192_avx512
+#define AES_CBC_DEC_256 aes_cbc_dec_256_avx512
+
+#define AES_CNTR_128 aes_cntr_128_avx
+#define AES_CNTR_192 aes_cntr_192_avx
+#define AES_CNTR_256 aes_cntr_256_avx
+
+#define AES_CNTR_CCM_128 aes_cntr_ccm_128_avx
+
+#define AES_ECB_ENC_128 aes_ecb_enc_128_avx
+#define AES_ECB_ENC_192 aes_ecb_enc_192_avx
+#define AES_ECB_ENC_256 aes_ecb_enc_256_avx
+#define AES_ECB_DEC_128 aes_ecb_dec_128_avx
+#define AES_ECB_DEC_192 aes_ecb_dec_192_avx
+#define AES_ECB_DEC_256 aes_ecb_dec_256_avx
+
+#define SUBMIT_JOB_PON_ENC submit_job_pon_enc_avx
+#define SUBMIT_JOB_PON_DEC submit_job_pon_dec_avx
+#define SUBMIT_JOB_PON_ENC_NO_CTR submit_job_pon_enc_no_ctr_avx
+#define SUBMIT_JOB_PON_DEC_NO_CTR submit_job_pon_dec_no_ctr_avx
+
+#define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx
+#define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx
+
+#define SUBMIT_JOB_DES_CBC_ENC submit_job_des_cbc_enc_avx512
+#define FLUSH_JOB_DES_CBC_ENC flush_job_des_cbc_enc_avx512
+
+#define SUBMIT_JOB_DES_CBC_DEC submit_job_des_cbc_dec_avx512
+#define FLUSH_JOB_DES_CBC_DEC flush_job_des_cbc_dec_avx512
+
+#define SUBMIT_JOB_3DES_CBC_ENC submit_job_3des_cbc_enc_avx512
+#define FLUSH_JOB_3DES_CBC_ENC flush_job_3des_cbc_enc_avx512
+
+#define SUBMIT_JOB_3DES_CBC_DEC submit_job_3des_cbc_dec_avx512
+#define FLUSH_JOB_3DES_CBC_DEC flush_job_3des_cbc_dec_avx512
+
+#define SUBMIT_JOB_DOCSIS_DES_ENC submit_job_docsis_des_enc_avx512
+#define FLUSH_JOB_DOCSIS_DES_ENC flush_job_docsis_des_enc_avx512
+
+#define SUBMIT_JOB_DOCSIS_DES_DEC submit_job_docsis_des_dec_avx512
+#define FLUSH_JOB_DOCSIS_DES_DEC flush_job_docsis_des_dec_avx512
+/* Generic AES dispatch names, bound to *_AVX512 identifiers that are not defined in the part of the file shown here. */
+#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_AVX512
+#define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_AVX512
+#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_AVX512
+/* Submit/flush entry points of the authentication managers: HMAC-SHA-1/224/256/384/512, HMAC-MD5, AES-CMAC and AES-CCM. */
+JOB_AES_HMAC *submit_job_hmac_avx512(MB_MGR_HMAC_SHA_1_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_avx512(MB_MGR_HMAC_SHA_1_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_384_avx512(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_384_avx512(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_512_avx512(MB_MGR_HMAC_SHA_512_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_512_avx512(MB_MGR_HMAC_SHA_512_OOO *state);
+/* HMAC-MD5 is declared with the avx2 suffix, CMAC and CCM with the avx suffix. */
+JOB_AES_HMAC *submit_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state,
+ JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_md5_avx2(MB_MGR_HMAC_MD5_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state,
+ JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state);
+/* Name bindings for the HMAC managers, the GCM primitives and the top-level manager API of this AVX512 build. */
+#define SUBMIT_JOB_HMAC submit_job_hmac_avx512
+#define FLUSH_JOB_HMAC flush_job_hmac_avx512
+#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_avx512
+#define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_avx512
+#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_avx512
+#define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_avx512
+#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_avx512
+#define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_avx512
+#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_avx512
+#define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_avx512
+#define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_avx2
+#define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_avx2
+
+#ifndef NO_GCM
+#define AES_GCM_DEC_128 aes_gcm_dec_128_avx512
+#define AES_GCM_ENC_128 aes_gcm_enc_128_avx512
+#define AES_GCM_DEC_192 aes_gcm_dec_192_avx512
+#define AES_GCM_ENC_192 aes_gcm_enc_192_avx512
+#define AES_GCM_DEC_256 aes_gcm_dec_256_avx512
+#define AES_GCM_ENC_256 aes_gcm_enc_256_avx512
+/* The _VAES names select the vaes_avx512 variants of the same six primitives. */
+#define AES_GCM_DEC_128_VAES aes_gcm_dec_128_vaes_avx512
+#define AES_GCM_ENC_128_VAES aes_gcm_enc_128_vaes_avx512
+#define AES_GCM_DEC_192_VAES aes_gcm_dec_192_vaes_avx512
+#define AES_GCM_ENC_192_VAES aes_gcm_enc_192_vaes_avx512
+#define AES_GCM_DEC_256_VAES aes_gcm_dec_256_vaes_avx512
+#define AES_GCM_ENC_256_VAES aes_gcm_enc_256_vaes_avx512
+/* Both flush names below are bound to the same symbol, flush_job_aes_gcm_avx512. */
+#define SUBMIT_JOB_AES_GCM_DEC submit_job_aes_gcm_dec_avx512
+#define FLUSH_JOB_AES_GCM_DEC flush_job_aes_gcm_avx512
+#define SUBMIT_JOB_AES_GCM_ENC submit_job_aes_gcm_enc_avx512
+#define FLUSH_JOB_AES_GCM_ENC flush_job_aes_gcm_avx512
+#endif /* NO_GCM */
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB submit_job_avx512
+#define FLUSH_JOB flush_job_avx512
+#define QUEUE_SIZE queue_size_avx512
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx512
+#define GET_NEXT_JOB get_next_job_avx512
+#define GET_COMPLETED_JOB get_completed_job_avx512
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AVX512
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AVX512
+
+/* ====================================================================== */
+
+#define AES_CFB_128_ONE aes_cfb_128_one_avx512
+/* CBC-MAC primitive bound to AES128_CBC_MAC below; it takes an AES_ARGS block and a length. */
+void aes128_cbc_mac_x8(AES_ARGS *args, uint64_t len);
+
+#define AES128_CBC_MAC aes128_cbc_mac_x8
+
+#define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_avx
+#define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_avx
+
+#define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_avx
+#define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_avx
+
+/* ====================================================================== */
+
+/*
+ * GCM submit / flush API for AVX512 arch
+ */
+#ifndef NO_GCM
+static JOB_AES_HMAC *
+plain_submit_gcm_dec_avx512(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        /* One-call AES-GCM decrypt of a single job via the non-VAES routines. */
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+        (void) state; /* the manager handle is not used here */
+
+        switch (job->aes_key_len_in_bytes) {
+        case 16: /* AES-128 key */
+                AES_GCM_DEC_128(job->aes_dec_key_expanded, &ctx, job->dst,
+                        job->src + job->cipher_start_src_offset_in_bytes,
+                        job->msg_len_to_cipher_in_bytes, job->iv,
+                        job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                        job->auth_tag_output,
+                        job->auth_tag_output_len_in_bytes);
+                break;
+        case 24: /* AES-192 key */
+                AES_GCM_DEC_192(job->aes_dec_key_expanded, &ctx, job->dst,
+                        job->src + job->cipher_start_src_offset_in_bytes,
+                        job->msg_len_to_cipher_in_bytes, job->iv,
+                        job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                        job->auth_tag_output,
+                        job->auth_tag_output_len_in_bytes);
+                break;
+        default: /* every other length is handled as a 32-byte key */
+                AES_GCM_DEC_256(job->aes_dec_key_expanded, &ctx, job->dst,
+                        job->src + job->cipher_start_src_offset_in_bytes,
+                        job->msg_len_to_cipher_in_bytes, job->iv,
+                        job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                        job->auth_tag_output,
+                        job->auth_tag_output_len_in_bytes);
+                break;
+        }
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * Process one AES-GCM encrypt job in a single call through the
+ * AES_GCM_ENC_{128,192,256} routines.
+ *
+ * The routine is picked from job->aes_key_len_in_bytes; any value other
+ * than 16 or 24 is handled as a 32-byte key.  A local gcm_context_data
+ * is used as the context.  On return the job status is STS_COMPLETED
+ * and the same job pointer is handed back.  "state" is not used.
+ */
+static JOB_AES_HMAC *
+plain_submit_gcm_enc_avx512(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+        (void) state;
+
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_GCM_ENC_128(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        case 24:
+                AES_GCM_ENC_192(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        default: /* every other length is treated as a 32-byte key */
+                AES_GCM_ENC_256(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+                break;
+        }
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * VAES flavour of the single-call AES-GCM decrypt submit: identical in
+ * shape to the plain variant but dispatches to the
+ * AES_GCM_DEC_{128,192,256}_VAES routines.
+ *
+ * The routine is picked from job->aes_key_len_in_bytes; any value other
+ * than 16 or 24 is handled as a 32-byte key.  On return the job status
+ * is STS_COMPLETED and the same job pointer is handed back.  "state" is
+ * not used.
+ */
+static JOB_AES_HMAC *
+vaes_submit_gcm_dec_avx512(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+        (void) state;
+
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_GCM_DEC_128_VAES(job->aes_dec_key_expanded, &ctx,
+                                     job->dst,
+                                     job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->msg_len_to_cipher_in_bytes,
+                                     job->iv,
+                                     job->u.GCM.aad,
+                                     job->u.GCM.aad_len_in_bytes,
+                                     job->auth_tag_output,
+                                     job->auth_tag_output_len_in_bytes);
+                break;
+        case 24:
+                AES_GCM_DEC_192_VAES(job->aes_dec_key_expanded, &ctx,
+                                     job->dst,
+                                     job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->msg_len_to_cipher_in_bytes,
+                                     job->iv,
+                                     job->u.GCM.aad,
+                                     job->u.GCM.aad_len_in_bytes,
+                                     job->auth_tag_output,
+                                     job->auth_tag_output_len_in_bytes);
+                break;
+        default: /* every other length is treated as a 32-byte key */
+                AES_GCM_DEC_256_VAES(job->aes_dec_key_expanded, &ctx,
+                                     job->dst,
+                                     job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->msg_len_to_cipher_in_bytes,
+                                     job->iv,
+                                     job->u.GCM.aad,
+                                     job->u.GCM.aad_len_in_bytes,
+                                     job->auth_tag_output,
+                                     job->auth_tag_output_len_in_bytes);
+                break;
+        }
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * VAES flavour of the single-call AES-GCM encrypt submit: identical in
+ * shape to the plain variant but dispatches to the
+ * AES_GCM_ENC_{128,192,256}_VAES routines.
+ *
+ * The routine is picked from job->aes_key_len_in_bytes; any value other
+ * than 16 or 24 is handled as a 32-byte key.  On return the job status
+ * is STS_COMPLETED and the same job pointer is handed back.  "state" is
+ * not used.
+ */
+static JOB_AES_HMAC *
+vaes_submit_gcm_enc_avx512(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+        (void) state;
+
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                AES_GCM_ENC_128_VAES(job->aes_enc_key_expanded, &ctx,
+                                     job->dst,
+                                     job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->msg_len_to_cipher_in_bytes,
+                                     job->iv,
+                                     job->u.GCM.aad,
+                                     job->u.GCM.aad_len_in_bytes,
+                                     job->auth_tag_output,
+                                     job->auth_tag_output_len_in_bytes);
+                break;
+        case 24:
+                AES_GCM_ENC_192_VAES(job->aes_enc_key_expanded, &ctx,
+                                     job->dst,
+                                     job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->msg_len_to_cipher_in_bytes,
+                                     job->iv,
+                                     job->u.GCM.aad,
+                                     job->u.GCM.aad_len_in_bytes,
+                                     job->auth_tag_output,
+                                     job->auth_tag_output_len_in_bytes);
+                break;
+        default: /* every other length is treated as a 32-byte key */
+                AES_GCM_ENC_256_VAES(job->aes_enc_key_expanded, &ctx,
+                                     job->dst,
+                                     job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->msg_len_to_cipher_in_bytes,
+                                     job->iv,
+                                     job->u.GCM.aad,
+                                     job->u.GCM.aad_len_in_bytes,
+                                     job->auth_tag_output,
+                                     job->auth_tag_output_len_in_bytes);
+                break;
+        }
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * Flush handler for AES-GCM.  It never has a job to hand back: it
+ * ignores both arguments and always returns NULL.  (The GCM submit
+ * routines in this file mark each job STS_COMPLETED before returning.)
+ */
+static JOB_AES_HMAC *
+flush_job_aes_gcm_avx512(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        (void) job;
+        (void) state;
+
+        return NULL;
+}
+
+/* Signature shared by the AES-GCM submit routines in this file. */
+typedef JOB_AES_HMAC *(*avx512_gcm_submit_fn)(MB_MGR *state,
+                                              JOB_AES_HMAC *job);
+
+/*
+ * File-scope AES-GCM submit dispatch pointers.  They start out on the
+ * plain implementations; init_mb_mgr_avx512() repoints them to the VAES
+ * variants when both VAES and VPCLMULQDQ are reported.
+ */
+static avx512_gcm_submit_fn submit_job_aes_gcm_enc_avx512 =
+        plain_submit_gcm_enc_avx512;
+
+static avx512_gcm_submit_fn submit_job_aes_gcm_dec_avx512 =
+        plain_submit_gcm_dec_avx512;
+
+#endif /* NO_GCM */
+
+/* Signature shared by the AES-CNTR submit routines. */
+typedef JOB_AES_HMAC *(*avx512_cntr_submit_fn)(JOB_AES_HMAC *job);
+
+/*
+ * File-scope AES-CNTR submit dispatch pointers (byte-length and
+ * bit-length variants).  They start out on the AVX implementations;
+ * init_mb_mgr_avx512() repoints them when VAES is reported.
+ */
+static avx512_cntr_submit_fn submit_job_aes_cntr_avx512 =
+        submit_job_aes_cntr_avx;
+static avx512_cntr_submit_fn submit_job_aes_cntr_bit_avx512 =
+        submit_job_aes_cntr_bit_avx;
+
+/*
+ * Submit one AES-CNTR job to the VAES implementation matching
+ * job->aes_key_len_in_bytes (any value other than 16 or 24 is handled
+ * as a 32-byte key), then OR STS_COMPLETED_AES into the job status and
+ * return the job.
+ */
+static JOB_AES_HMAC *
+vaes_submit_cntr_avx512(JOB_AES_HMAC *job)
+{
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                aes_cntr_128_submit_vaes_avx512(job);
+                break;
+        case 24:
+                aes_cntr_192_submit_vaes_avx512(job);
+                break;
+        default: /* every other length is treated as a 32-byte key */
+                aes_cntr_256_submit_vaes_avx512(job);
+                break;
+        }
+
+        job->status |= STS_COMPLETED_AES;
+        return job;
+}
+
+/*
+ * Submit one AES-CNTR (bit-length variant) job to the VAES
+ * implementation matching job->aes_key_len_in_bytes (any value other
+ * than 16 or 24 is handled as a 32-byte key), then OR
+ * STS_COMPLETED_AES into the job status and return the job.
+ */
+static JOB_AES_HMAC *
+vaes_submit_cntr_bit_avx512(JOB_AES_HMAC *job)
+{
+        switch (job->aes_key_len_in_bytes) {
+        case 16:
+                aes_cntr_bit_128_submit_vaes_avx512(job);
+                break;
+        case 24:
+                aes_cntr_bit_192_submit_vaes_avx512(job);
+                break;
+        default: /* every other length is treated as a 32-byte key */
+                aes_cntr_bit_256_submit_vaes_avx512(job);
+                break;
+        }
+
+        job->status |= STS_COMPLETED_AES;
+        return job;
+}
+
+/* ====================================================================== */
+
+/*
+ * Signatures of the AES encrypt submit/flush routines and of the
+ * AES-CBC decrypt routines that are selected at init time.
+ */
+typedef JOB_AES_HMAC *(*avx512_aes_enc_submit_fn)(MB_MGR_AES_OOO *state,
+                                                  JOB_AES_HMAC *job);
+typedef JOB_AES_HMAC *(*avx512_aes_enc_flush_fn)(MB_MGR_AES_OOO *state);
+typedef void (*avx512_aes_cbc_dec_fn)(const void *in, const uint8_t *IV,
+                                      const void *keys, void *out,
+                                      uint64_t len_bytes);
+
+/*
+ * File-scope dispatch pointers.  Every one starts out on the AVX
+ * implementation; init_mb_mgr_avx512() repoints them to the VAES
+ * variants when VAES is reported.
+ */
+
+/* - AES encrypt: submit, one pointer per key size */
+static avx512_aes_enc_submit_fn submit_job_aes128_enc_avx512 =
+        submit_job_aes128_enc_avx;
+static avx512_aes_enc_submit_fn submit_job_aes192_enc_avx512 =
+        submit_job_aes192_enc_avx;
+static avx512_aes_enc_submit_fn submit_job_aes256_enc_avx512 =
+        submit_job_aes256_enc_avx;
+
+/* - AES encrypt: flush, one pointer per key size */
+static avx512_aes_enc_flush_fn flush_job_aes128_enc_avx512 =
+        flush_job_aes128_enc_avx;
+static avx512_aes_enc_flush_fn flush_job_aes192_enc_avx512 =
+        flush_job_aes192_enc_avx;
+static avx512_aes_enc_flush_fn flush_job_aes256_enc_avx512 =
+        flush_job_aes256_enc_avx;
+
+/* - AES-CBC decrypt, one pointer per key size */
+static avx512_aes_cbc_dec_fn aes_cbc_dec_128_avx512 = aes_cbc_dec_128_avx;
+static avx512_aes_cbc_dec_fn aes_cbc_dec_192_avx512 = aes_cbc_dec_192_avx;
+static avx512_aes_cbc_dec_fn aes_cbc_dec_256_avx512 = aes_cbc_dec_256_avx;
+
+/*
+ * Initialise a multi-buffer manager for the AVX512 architecture.
+ *
+ * Steps performed:
+ *  - store the adjusted CPU feature set in state->features; when AES-NI
+ *    is not reported the whole job is handed to
+ *    init_mb_mgr_sse_no_aesni() and nothing else is done here,
+ *  - when VAES is reported, repoint the file-scope AES dispatch
+ *    pointers to the VAES implementations and set the AES / DOCSIS SEC
+ *    schedulers up for 16 lanes (8 lanes otherwise),
+ *  - reset every out-of-order (OOO) manager,
+ *  - install the job API and direct API handlers in "state".
+ */
+void
+init_mb_mgr_avx512(MB_MGR *state)
+{
+        /* unused-lane lists: one nibble per free lane index */
+        const uint64_t free_lanes_x16 = 0xFEDCBA9876543210;
+        const uint64_t free_lanes_x8 = 0xF76543210;
+        unsigned int i, vaes_support;
+        uint8_t *blk;
+
+        state->features = cpu_feature_adjust(state->flags,
+                                             cpu_feature_detect());
+
+        if (!(state->features & IMB_FEATURE_AESNI)) {
+                init_mb_mgr_sse_no_aesni(state);
+                return;
+        }
+
+        vaes_support =
+                ((state->features & IMB_FEATURE_VAES) == IMB_FEATURE_VAES);
+
+        if (vaes_support) {
+                /* switch the file-scope dispatch pointers to VAES */
+                aes_cbc_dec_128_avx512 = aes_cbc_dec_128_vaes_avx512;
+                aes_cbc_dec_192_avx512 = aes_cbc_dec_192_vaes_avx512;
+                aes_cbc_dec_256_avx512 = aes_cbc_dec_256_vaes_avx512;
+                submit_job_aes128_enc_avx512 =
+                        submit_job_aes128_enc_vaes_avx512;
+                flush_job_aes128_enc_avx512 =
+                        flush_job_aes128_enc_vaes_avx512;
+                submit_job_aes192_enc_avx512 =
+                        submit_job_aes192_enc_vaes_avx512;
+                flush_job_aes192_enc_avx512 =
+                        flush_job_aes192_enc_vaes_avx512;
+                submit_job_aes256_enc_avx512 =
+                        submit_job_aes256_enc_vaes_avx512;
+                flush_job_aes256_enc_avx512 =
+                        flush_job_aes256_enc_vaes_avx512;
+                submit_job_aes_cntr_avx512 = vaes_submit_cntr_avx512;
+                submit_job_aes_cntr_bit_avx512 = vaes_submit_cntr_bit_avx512;
+
+                /* AES128/192/256 OOO managers: 16 lanes */
+                memset(state->aes128_ooo.lens, 0,
+                       sizeof(state->aes128_ooo.lens));
+                memset(state->aes128_ooo.job_in_lane, 0,
+                       sizeof(state->aes128_ooo.job_in_lane));
+                state->aes128_ooo.unused_lanes = free_lanes_x16;
+                state->aes128_ooo.num_lanes_inuse = 0;
+
+                memset(state->aes192_ooo.lens, 0,
+                       sizeof(state->aes192_ooo.lens));
+                memset(state->aes192_ooo.job_in_lane, 0,
+                       sizeof(state->aes192_ooo.job_in_lane));
+                state->aes192_ooo.unused_lanes = free_lanes_x16;
+                state->aes192_ooo.num_lanes_inuse = 0;
+
+                memset(state->aes256_ooo.lens, 0,
+                       sizeof(state->aes256_ooo.lens));
+                memset(state->aes256_ooo.job_in_lane, 0,
+                       sizeof(state->aes256_ooo.job_in_lane));
+                state->aes256_ooo.unused_lanes = free_lanes_x16;
+                state->aes256_ooo.num_lanes_inuse = 0;
+
+                /*
+                 * DOCSIS SEC BPI (AES CBC + AES CFB for partial block)
+                 * uses the same settings as AES128 CBC: 16 lanes.
+                 */
+                memset(state->docsis_sec_ooo.lens, 0,
+                       sizeof(state->docsis_sec_ooo.lens));
+                memset(state->docsis_sec_ooo.job_in_lane, 0,
+                       sizeof(state->docsis_sec_ooo.job_in_lane));
+                state->docsis_sec_ooo.unused_lanes = free_lanes_x16;
+                state->docsis_sec_ooo.num_lanes_inuse = 0;
+        } else {
+                /*
+                 * AES128/192/256 OOO managers: 8 lanes.  The whole lens
+                 * array is filled with 0xFF first and then only the
+                 * first 8 entries are cleared.
+                 */
+                memset(state->aes128_ooo.lens, 0xFF,
+                       sizeof(state->aes128_ooo.lens));
+                memset(&state->aes128_ooo.lens[0], 0,
+                       sizeof(state->aes128_ooo.lens[0]) * 8);
+                memset(state->aes128_ooo.job_in_lane, 0,
+                       sizeof(state->aes128_ooo.job_in_lane));
+                state->aes128_ooo.unused_lanes = free_lanes_x8;
+                state->aes128_ooo.num_lanes_inuse = 0;
+
+                memset(state->aes192_ooo.lens, 0xFF,
+                       sizeof(state->aes192_ooo.lens));
+                memset(&state->aes192_ooo.lens[0], 0,
+                       sizeof(state->aes192_ooo.lens[0]) * 8);
+                memset(state->aes192_ooo.job_in_lane, 0,
+                       sizeof(state->aes192_ooo.job_in_lane));
+                state->aes192_ooo.unused_lanes = free_lanes_x8;
+                state->aes192_ooo.num_lanes_inuse = 0;
+
+                memset(state->aes256_ooo.lens, 0xFF,
+                       sizeof(state->aes256_ooo.lens));
+                memset(&state->aes256_ooo.lens[0], 0,
+                       sizeof(state->aes256_ooo.lens[0]) * 8);
+                memset(state->aes256_ooo.job_in_lane, 0,
+                       sizeof(state->aes256_ooo.job_in_lane));
+                state->aes256_ooo.unused_lanes = free_lanes_x8;
+                state->aes256_ooo.num_lanes_inuse = 0;
+
+                /*
+                 * DOCSIS SEC BPI (AES CBC + AES CFB for partial block)
+                 * uses the same settings as AES128 CBC: 8 lanes.
+                 */
+                memset(state->docsis_sec_ooo.lens, 0xFF,
+                       sizeof(state->docsis_sec_ooo.lens));
+                memset(&state->docsis_sec_ooo.lens[0], 0,
+                       sizeof(state->docsis_sec_ooo.lens[0]) * 8);
+                memset(state->docsis_sec_ooo.job_in_lane, 0,
+                       sizeof(state->docsis_sec_ooo.job_in_lane));
+                state->docsis_sec_ooo.unused_lanes = free_lanes_x8;
+                state->docsis_sec_ooo.num_lanes_inuse = 0;
+        }
+
+        /*
+         * DES, 3DES and DOCSIS DES (DES CBC + DES CFB for partial block):
+         * each has its own OOO manager for encryption and for decryption.
+         */
+        for (i = 0; i < AVX512_NUM_DES_LANES; i++) {
+                state->des_enc_ooo.lens[i] = 0;
+                state->des_enc_ooo.job_in_lane[i] = NULL;
+                state->des_dec_ooo.lens[i] = 0;
+                state->des_dec_ooo.job_in_lane[i] = NULL;
+                state->des3_enc_ooo.lens[i] = 0;
+                state->des3_enc_ooo.job_in_lane[i] = NULL;
+                state->des3_dec_ooo.lens[i] = 0;
+                state->des3_dec_ooo.job_in_lane[i] = NULL;
+                state->docsis_des_enc_ooo.lens[i] = 0;
+                state->docsis_des_enc_ooo.job_in_lane[i] = NULL;
+                state->docsis_des_dec_ooo.lens[i] = 0;
+                state->docsis_des_dec_ooo.job_in_lane[i] = NULL;
+        }
+
+        /* - DES encryption */
+        state->des_enc_ooo.unused_lanes = free_lanes_x16;
+        state->des_enc_ooo.num_lanes_inuse = 0;
+        memset(&state->des_enc_ooo.args, 0, sizeof(state->des_enc_ooo.args));
+
+        /* - DES decryption */
+        state->des_dec_ooo.unused_lanes = free_lanes_x16;
+        state->des_dec_ooo.num_lanes_inuse = 0;
+        memset(&state->des_dec_ooo.args, 0, sizeof(state->des_dec_ooo.args));
+
+        /* - 3DES encryption */
+        state->des3_enc_ooo.unused_lanes = free_lanes_x16;
+        state->des3_enc_ooo.num_lanes_inuse = 0;
+        memset(&state->des3_enc_ooo.args, 0,
+               sizeof(state->des3_enc_ooo.args));
+
+        /* - 3DES decryption */
+        state->des3_dec_ooo.unused_lanes = free_lanes_x16;
+        state->des3_dec_ooo.num_lanes_inuse = 0;
+        memset(&state->des3_dec_ooo.args, 0,
+               sizeof(state->des3_dec_ooo.args));
+
+        /* - DOCSIS DES encryption */
+        state->docsis_des_enc_ooo.unused_lanes = free_lanes_x16;
+        state->docsis_des_enc_ooo.num_lanes_inuse = 0;
+        memset(&state->docsis_des_enc_ooo.args, 0,
+               sizeof(state->docsis_des_enc_ooo.args));
+
+        /* - DOCSIS DES decryption */
+        state->docsis_des_dec_ooo.unused_lanes = free_lanes_x16;
+        state->docsis_des_dec_ooo.num_lanes_inuse = 0;
+        memset(&state->docsis_des_dec_ooo.args, 0,
+               sizeof(state->docsis_des_dec_ooo.args));
+
+        /* HMAC/SHA1 OOO manager */
+        for (i = 0; i < 16; i++)
+                state->hmac_sha_1_ooo.lens[i] = 0;
+        state->hmac_sha_1_ooo.unused_lanes = free_lanes_x16;
+        state->hmac_sha_1_ooo.num_lanes_inuse = 0;
+        for (i = 0; i < AVX512_NUM_SHA1_LANES; i++) {
+                state->hmac_sha_1_ooo.ldata[i].job_in_lane = NULL;
+                state->hmac_sha_1_ooo.ldata[i].extra_block[64] = 0x80;
+                memset(state->hmac_sha_1_ooo.ldata[i].extra_block + 65,
+                       0x00,
+                       64 + 7);
+                /* outer block: 5 digest words, 0x80 marker, length 0x02A0 */
+                blk = state->hmac_sha_1_ooo.ldata[i].outer_block;
+                memset(blk + 5*4 + 1,
+                       0x00,
+                       64 - 5*4 - 1 - 2);
+                blk[5 * 4] = 0x80;
+                blk[64 - 2] = 0x02;
+                blk[64 - 1] = 0xA0;
+        }
+
+        /*
+         * HMAC/SHA224 OOO manager.
+         * sha256 and sha224 are very similar except for
+         * digest constants and output size.
+         */
+        for (i = 0; i < 16; i++)
+                state->hmac_sha_224_ooo.lens[i] = 0;
+        state->hmac_sha_224_ooo.unused_lanes = free_lanes_x16;
+        state->hmac_sha_224_ooo.num_lanes_inuse = 0;
+        for (i = 0; i < AVX512_NUM_SHA256_LANES; i++) {
+                state->hmac_sha_224_ooo.ldata[i].job_in_lane = NULL;
+
+                blk = state->hmac_sha_224_ooo.ldata[i].extra_block;
+                memset(blk, 0x00,
+                       sizeof(state->hmac_sha_224_ooo.ldata[i].extra_block));
+                blk[64] = 0x80;
+
+                blk = state->hmac_sha_224_ooo.ldata[i].outer_block;
+                memset(blk, 0x00,
+                       sizeof(state->hmac_sha_224_ooo.ldata[i].outer_block));
+                blk[7 * 4] = 0x80; /* digest is 7 words long */
+                /* length 0x02E0, most significant byte stored first */
+                blk[64 - 2] = 0x02;
+                blk[64 - 1] = 0xE0;
+        }
+
+        /* HMAC/SHA256 OOO manager */
+        for (i = 0; i < 16; i++)
+                state->hmac_sha_256_ooo.lens[i] = 0;
+        state->hmac_sha_256_ooo.unused_lanes = free_lanes_x16;
+        state->hmac_sha_256_ooo.num_lanes_inuse = 0;
+        for (i = 0; i < AVX512_NUM_SHA256_LANES; i++) {
+                state->hmac_sha_256_ooo.ldata[i].job_in_lane = NULL;
+                state->hmac_sha_256_ooo.ldata[i].extra_block[64] = 0x80;
+                memset(state->hmac_sha_256_ooo.ldata[i].extra_block + 65,
+                       0x00,
+                       64 + 7);
+                /* hmac related */
+                blk = state->hmac_sha_256_ooo.ldata[i].outer_block;
+                memset(blk + 8*4 + 1,
+                       0x00,
+                       64 - 8*4 - 1 - 2);
+                blk[8 * 4] = 0x80; /* 8 digest words */
+                blk[64 - 2] = 0x03; /* length 0x0300 */
+                blk[64 - 1] = 0x00;
+        }
+
+        /* HMAC/SHA384 OOO manager */
+        for (i = 0; i < 8; i++)
+                state->hmac_sha_384_ooo.lens[i] = 0;
+        state->hmac_sha_384_ooo.unused_lanes = free_lanes_x8;
+        for (i = 0; i < AVX512_NUM_SHA512_LANES; i++) {
+                MB_MGR_HMAC_SHA_512_OOO *ooo = &state->hmac_sha_384_ooo;
+
+                ooo->ldata[i].job_in_lane = NULL;
+                ooo->ldata[i].extra_block[SHA_384_BLOCK_SIZE] = 0x80;
+                memset(ooo->ldata[i].extra_block + (SHA_384_BLOCK_SIZE + 1),
+                       0x00, SHA_384_BLOCK_SIZE + 7);
+                blk = ooo->ldata[i].outer_block;
+                /* special end point because this length is constant */
+                memset(blk + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+                       SHA_384_BLOCK_SIZE -
+                       SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2);
+                /* mark the end */
+                blk[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80;
+                /*
+                 * The hmac outer block length is always of fixed size:
+                 * the OKey length (a whole message block, 1024 bits)
+                 * plus the length of the inner digest (384 bits),
+                 * i.e. 1408 bits == 0x0580.
+                 * The input message block needs to be converted to big
+                 * endian within the sha implementation before use.
+                 */
+                blk[SHA_384_BLOCK_SIZE - 2] = 0x05;
+                blk[SHA_384_BLOCK_SIZE - 1] = 0x80;
+        }
+
+        /* HMAC/SHA512 OOO manager */
+        for (i = 0; i < 8; i++)
+                state->hmac_sha_512_ooo.lens[i] = 0;
+        state->hmac_sha_512_ooo.unused_lanes = free_lanes_x8;
+        for (i = 0; i < AVX512_NUM_SHA512_LANES; i++) {
+                MB_MGR_HMAC_SHA_512_OOO *ooo = &state->hmac_sha_512_ooo;
+
+                ooo->ldata[i].job_in_lane = NULL;
+                ooo->ldata[i].extra_block[SHA_512_BLOCK_SIZE] = 0x80;
+                memset(ooo->ldata[i].extra_block + (SHA_512_BLOCK_SIZE + 1),
+                       0x00, SHA_512_BLOCK_SIZE + 7);
+                blk = ooo->ldata[i].outer_block;
+                /* special end point because this length is constant */
+                memset(blk + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00,
+                       SHA_512_BLOCK_SIZE -
+                       SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2);
+                /* mark the end */
+                blk[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80;
+                /*
+                 * The hmac outer block length is always of fixed size:
+                 * the OKey length (a whole message block, 1024 bits)
+                 * plus the length of the inner digest (512 bits),
+                 * i.e. 1536 bits == 0x600.
+                 * The input message block needs to be converted to big
+                 * endian within the sha implementation before use.
+                 */
+                blk[SHA_512_BLOCK_SIZE - 2] = 0x06;
+                blk[SHA_512_BLOCK_SIZE - 1] = 0x00;
+        }
+
+        /* HMAC/MD5 OOO manager */
+        for (i = 0; i < 16; i++)
+                state->hmac_md5_ooo.lens[i] = 0;
+        state->hmac_md5_ooo.unused_lanes = free_lanes_x16;
+        state->hmac_md5_ooo.num_lanes_inuse = 0;
+        for (i = 0; i < AVX512_NUM_MD5_LANES; i++) {
+                state->hmac_md5_ooo.ldata[i].job_in_lane = NULL;
+
+                blk = state->hmac_md5_ooo.ldata[i].extra_block;
+                memset(blk, 0x00,
+                       sizeof(state->hmac_md5_ooo.ldata[i].extra_block));
+                blk[64] = 0x80;
+
+                blk = state->hmac_md5_ooo.ldata[i].outer_block;
+                memset(blk, 0x00,
+                       sizeof(state->hmac_md5_ooo.ldata[i].outer_block));
+                blk[4 * 4] = 0x80;
+                /* length 0x0280, least significant byte stored first */
+                blk[64 - 7] = 0x02;
+                blk[64 - 8] = 0x80;
+        }
+
+        /* AES/XCBC OOO manager */
+        state->aes_xcbc_ooo.unused_lanes = free_lanes_x8;
+        for (i = 0; i < 8; i++) {
+                state->aes_xcbc_ooo.lens[i] = 0;
+                state->aes_xcbc_ooo.ldata[i].job_in_lane = NULL;
+                state->aes_xcbc_ooo.ldata[i].final_block[16] = 0x80;
+                memset(state->aes_xcbc_ooo.ldata[i].final_block + 17,
+                       0x00, 15);
+        }
+
+        /* AES-CCM and AES-CMAC auth OOO managers */
+        for (i = 0; i < 8; i++) {
+                state->aes_ccm_ooo.init_done[i] = 0;
+                state->aes_ccm_ooo.lens[i] = 0;
+                state->aes_ccm_ooo.job_in_lane[i] = NULL;
+
+                state->aes_cmac_ooo.init_done[i] = 0;
+                state->aes_cmac_ooo.lens[i] = 0;
+                state->aes_cmac_ooo.job_in_lane[i] = NULL;
+        }
+        state->aes_ccm_ooo.unused_lanes = free_lanes_x8;
+        state->aes_cmac_ooo.unused_lanes = free_lanes_x8;
+
+        /* "in order" job ring bookkeeping */
+        state->next_job = 0;
+        state->earliest_job = -1;
+
+        /* job API handlers */
+        state->get_next_job        = get_next_job_avx512;
+        state->submit_job          = submit_job_avx512;
+        state->submit_job_nocheck  = submit_job_nocheck_avx512;
+        state->get_completed_job   = get_completed_job_avx512;
+        state->flush_job           = flush_job_avx512;
+        state->queue_size          = queue_size_avx512;
+
+        /* key expansion / scheduling handlers */
+        state->keyexp_128          = aes_keyexp_128_avx512;
+        state->keyexp_192          = aes_keyexp_192_avx512;
+        state->keyexp_256          = aes_keyexp_256_avx512;
+        state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_avx512;
+        state->xcbc_keyexp         = aes_xcbc_expand_key_avx512;
+        state->des_key_sched       = des_key_schedule;
+
+        /* hash and single-block helpers */
+        state->sha1_one_block      = sha1_one_block_avx512;
+        state->sha1                = sha1_avx512;
+        state->sha224_one_block    = sha224_one_block_avx512;
+        state->sha224              = sha224_avx512;
+        state->sha256_one_block    = sha256_one_block_avx512;
+        state->sha256              = sha256_avx512;
+        state->sha384_one_block    = sha384_one_block_avx512;
+        state->sha384              = sha384_avx512;
+        state->sha512_one_block    = sha512_one_block_avx512;
+        state->sha512              = sha512_avx512;
+        state->md5_one_block       = md5_one_block_avx512;
+        state->aes128_cfb_one      = aes_cfb_128_one_avx512;
+
+        /* ZUC handlers (AVX implementations) */
+        state->eea3_1_buffer       = zuc_eea3_1_buffer_avx;
+        state->eea3_4_buffer       = zuc_eea3_4_buffer_avx;
+        state->eea3_n_buffer       = zuc_eea3_n_buffer_avx;
+        state->eia3_1_buffer       = zuc_eia3_1_buffer_avx;
+
+        /* KASUMI handlers (AVX implementations) */
+        state->f8_1_buffer         = kasumi_f8_1_buffer_avx;
+        state->f8_1_buffer_bit     = kasumi_f8_1_buffer_bit_avx;
+        state->f8_2_buffer         = kasumi_f8_2_buffer_avx;
+        state->f8_3_buffer         = kasumi_f8_3_buffer_avx;
+        state->f8_4_buffer         = kasumi_f8_4_buffer_avx;
+        state->f8_n_buffer         = kasumi_f8_n_buffer_avx;
+        state->f9_1_buffer         = kasumi_f9_1_buffer_avx;
+        state->f9_1_buffer_user    = kasumi_f9_1_buffer_user_avx;
+        state->kasumi_init_f8_key_sched = kasumi_init_f8_key_sched_avx;
+        state->kasumi_init_f9_key_sched = kasumi_init_f9_key_sched_avx;
+        state->kasumi_key_sched_size = kasumi_key_sched_size_avx;
+
+        /* SNOW3G handlers (AVX2 implementations) */
+        state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_avx2;
+        state->snow3g_f8_1_buffer  = snow3g_f8_1_buffer_avx2;
+        state->snow3g_f8_2_buffer  = snow3g_f8_2_buffer_avx2;
+        state->snow3g_f8_4_buffer  = snow3g_f8_4_buffer_avx2;
+        state->snow3g_f8_8_buffer  = snow3g_f8_8_buffer_avx2;
+        state->snow3g_f8_n_buffer  = snow3g_f8_n_buffer_avx2;
+        state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_avx2;
+        state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_avx2;
+        state->snow3g_f9_1_buffer  = snow3g_f9_1_buffer_avx2;
+        state->snow3g_init_key_sched = snow3g_init_key_sched_avx2;
+        state->snow3g_key_sched_size = snow3g_key_sched_size_avx2;
+
+#ifndef NO_GCM
+        /* GCM: the VAES variants need both VAES and VPCLMULQDQ */
+        if ((state->features & (IMB_FEATURE_VAES | IMB_FEATURE_VPCLMULQDQ)) ==
+            (IMB_FEATURE_VAES | IMB_FEATURE_VPCLMULQDQ)) {
+                state->gcm128_enc = aes_gcm_enc_128_vaes_avx512;
+                state->gcm192_enc = aes_gcm_enc_192_vaes_avx512;
+                state->gcm256_enc = aes_gcm_enc_256_vaes_avx512;
+                state->gcm128_dec = aes_gcm_dec_128_vaes_avx512;
+                state->gcm192_dec = aes_gcm_dec_192_vaes_avx512;
+                state->gcm256_dec = aes_gcm_dec_256_vaes_avx512;
+                state->gcm128_init = aes_gcm_init_128_vaes_avx512;
+                state->gcm192_init = aes_gcm_init_192_vaes_avx512;
+                state->gcm256_init = aes_gcm_init_256_vaes_avx512;
+                state->gcm128_enc_update = aes_gcm_enc_128_update_vaes_avx512;
+                state->gcm192_enc_update = aes_gcm_enc_192_update_vaes_avx512;
+                state->gcm256_enc_update = aes_gcm_enc_256_update_vaes_avx512;
+                state->gcm128_dec_update = aes_gcm_dec_128_update_vaes_avx512;
+                state->gcm192_dec_update = aes_gcm_dec_192_update_vaes_avx512;
+                state->gcm256_dec_update = aes_gcm_dec_256_update_vaes_avx512;
+                state->gcm128_enc_finalize =
+                        aes_gcm_enc_128_finalize_vaes_avx512;
+                state->gcm192_enc_finalize =
+                        aes_gcm_enc_192_finalize_vaes_avx512;
+                state->gcm256_enc_finalize =
+                        aes_gcm_enc_256_finalize_vaes_avx512;
+                state->gcm128_dec_finalize =
+                        aes_gcm_dec_128_finalize_vaes_avx512;
+                state->gcm192_dec_finalize =
+                        aes_gcm_dec_192_finalize_vaes_avx512;
+                state->gcm256_dec_finalize =
+                        aes_gcm_dec_256_finalize_vaes_avx512;
+                state->gcm128_precomp = aes_gcm_precomp_128_vaes_avx512;
+                state->gcm192_precomp = aes_gcm_precomp_192_vaes_avx512;
+                state->gcm256_precomp = aes_gcm_precomp_256_vaes_avx512;
+                state->gcm128_pre = aes_gcm_pre_128_vaes_avx512;
+                state->gcm192_pre = aes_gcm_pre_192_vaes_avx512;
+                state->gcm256_pre = aes_gcm_pre_256_vaes_avx512;
+
+                submit_job_aes_gcm_enc_avx512 = vaes_submit_gcm_enc_avx512;
+                submit_job_aes_gcm_dec_avx512 = vaes_submit_gcm_dec_avx512;
+        } else {
+                state->gcm128_enc = aes_gcm_enc_128_avx512;
+                state->gcm192_enc = aes_gcm_enc_192_avx512;
+                state->gcm256_enc = aes_gcm_enc_256_avx512;
+                state->gcm128_dec = aes_gcm_dec_128_avx512;
+                state->gcm192_dec = aes_gcm_dec_192_avx512;
+                state->gcm256_dec = aes_gcm_dec_256_avx512;
+                state->gcm128_init = aes_gcm_init_128_avx512;
+                state->gcm192_init = aes_gcm_init_192_avx512;
+                state->gcm256_init = aes_gcm_init_256_avx512;
+                state->gcm128_enc_update = aes_gcm_enc_128_update_avx512;
+                state->gcm192_enc_update = aes_gcm_enc_192_update_avx512;
+                state->gcm256_enc_update = aes_gcm_enc_256_update_avx512;
+                state->gcm128_dec_update = aes_gcm_dec_128_update_avx512;
+                state->gcm192_dec_update = aes_gcm_dec_192_update_avx512;
+                state->gcm256_dec_update = aes_gcm_dec_256_update_avx512;
+                state->gcm128_enc_finalize = aes_gcm_enc_128_finalize_avx512;
+                state->gcm192_enc_finalize = aes_gcm_enc_192_finalize_avx512;
+                state->gcm256_enc_finalize = aes_gcm_enc_256_finalize_avx512;
+                state->gcm128_dec_finalize = aes_gcm_dec_128_finalize_avx512;
+                state->gcm192_dec_finalize = aes_gcm_dec_192_finalize_avx512;
+                state->gcm256_dec_finalize = aes_gcm_dec_256_finalize_avx512;
+                state->gcm128_precomp = aes_gcm_precomp_128_avx512;
+                state->gcm192_precomp = aes_gcm_precomp_192_avx512;
+                state->gcm256_precomp = aes_gcm_precomp_256_avx512;
+                state->gcm128_pre = aes_gcm_pre_128_avx512;
+                state->gcm192_pre = aes_gcm_pre_192_avx512;
+                state->gcm256_pre = aes_gcm_pre_256_avx512;
+        }
+#endif
+}
+
+#include "mb_mgr_code.h"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm
new file mode 100644
index 000000000..decea625b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_des_avx512.asm
@@ -0,0 +1,524 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX R8 R9 R10 R11
+;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX R10 R11
+;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31, K1-7 (K1-2 and K4-6 here but DES underneath clobbers K1-7).
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "constants.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern docsis_des_x16_enc_avx512
+extern docsis_des_x16_dec_avx512
+extern des_x16_cbc_enc_avx512
+extern des_x16_cbc_dec_avx512
+extern des3_x16_cbc_enc_avx512
+extern des3_x16_cbc_dec_avx512
+
+%ifdef LINUX
+%define arg1 rdi ; System V AMD64 integer argument registers
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%else
+%define arg1 rcx ; Windows x64 integer argument registers
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%endif
+
+%define STATE arg1 ; DES OOO manager structure (arg 1 of every entry point)
+%define JOB arg2 ; job being submitted; arg2 is later reused to pass the length to the x16 routines
+
+%define IA0 arg3 ; general-purpose scratch
+%define IA1 arg4 ; general-purpose scratch
+%define IA2 r10 ; general-purpose scratch
+
+%define MIN_IDX r11 ; lane index of the shortest job (same register as LANE)
+%define MIN_LEN rax ; shortest length across the lanes; rax also carries the returned job pointer
+%define LANE r11 ; lane assigned to a newly submitted job (same register as MIN_IDX)
+
+%define AVX512_NUM_DES_LANES 16 ; number of jobs handled in parallel by the x16 routines
+
+%define ZTMP0 zmm0
+%define ZTMP1 zmm1
+%define ZTMP2 zmm2
+%define ZTMP3 zmm3
+%define ZTMP4 zmm4
+%define ZTMP5 zmm5
+%define ZTMP6 zmm6
+%define ZTMP7 zmm7
+%define ZTMP8 zmm8
+%define ZTMP9 zmm9
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; ===========================================================================
+;;; DES/DOCSIS DES job submit: store JOB in a free lane; once all 16 lanes are in use, run the x16 cipher and return one completed job in rax (rax = 0 otherwise)
+;;; ===========================================================================
+;;; DES_DOCSIS [in] - DES, DOCSIS or 3DES cipher selection
+;;; ENC_DEC [in] - ENCrypt or DECrypt selection
+%macro GENERIC_DES_SUBMIT 2
+%define %%DES_DOCSIS %1
+%define %%ENC_DEC %2
+
+ ;; get unused lane and increment number of lanes in use
+ mov IA0, [STATE + _des_unused_lanes]
+ mov LANE, IA0
+ and LANE, 0xF ;; just a nibble
+ shr IA0, 4 ;; drop the lane nibble that was just taken
+ mov [STATE + _des_unused_lanes], IA0
+ add qword [STATE + _des_lanes_in_use], 1
+
+ ;; store job info in OOO structure
+ ;; - job pointer
+ mov [STATE + _des_job_in_lane + LANE*8], JOB
+ ;; - key schedule
+%ifidn %%ENC_DEC, ENC
+ mov IA2, [JOB + _aes_enc_key_expanded]
+%else
+ mov IA2, [JOB + _aes_dec_key_expanded]
+%endif
+ mov [STATE + _des_args_keys + LANE*8], IA2
+ ;; - IV (8 bytes, kept as two arrays of 16 dwords: low halves first, high halves second)
+ mov IA2, [JOB + _iv]
+ mov DWORD(IA0), [IA2]
+ mov DWORD(IA1), [IA2 + 4]
+ mov [STATE + _des_args_IV + LANE*4], DWORD(IA0)
+ mov [STATE + _des_args_IV + LANE*4 + (AVX512_NUM_DES_LANES*4)], DWORD(IA1)
+ ;; - src pointer
+ mov IA0, [JOB + _src]
+ add IA0, [JOB + _cipher_start_src_offset_in_bytes]
+ mov [STATE + _des_args_in + LANE*8], IA0
+ ;; - destination pointer
+ mov IA1, [JOB + _dst]
+ mov [STATE + _des_args_out + LANE*8], IA1
+ ;; - length in bytes (block aligned)
+ mov IA2, [JOB + _msg_len_to_cipher_in_bytes]
+ and IA2, -8 ;; round down to a multiple of 8 bytes
+ mov [STATE + _des_lens + LANE*2], WORD(IA2) ;; only the low 16 bits of the length are kept
+%ifidn %%DES_DOCSIS, DOCSIS
+ ;; - block length
+ mov [STATE + _des_args_BLen + LANE*4], DWORD(IA2)
+ ;; - last in (src pointer + block aligned length)
+ add IA0, IA2
+ mov [STATE + _des_args_LIn + LANE*8], IA0
+ ;; - last out (dst pointer + block aligned length)
+ add IA1, IA2
+ mov [STATE + _des_args_LOut + LANE*8], IA1
+ ;; - partial length (message length modulo 8)
+ mov IA2, [JOB + _msg_len_to_cipher_in_bytes]
+ and IA2, 7
+ mov [STATE + _des_args_PLen + LANE*4], DWORD(IA2)
+%endif ; DOCSIS
+ ;; are there enough jobs to process them in parallel?
+ cmp qword [STATE + _des_lanes_in_use], AVX512_NUM_DES_LANES
+ jb %%_des_submit_null_end ;; fewer than 16 lanes in use: return NULL
+ ;; schedule the processing
+ ;; - find min job size (lanes 0-7 first, then lanes 8-15)
+ vmovdqa XWORD(ZTMP0), [STATE + _des_lens + 2*0]
+ vphminposuw XWORD(ZTMP2), XWORD(ZTMP0)
+ vpextrw DWORD(MIN_LEN), XWORD(ZTMP2), 0 ; min value
+ vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index
+ vmovdqa XWORD(ZTMP1), [STATE + _des_lens + 2*8]
+ vphminposuw XWORD(ZTMP2), XWORD(ZTMP1)
+ vpextrw DWORD(IA2), XWORD(ZTMP2), 0 ; min value
+ cmp DWORD(MIN_LEN), DWORD(IA2) ; both are zero-extended 16-bit values
+ jle %%_use_min ; lanes 0-7 win ties
+ vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index
+ add DWORD(MIN_IDX), 8 ; but index +8
+ mov MIN_LEN, IA2 ; min len
+%%_use_min:
+ cmp MIN_LEN, 0
+ je %%_len_is_0 ; shortest job has no full blocks left to process
+
+ vpbroadcastw XWORD(ZTMP3), WORD(MIN_LEN)
+ vpsubw XWORD(ZTMP0), XWORD(ZTMP0), XWORD(ZTMP3) ; subtract the min length from every lane
+ vmovdqa [STATE + _des_lens + 2*0], XWORD(ZTMP0)
+ vpsubw XWORD(ZTMP1), XWORD(ZTMP1), XWORD(ZTMP3)
+ vmovdqa [STATE + _des_lens + 2*8], XWORD(ZTMP1)
+
+ push MIN_IDX ; keep the lane index across the call
+ mov arg2, MIN_LEN ; arg2 (JOB) is dead here; it now carries the length
+%ifidn %%ENC_DEC, ENC
+ ;; encrypt
+%ifidn %%DES_DOCSIS, DOCSIS
+ call docsis_des_x16_enc_avx512
+%endif
+%ifidn %%DES_DOCSIS, DES
+ call des_x16_cbc_enc_avx512
+%endif
+%ifidn %%DES_DOCSIS, 3DES
+ call des3_x16_cbc_enc_avx512
+%endif
+%else ; ENC
+ ;; decrypt
+%ifidn %%DES_DOCSIS, DOCSIS
+ call docsis_des_x16_dec_avx512
+%endif
+%ifidn %%DES_DOCSIS, DES
+ call des_x16_cbc_dec_avx512
+%endif
+%ifidn %%DES_DOCSIS, 3DES
+ call des3_x16_cbc_dec_avx512
+%endif
+%endif ; DEC
+ pop MIN_IDX
+ jmp %%_des_submit_end
+
+%%_des_submit_null_end:
+ xor rax, rax ; no job completed: return NULL
+ jmp %%_des_submit_return
+
+%%_len_is_0:
+%ifidn %%DES_DOCSIS, DOCSIS
+ cmp dword [STATE + _des_args_PLen + MIN_IDX*4], 0
+ jz %%_des_submit_end ; no partial block either: the job is already complete
+ push MIN_IDX
+ xor arg2, arg2 ; len is 0
+%ifidn %%ENC_DEC, ENC
+ call docsis_des_x16_enc_avx512
+%else ; ENC
+ call docsis_des_x16_dec_avx512
+%endif ; DEC
+ pop MIN_IDX
+%endif ; DOCSIS
+ ;; fall through
+%%_des_submit_end:
+ ;; return a job
+ ;; - decrement number of jobs in use
+ sub qword [STATE + _des_lanes_in_use], 1
+ ;; - put the lane back to free lanes pool
+ mov IA0, [STATE + _des_unused_lanes]
+ shl IA0, 4
+ or IA0, MIN_IDX
+ mov [STATE + _des_unused_lanes], IA0
+ ;; - mark job as complete
+ ;; - clear job pointer
+ mov rax, [STATE + _des_job_in_lane + MIN_IDX*8] ; rax = job to return
+ mov qword [STATE + _des_job_in_lane + MIN_IDX*8], 0
+ or dword [rax + _status], STS_COMPLETED_AES
+
+%ifdef SAFE_DATA
+ ;; Clear IV (both halves) of the returned lane
+ mov dword [STATE + _des_args_IV + MIN_IDX*4], 0
+ mov dword [STATE + _des_args_IV + MIN_IDX*4 + (AVX512_NUM_DES_LANES*4)], 0
+%endif
+ vzeroupper
+%%_des_submit_return:
+%endmacro
+
+;;; ===========================================================================
+;;; DES/DOCSIS DES flush: if any lane is in use, fill the NULL lanes from a valid lane, run the x16 cipher and return one completed job in rax (rax = 0 if no lane is in use)
+;;; ===========================================================================
+;;; DES_DOCSIS [in] - DES, DOCSIS or 3DES cipher selection
+;;; ENC_DEC [in] - ENCrypt or DECrypt selection
+;;;
+;;; Clobbers k1, k2, k4, k5 and k6
+%macro GENERIC_DES_FLUSH 2
+%define %%DES_DOCSIS %1
+%define %%ENC_DEC %2
+
+ cmp qword [STATE + _des_lanes_in_use], 0
+ je %%_des_flush_null_end ; nothing queued: return NULL
+
+ ;; find non-null job
+ vpxord ZTMP0, ZTMP0, ZTMP0
+ vmovdqu64 ZTMP1, [STATE + _des_job_in_lane + (0*PTR_SZ)]
+ vmovdqu64 ZTMP2, [STATE + _des_job_in_lane + (8*PTR_SZ)]
+ vpcmpq k1, ZTMP1, ZTMP0, 4 ; NEQ
+ vpcmpq k2, ZTMP2, ZTMP0, 4 ; NEQ
+ xor IA0, IA0
+ xor IA1, IA1
+ kmovw DWORD(IA0), k1
+ kmovw DWORD(IA1), k2
+ mov DWORD(IA2), DWORD(IA1)
+ shl DWORD(IA2), 8
+ or DWORD(IA2), DWORD(IA0) ; mask of non-null jobs in IA2
+ not BYTE(IA0) ; invert low 8 bits: NULL lanes among 0-7
+ kmovw k4, DWORD(IA0)
+ not BYTE(IA1) ; invert low 8 bits: NULL lanes among 8-15
+ kmovw k5, DWORD(IA1)
+ mov DWORD(IA0), DWORD(IA2)
+ not WORD(IA0) ; invert low 16 bits: NULL lanes among 0-15
+ kmovw k6, DWORD(IA0) ; mask of NULL jobs in k4, k5 and k6
+ mov DWORD(IA0), DWORD(IA2)
+ xor IA2, IA2
+ bsf WORD(IA2), WORD(IA0) ; index of the 1st set bit in IA2
+
+ ;; copy good lane data into NULL lanes
+ ;; - k1(L8)/k2(H8) - masks of non-null jobs
+ ;; - k4(L8)/k5(H8)/k6 - masks of NULL jobs
+ ;; - IA2 index of 1st non-null job
+
+ ;; - in pointer
+ mov IA0, [STATE + _des_args_in + IA2*8]
+ vpbroadcastq ZTMP1, IA0
+ vmovdqu64 [STATE + _des_args_in + (0*PTR_SZ)]{k4}, ZTMP1
+ vmovdqu64 [STATE + _des_args_in + (8*PTR_SZ)]{k5}, ZTMP1
+ ;; - out pointer
+ mov IA0, [STATE + _des_args_out + IA2*8]
+ vpbroadcastq ZTMP1, IA0
+ vmovdqu64 [STATE + _des_args_out + (0*PTR_SZ)]{k4}, ZTMP1
+ vmovdqu64 [STATE + _des_args_out + (8*PTR_SZ)]{k5}, ZTMP1
+ ;; - key schedule
+ mov IA0, [STATE + _des_args_keys + IA2*8]
+ vpbroadcastq ZTMP1, IA0
+ vmovdqu64 [STATE + _des_args_keys + (0*PTR_SZ)]{k4}, ZTMP1
+ vmovdqu64 [STATE + _des_args_keys + (8*PTR_SZ)]{k5}, ZTMP1
+ ;; - zero partial len (ZTMP0 is still all zeros)
+ vmovdqu32 [STATE + _des_args_PLen]{k6}, ZTMP0
+ ;; - set len to UINT16_MAX so a NULL lane is not picked over a real job by the min search below
+ mov WORD(IA0), 0xffff
+ vpbroadcastw ZTMP1, WORD(IA0)
+ vmovdqu16 [STATE + _des_lens]{k6}, ZTMP1
+
+ ;; - IV (low halves at _des_args_IV, high halves 16 dwords further)
+ mov DWORD(IA0), [STATE + _des_args_IV + IA2*4]
+ mov DWORD(IA1), [STATE + _des_args_IV + IA2*4 + (16*4)]
+ vpbroadcastd ZTMP1, DWORD(IA0)
+ vpbroadcastd ZTMP2, DWORD(IA1)
+ vmovdqu32 [STATE + _des_args_IV]{k6}, ZTMP1
+ vmovdqu32 [STATE + _des_args_IV + (16*4)]{k6}, ZTMP2
+
+ ;; schedule the processing
+ ;; - find min job size (lanes 0-7 first, then lanes 8-15)
+ vmovdqa XWORD(ZTMP0), [STATE + _des_lens + 2*0]
+ vphminposuw XWORD(ZTMP2), XWORD(ZTMP0)
+ vpextrw DWORD(MIN_LEN), XWORD(ZTMP2), 0 ; min value
+ vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index
+ vmovdqa XWORD(ZTMP1), [STATE + _des_lens + 2*8]
+ vphminposuw XWORD(ZTMP2), XWORD(ZTMP1)
+ vpextrw DWORD(IA2), XWORD(ZTMP2), 0 ; min value
+ cmp DWORD(MIN_LEN), DWORD(IA2) ; both are zero-extended 16-bit values
+ jle %%_use_min ; lanes 0-7 win ties
+ vpextrw DWORD(MIN_IDX), XWORD(ZTMP2), 1 ; min index
+ add DWORD(MIN_IDX), 8 ; but index +8
+ mov MIN_LEN, IA2 ; min len
+%%_use_min:
+ vpbroadcastw XWORD(ZTMP3), WORD(MIN_LEN) ; NOTE(review): unlike submit, there is no MIN_LEN == 0 shortcut on this path
+ vpsubw XWORD(ZTMP0), XWORD(ZTMP0), XWORD(ZTMP3)
+ vmovdqa [STATE + _des_lens + 2*0], XWORD(ZTMP0)
+ vpsubw XWORD(ZTMP1), XWORD(ZTMP1), XWORD(ZTMP3)
+ vmovdqa [STATE + _des_lens + 2*8], XWORD(ZTMP1)
+
+ push MIN_IDX ; keep the lane index across the call
+%ifdef SAFE_DATA
+ ;; Save k6, which may be clobbered by following functions
+ kmovq IA0, k6
+ push IA0 ; NOTE(review): second push changes rsp alignment at the call below - confirm the x16 routines realign their own stack
+%endif
+
+ mov arg2, MIN_LEN ; length argument for the x16 routine
+%ifidn %%ENC_DEC, ENC
+ ;; encrypt
+%ifidn %%DES_DOCSIS, DOCSIS
+ call docsis_des_x16_enc_avx512
+%endif
+%ifidn %%DES_DOCSIS, DES
+ call des_x16_cbc_enc_avx512
+%endif
+%ifidn %%DES_DOCSIS, 3DES
+ call des3_x16_cbc_enc_avx512
+%endif
+%else ; ENC
+ ;; decrypt
+%ifidn %%DES_DOCSIS, DOCSIS
+ call docsis_des_x16_dec_avx512
+%endif
+%ifidn %%DES_DOCSIS, DES
+ call des_x16_cbc_dec_avx512
+%endif
+%ifidn %%DES_DOCSIS, 3DES
+ call des3_x16_cbc_dec_avx512
+%endif
+%endif ; DEC
+%ifdef SAFE_DATA
+ ;; Restore k6, which may have been clobbered by previous functions
+ pop IA0
+ kmovq k6, IA0
+%endif
+ pop MIN_IDX
+ jmp %%_des_flush_end
+
+%%_des_flush_null_end:
+ xor rax, rax ; no job to return: NULL
+ jmp %%_des_flush_return
+%%_des_flush_end:
+ ;; return a job
+ ;; - decrement number of jobs in use
+ sub qword [STATE + _des_lanes_in_use], 1
+ ;; - put the lane back to free lanes pool
+ mov IA0, [STATE + _des_unused_lanes]
+ shl IA0, 4
+ or IA0, MIN_IDX
+ mov [STATE + _des_unused_lanes], IA0
+ ;; - mark job as complete
+ mov rax, [STATE + _des_job_in_lane + MIN_IDX*8] ; rax = job to return
+ or dword [rax + _status], STS_COMPLETED_AES
+ ;; - clear job pointer
+ mov qword [STATE + _des_job_in_lane + MIN_IDX*8], 0
+%ifdef SAFE_DATA
+ ; Set bit of lane of returned job
+ xor DWORD(IA0), DWORD(IA0)
+ bts DWORD(IA0), DWORD(MIN_IDX)
+ kmovd k1, DWORD(IA0)
+ kord k6, k1, k6 ; k6 = NULL lanes + the lane being returned
+
+ ;; Clear IV of returned job and "NULL lanes" (k6 contains the mask of the jobs)
+ vpxorq ZTMP1, ZTMP1
+ vmovdqa32 [STATE + _des_args_IV]{k6}, ZTMP1 ; aligned store: presumably _des_args_IV is 64-byte aligned - TODO confirm
+ vmovdqa32 [STATE + _des_args_IV + (16*4)]{k6}, ZTMP1
+%endif
+%%_des_flush_return:
+ vzeroupper ; executed on the NULL path as well
+%endmacro
+
+;;; ========================================================
+;;; DATA
+
+section .data
+default rel
+
+;;; ========================================================
+;;; CODE
+section .text
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job (DES CBC encrypt); returns rax = completed job, or 0 while fewer than 16 lanes are in use
+align 64
+MKGLOBAL(submit_job_des_cbc_enc_avx512,function,internal)
+submit_job_des_cbc_enc_avx512:
+ GENERIC_DES_SUBMIT DES, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job (DES CBC decrypt); returns rax = completed job, or 0 while fewer than 16 lanes are in use
+align 64
+MKGLOBAL(submit_job_des_cbc_dec_avx512,function,internal)
+submit_job_des_cbc_dec_avx512:
+ GENERIC_DES_SUBMIT DES, DEC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job (DOCSIS DES encrypt); returns rax = completed job, or 0 while fewer than 16 lanes are in use
+align 64
+MKGLOBAL(submit_job_docsis_des_enc_avx512,function,internal)
+submit_job_docsis_des_enc_avx512:
+ GENERIC_DES_SUBMIT DOCSIS, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job (DOCSIS DES decrypt); returns rax = completed job, or 0 while fewer than 16 lanes are in use
+align 64
+MKGLOBAL(submit_job_docsis_des_dec_avx512,function,internal)
+submit_job_docsis_des_dec_avx512:
+ GENERIC_DES_SUBMIT DOCSIS, DEC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job (3DES CBC encrypt); returns rax = completed job, or 0 while fewer than 16 lanes are in use
+align 64
+MKGLOBAL(submit_job_3des_cbc_enc_avx512,function,internal)
+submit_job_3des_cbc_enc_avx512:
+ GENERIC_DES_SUBMIT 3DES, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure
+;;; arg 2 : job (3DES CBC decrypt); returns rax = completed job, or 0 while fewer than 16 lanes are in use
+align 64
+MKGLOBAL(submit_job_3des_cbc_dec_avx512,function,internal)
+submit_job_3des_cbc_dec_avx512:
+ GENERIC_DES_SUBMIT 3DES, DEC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure (DES CBC encrypt); returns rax = completed job, or 0 if no lane is in use
+align 64
+MKGLOBAL(flush_job_des_cbc_enc_avx512,function,internal)
+flush_job_des_cbc_enc_avx512:
+ GENERIC_DES_FLUSH DES, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure (DES CBC decrypt); returns rax = completed job, or 0 if no lane is in use
+align 64
+MKGLOBAL(flush_job_des_cbc_dec_avx512,function,internal)
+flush_job_des_cbc_dec_avx512:
+ GENERIC_DES_FLUSH DES, DEC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure (DOCSIS DES encrypt); returns rax = completed job, or 0 if no lane is in use
+align 64
+MKGLOBAL(flush_job_docsis_des_enc_avx512,function,internal)
+flush_job_docsis_des_enc_avx512:
+ GENERIC_DES_FLUSH DOCSIS, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure (DOCSIS DES decrypt); returns rax = completed job, or 0 if no lane is in use
+align 64
+MKGLOBAL(flush_job_docsis_des_dec_avx512,function,internal)
+flush_job_docsis_des_dec_avx512:
+ GENERIC_DES_FLUSH DOCSIS, DEC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure (3DES CBC encrypt); returns rax = completed job, or 0 if no lane is in use
+align 64
+MKGLOBAL(flush_job_3des_cbc_enc_avx512,function,internal)
+flush_job_3des_cbc_enc_avx512:
+ GENERIC_DES_FLUSH 3DES, ENC
+ ret
+
+;;; arg 1 : pointer to DES OOO structure (3DES CBC decrypt); returns rax = completed job, or 0 if no lane is in use
+align 64
+MKGLOBAL(flush_job_3des_cbc_dec_avx512,function,internal)
+flush_job_3des_cbc_dec_avx512:
+ GENERIC_DES_FLUSH 3DES, DEC
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_flush_avx512.asm
new file mode 100644
index 000000000..5fa08053f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_flush_avx512.asm
@@ -0,0 +1,367 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
+;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RSI RDI R8 R9 R10 R11
+;; Linux preserves: RBX RCX RDX RBP R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+;; %define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha1_x16_avx512
+
+section .data
+default rel
+
+align 16
+byteswap: ; vpshufb mask that reverses the byte order inside each 32-bit word
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+align 32
+len_masks: ; 16 entries of 32 bytes; entry I has 0xFFFF in 16-bit word I and is OR-ed into the lane lengths to set lane I to the maximum length
+ dq 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000
+
+lane_1: dq 1 ; lane indices kept in memory: cmovne (used to pick a non-null lane) has no immediate form
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi ; System V AMD64 argument registers
+%define arg2 rsi
+%else
+%define arg1 rcx ; Windows x64 argument registers
+%define arg2 rdx
+%endif
+
+%define state arg1 ; HMAC-SHA1 OOO manager; also passed unchanged to sha1_x16_avx512
+%define job arg2 ; NOTE: job, len2, extra_blocks and p all alias arg2
+%define len2 arg2 ; minimum length across lanes, passed as arg 2 to sha1_x16_avx512
+
+; idx needs to be in rbx, rdi, rbp (presumably: a register the called routine leaves intact)
+%define idx rbp
+
+%define unused_lanes r9 ; NOTE: unused_lanes, lane_data and tmp2 all alias r9
+%define lane_data r9
+%define tmp2 r9
+%define num_lanes_inuse r12
+%define len_upper r13 ; minimum length among lanes 8..15
+%define idx_upper r14 ; index (0..7) of that minimum within lanes 8..15
+
+%define job_rax rax ; NOTE: job_rax, tmp1, size_offset, tmp and start_offset all alias rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1 ; aliases state
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%endif
+
+; we clobber rbp, called routine clobbers r12-r15 (all five are saved in _gpr_save)
+struc STACK
+_gpr_save: resq 5 ; rbp, r12, r13, r14, r15
+_rsp_save: resq 1 ; rsp as it was on entry, before frame allocation and alignment
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_avx512(MB_MGR_HMAC_SHA_1_OOO *state)
+; arg 1 : rdi (Linux) / rcx (Windows) : state; returns rax = completed job, or NULL when no lane is in use
+MKGLOBAL(flush_job_hmac_avx512,function,internal)
+flush_job_hmac_avx512:
+
+ mov rax, rsp ; remember the entry rsp; it is restored from _rsp_save on exit
+ sub rsp, STACK_size
+ and rsp, -32 ; align stack to 32 byte boundary
+ mov [rsp + _gpr_save + 8*0], rbp
+ mov [rsp + _gpr_save + 8*1], r12
+ mov [rsp + _gpr_save + 8*2], r13
+ mov [rsp + _gpr_save + 8*3], r14
+ mov [rsp + _gpr_save + 8*4], r15
+ mov [rsp + _rsp_save], rax
+
+ DBGPRINTL "---------- start hmac flush avx512 -----------"
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_sha1] ;empty?
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job (the highest non-null lane in 1..15 wins; idx stays 0 otherwise)
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ cmovne idx, [rel APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+copy_lane_data:
+ ; copy valid lane (idx) to empty lanes: NULL lanes get its data pointer and a length of 0xFFFF
+ vmovdqa ymm0, [state + _lens] ; all 16 word-sized lane lengths
+ mov tmp, [state + _args_data_ptr + PTR_SZ*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr + PTR_SZ*I], tmp
+ vpor ymm0, ymm0, [rel len_masks + 32*I] ; 32 for ymm, 16 for xmm
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+ vmovdqa [state + _lens], ymm0
+
+ vphminposuw xmm1, xmm0 ; minimum over lanes 0..7
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+
+ vmovdqa xmm2, [state + _lens + 8*2] ; lengths of lanes 8..15
+ vphminposuw xmm3, xmm2
+ vpextrw DWORD(len_upper), xmm3, 0 ; min value
+ vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
+
+ cmp len2, len_upper ; both are zero-extended 16-bit values
+ jle use_min ; lanes 0..7 win ties
+
+ vmovdqa xmm1, xmm3
+ mov len2, len_upper
+ mov idx, idx_upper ; idx would be in range 0..7
+ add idx, 8 ; to reflect that index is in 8..F range
+
+use_min:
+ DBGPRINTL64 "FLUSH min_length", len2
+ DBGPRINTL64 "FLUSH min_length index ", idx
+ cmp len2, 0
+ je len_is_0 ; shortest lane has nothing left to hash: skip the x16 call
+
+ vpbroadcastw xmm1, xmm1 ; replicate the min length (word 0) to all 8 words
+ DBGPRINTL_XMM "FLUSH lens after shuffle", xmm1
+
+ vpsubw xmm0, xmm0, xmm1 ; subtract the min length from every lane
+ vmovdqa [state + _lens], xmm0
+ vpsubw xmm2, xmm2, xmm1
+ vmovdqa [state + _lens + 8*2], xmm2
+ DBGPRINTL_XMM "FLUSH lens immediately after min subtraction (0..7)", xmm0
+ DBGPRINTL_XMM "FLUSH lens immediately after min subtraction (8..F)", xmm2
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data] ; lane_data = &state->ldata[idx]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; extra block(s) still pending for this lane
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; outer hash already done: the job is complete
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero 8 bytes of extra_block at size_offset
+ mov word [state + _lens + 2*idx], 1 ; the outer hash input is a single block
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+ vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] ; digest is stored by rows: word k of lane idx is in row k
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap] ; byte-swap each of the four digest words
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ vmovdqa [lane_data + _outer_block], xmm0 ; byte-swapped 20-byte digest goes into outer_block
+ mov [lane_data + _outer_block + 4*4], DWORD(tmp)
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp] ; load the 5 words at _auth_key_xor_opad as the lane's new digest state
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+ jmp copy_lane_data ; go round again to hash the outer block
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens + 2*idx], WORD(extra_blocks) ; lane length = number of extra blocks
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp ; lane now reads from its extra_block buffer
+ mov dword [lane_data + _extra_blocks], 0
+ jmp copy_lane_data
+
+return_null:
+ DBGPRINTL "FLUSH *** ---------- return null"
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane] ; rax = job to return
+ mov qword [lane_data + _job_in_lane], 0
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+
+ mov unused_lanes, [state + _unused_lanes] ; r9 is reused here; lane_data is no longer needed
+ shl unused_lanes, 4 ;; a nibble
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse_sha1], 1
+
+ mov p, [job_rax + _auth_tag_output]
+
+ ; copy 12 bytes (first three digest words, byte-swapped); r12 is free to reuse here
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(r12), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(r12)
+ mov [p + 0*4], DWORD(tmp2)
+ mov [p + 1*4], DWORD(tmp4)
+ mov [p + 2*4], DWORD(r12)
+
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret ; 12-byte tag requested: skip the last two words
+
+ ;; copy remaining 8 bytes to return 20 byte digest
+ mov DWORD(r13), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(r14), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(r13)
+ bswap DWORD(r14)
+ mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(r13)
+ mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(r14)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxorq zmm0, zmm0
+
+ ;; Clear digest (20B), outer_block (20B) and extra_block (64B)
+ ;; of returned job and NULL jobs (every lane whose job pointer is now 0)
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 0*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 1*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 2*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 3*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+ lea lane_data, [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size)]
+
+ ;; Clear first 64 bytes of extra_block
+ vmovdqu64 [lane_data + _extra_block], zmm0
+
+ ;; Clear first 20 bytes of outer_block
+ vmovdqu64 [lane_data + _outer_block], xmm0
+ mov dword [lane_data + _outer_block + 16], 0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+ DBGPRINTL "---------- exit hmac flush avx512 -----------"
+ vzeroupper
+
+ mov rbp, [rsp + _gpr_save + 8*0]
+ mov r12, [rsp + _gpr_save + 8*1]
+ mov r13, [rsp + _gpr_save + 8*2]
+ mov r14, [rsp + _gpr_save + 8*3]
+ mov r15, [rsp + _gpr_save + 8*4]
+ mov rsp, [rsp + _rsp_save] ; back to the entry rsp (undoes the sub and the alignment)
+ ret
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_flush_avx512.asm
new file mode 100644
index 000000000..656e854d5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_flush_avx512.asm
@@ -0,0 +1,28 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+%define SHA224
+%include "avx512/mb_mgr_hmac_sha_256_flush_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_submit_avx512.asm
new file mode 100644
index 000000000..60a98918a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_224_submit_avx512.asm
@@ -0,0 +1,28 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+%define SHA224
+%include "avx512/mb_mgr_hmac_sha_256_submit_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_flush_avx512.asm
new file mode 100644
index 000000000..023eb3454
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_flush_avx512.asm
@@ -0,0 +1,433 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
+;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11
+;; Linux preserves: RBX RBP R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+;; %define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha256_x16_avx512
+
+section .data
+default rel
+align 16
+byteswap: ; vpshufb mask: reverses the byte order inside each 32-bit dword
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+align 32
+len_masks: ; 16 masks of 32 bytes; mask I has only 16-bit word I set to 0xFFFF (ORed into the lengths of idle lanes)
+ dq 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000, 0x0000000000000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x000000000000FFFF
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x00000000FFFF0000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000FFFF00000000
+ dq 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xFFFF000000000000
+
+lane_1: dq 1 ; lane numbers kept in memory as qwords: they are the cmovne source operands of the busy-lane search
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rsi
+%endif
+; Symbolic register names. Several names alias one register, so aliased names must never be live at the same time.
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; Only arg1 (state) is a real argument; arg2/arg3 are used as scratch registers.
+; idx needs to be in rbp, r15 (it has to stay intact across the call to sha256_x16_avx512)
+%define idx rbp
+
+%define unused_lanes r10
+%define tmp5 r10
+
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 arg3
+%define tmp r9
+
+%define len_upper r13
+%define idx_upper r14
+
+; Stack frame: register save area plus the caller's (pre-alignment) stack pointer.
+; we clobber rsi, rbp; called routine also clobbers rax, r9 to r15
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15 (+ rsi, rdi on non-Linux builds)
+_rsp_save: resq 1 ; rsp as it was on entry, before the 32-byte alignment
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_sha_224_avx512(MB_MGR_HMAC_SHA_256_OOO *state)
+; JOB* flush_job_hmac_sha_256_avx512(MB_MGR_HMAC_SHA_256_OOO *state)
+; arg 1 : state; returns in rax a completed job, or NULL when no lane is in use
+align 32
+%ifdef SHA224
+MKGLOBAL(flush_job_hmac_sha_224_avx512,function,internal)
+flush_job_hmac_sha_224_avx512:
+%else
+MKGLOBAL(flush_job_hmac_sha_256_avx512,function,internal)
+flush_job_hmac_sha_256_avx512:
+%endif
+ mov rax, rsp ; remember the caller's rsp; it is restored from _rsp_save on exit
+ sub rsp, STACK_size
+ and rsp, -32 ; align the frame down to a 32-byte boundary
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ ; nothing to flush when no lane is in use: return NULL
+ cmp dword [state + _num_lanes_inuse_sha256], 0
+ jz return_null
+
+ ; find a lane with a non-null job (lane 0 is the default; the highest busy lane among 1..15 wins)
+ xor idx, idx
+
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata_sha256 + (I * _HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ cmovne idx, [rel APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+copy_lane_data:
+ ; copy idx to empty lanes: idle lanes borrow lane idx's data pointer and get length 0xFFFF
+ vmovdqa ymm0, [state + _lens_sha256] ; ymm0 = the 16 lane lengths (one 16-bit word per lane)
+ mov tmp, [state + _args_data_ptr_sha256 + PTR_SZ*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*I], tmp ; idle lane I reads the same data as lane idx
+ vpor ymm0, ymm0, [rel len_masks + 32*I] ; idle lane I length := 0xFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_sha256 ], ymm0
+
+ vphminposuw xmm1, xmm0 ; minimum over lanes 0..7 (low half of ymm0)
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+
+ vmovdqa xmm2, [state + _lens_sha256 + 8*2] ; xmm2 = lengths of lanes 8..15
+ vphminposuw xmm3, xmm2
+ vpextrw DWORD(len_upper), xmm3, 0 ; min value
+ vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
+
+ cmp len2, len_upper
+ jle use_min ; lower half wins ties
+
+ vmovdqa xmm1, xmm3 ; word 0 of xmm1 must hold the overall minimum for the broadcast below
+ mov len2, len_upper
+ mov idx, idx_upper ; idx would be in range 0..7
+ add idx, 8 ; to reflect that index is in 8..F range
+
+use_min:
+ cmp len2, 0
+ je len_is_0 ; lane idx has no blocks left: skip the hash call
+
+ vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1 ; subtract the minimum from lanes 0..7
+ vmovdqa [state + _lens_sha256], xmm0
+ vpsubw xmm2, xmm2, xmm1 ; subtract the minimum from lanes 8..15
+ vmovdqa [state + _lens_sha256 + 8*2], xmm2
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data] ; lane_data = address of lane idx's per-lane record
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; blocks from extra_block are still pending
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; outer hash already done: the job is complete
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero the 8-byte field at size_offset inside extra_block
+ mov word [state + _lens_sha256 + 2*idx], 1 ; the outer hash input is a single block
+ lea tmp, [lane_data + _outer_block]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+
+ vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] ; gather lane idx's digest words 0..3, one per row
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap] ; byte-swap each dword
+ vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] ; digest words 4..6 (and 7 for SHA-256)
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ vpshufb xmm1, xmm1, [rel byteswap]
+
+ vmovdqa [lane_data + _outer_block], xmm0 ; inner digest becomes the message of the outer hash
+ vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ mov dword [lane_data + _outer_block + 7*4], 0x80 ; SHA-224 digest is 7 words, so byte 28 of the block is set to 0x80
+%endif
+
+ mov job, [lane_data + _job_in_lane]
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp] ; load the 8 dwords at job->auth_key_xor_opad ...
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 ; ... and scatter them into lane idx's digest column
+ vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+ jmp copy_lane_data ; go hash the outer block
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) ; lane idx now has extra_blocks blocks to hash
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp ; ... read from extra_block + start_offset
+ mov dword [lane_data + _extra_blocks], 0 ; mark the extra blocks as scheduled
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax ; return value = NULL
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane] ; return value = the finished job
+ mov qword [lane_data + _job_in_lane], 0 ; lane idx no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ shl unused_lanes, 4 ; push lane idx onto the nibble list of unused lanes
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ sub dword [state + _num_lanes_inuse_sha256], 1
+
+ mov p, [job_rax + _auth_tag_output] ; p = destination of the tag
+
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+ jne copy_full_digest ; any length other than 14 gets the full 28 bytes
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+ jne copy_full_digest ; any length other than 16 gets the full 32 bytes
+%endif
+
+ ;; copy SHA224 14 bytes / SHA256 16 bytes
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] ; tmp2 aliases lane_data (rbx), which is overwritten here
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] ; tmp5 aliases unused_lanes (r10), already stored back
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp4)
+%ifdef SHA224
+ mov [p + 3*4], WORD(tmp5) ; only 2 bytes of the fourth word: 14 bytes in total
+%else
+ mov [p + 3*4], DWORD(tmp5)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy SHA224 28 bytes / SHA256 32 bytes
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+ bswap DWORD(tmp5)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp4)
+ mov [p + 3*4], DWORD(tmp5)
+
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] ; second half: words 4..6, plus word 7 for SHA-256 only
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp4)
+%ifndef SHA224
+ bswap DWORD(tmp5)
+%endif
+ mov [p + 4*4], DWORD(tmp)
+ mov [p + 5*4], DWORD(tmp2)
+ mov [p + 6*4], DWORD(tmp4)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxorq zmm0, zmm0 ; zmm0 = 0, the source of the vector wipes below
+
+ ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B)
+ ;; of returned job and NULL jobs (the returned job's lane was set to NULL in end_loop)
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest (28 bytes for SHA-224, 32 bytes for SHA-256)
+%assign J 0
+%rep 7
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+ lea lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)]
+ ;; Clear first 64 bytes of extra_block
+ vmovdqu64 [lane_data + _extra_block], zmm0
+
+ ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+%ifdef SHA224
+ vmovdqa64 [lane_data + _outer_block], xmm0 ; 16 + 8 + 4 = 28 bytes
+ mov qword [lane_data + _outer_block + 16], 0
+ mov dword [lane_data + _outer_block + 24], 0
+%else
+ vmovdqu64 [lane_data + _outer_block], ymm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+ vzeroupper ; reached by every exit path: clear the upper vector state before returning
+
+ mov rbx, [rsp + _gpr_save + 8*0] ; restore the registers saved in the prologue
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_submit_avx512.asm
new file mode 100644
index 000000000..baadef492
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_256_submit_avx512.asm
@@ -0,0 +1,445 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
+;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11
+;; Linux preserves: RBX RBP R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+
+;; %define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha256_x16_avx512
+
+section .data
+default rel
+
+align 16
+byteswap: ; vpshufb mask: reverses the byte order inside each 32-bit dword
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
+; Only arg1/arg2 carry arguments (state, job); arg3/arg4 name scratch registers. Aliased names must not be live together.
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in rbp, r15 (it has to stay intact across the call to sha256_x16_avx512)
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define p2 rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset arg3
+%define tmp2 arg3
+
+%define lane arg4
+%define tmp3 arg4
+
+%define extra_blocks r8
+%define tmp r9
+%define lane_data r10
+
+%define len_upper r13
+%define idx_upper r14
+
+; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r9 to r15
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15 (+ rsi, rdi on non-Linux builds)
+_rsp_save: resq 1 ; rsp as it was on entry, before the 32-byte alignment
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state (rdi on Linux, rcx otherwise)
+; arg 2 : job (rsi on Linux, rdx otherwise); returns in rax a completed job or NULL
+ align 32
+%ifdef SHA224
+MKGLOBAL(submit_job_hmac_sha_224_avx512,function,internal)
+submit_job_hmac_sha_224_avx512:
+%else
+MKGLOBAL(submit_job_hmac_sha_256_avx512,function,internal)
+submit_job_hmac_sha_256_avx512:
+%endif
+ mov rax, rsp ; remember the caller's rsp; it is restored from _rsp_save on exit
+ sub rsp, STACK_size
+ and rsp, -32 ; align the frame down to a 32-byte boundary
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ mov lane, unused_lanes
+ and lane, 0xF ;; just a nibble
+ shr unused_lanes, 4 ; pop that lane off the nibble list of unused lanes
+
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data] ; lane_data = address of the chosen lane's record
+ mov [state + _unused_lanes_sha256], unused_lanes
+
+ add dword [state + _num_lanes_inuse_sha256], 1
+
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+ mov [state + _lens_sha256 + 2*lane], WORD(tmp) ; lane length = number of whole 64-byte blocks
+
+ mov last_len, len
+ and last_len, 63 ; last_len = bytes beyond the last whole block
+ lea extra_blocks, [last_len + 9 + 63] ; blocks needed for last_len bytes + 9 more bytes, rounded up
+ shr extra_blocks, 6
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes] ; p = first byte to hash
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*lane], p
+
+ cmp len, 64
+ jb copy_lt64 ; fewer than 64 bytes: the 64-byte load below would start before p
+
+fast_copy:
+ vmovdqu32 zmm0, [p - 64 + len] ; last 64 bytes of the message
+ vmovdqu32 [lane_data + _extra_block], zmm0 ; ... copied to the first 64 bytes of extra_block
+end_fast_copy:
+
+ mov size_offset, extra_blocks ; size_offset = extra_blocks*64 - last_len + 56
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64 ; start_offset = 64 - last_len: where the tail bytes begin in extra_block
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len] ; bit count of the message plus one extra 64-byte block
+ bswap tmp ; stored in big-endian byte order
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp] ; load the 8 dwords at job->auth_key_xor_ipad ...
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0 ; ... and scatter them into this lane's digest column
+ vpextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+ test len, ~63
+ jnz ge64_bytes ; at least one whole block: hash from the source buffer first
+
+lt64_bytes:
+ mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks) ; no whole block: hash straight from extra_block
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ cmp dword [state + _num_lanes_inuse_sha256], 0x10 ; all 16 lanes used?
+ jne return_null ; not yet: queue the job and return NULL
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_sha256] ; xmm0 = lengths of lanes 0..7
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+
+ vmovdqa xmm2, [state + _lens_sha256 + 8*2] ; xmm2 = lengths of lanes 8..15
+ vphminposuw xmm3, xmm2
+ vpextrw DWORD(len_upper), xmm3, 0 ; min value
+ vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
+
+ cmp len2, len_upper
+ jle use_min ; lower half wins ties
+
+ vmovdqa xmm1, xmm3 ; word 0 of xmm1 must hold the overall minimum for the broadcast below
+ mov len2, len_upper
+ mov idx, idx_upper ; idx is in range 0..7
+ add idx, 8 ; to reflect that real index is in 8..F range
+use_min:
+ cmp len2, 0
+ je len_is_0 ; lane idx has no blocks left: skip the hash call
+
+ vpbroadcastw xmm1, xmm1 ; duplicate words across all lanes
+ vpsubw xmm0, xmm0, xmm1 ; subtract the minimum from lanes 0..7
+ vmovdqa [state + _lens_sha256 + 0*2], xmm0
+ vpsubw xmm2, xmm2, xmm1 ; subtract the minimum from lanes 8..15
+ vmovdqa [state + _lens_sha256 + 8*2], xmm2
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data] ; lane_data = address of lane idx's per-lane record
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; blocks from extra_block are still pending
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop ; outer hash already done: the job is complete
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0 ; zero the 8-byte length field stored at submit time
+ mov word [state + _lens_sha256 + 2*idx], 1 ; the outer hash input is a single block
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
+
+ vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] ; gather lane idx's digest words 0..3, one per row
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap] ; byte-swap each dword
+ vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] ; digest words 4..6 (and 7 for SHA-256)
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+ vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+ vpshufb xmm1, xmm1, [rel byteswap]
+ vmovdqa [lane_data + _outer_block], xmm0 ; inner digest becomes the message of the outer hash
+ vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+ mov dword [lane_data + _outer_block + 7*4], 0x80 ; SHA-224 digest is 7 words, so byte 28 of the block is set to 0x80
+%endif
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp] ; load the 8 dwords at job->auth_key_xor_opad ...
+ vmovdqu xmm1, [tmp + 4*4]
+ vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 ; ... and scatter them into lane idx's digest column
+ vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+ vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+ vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+ vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ ; Schedule the pending blocks held in extra_block for lane idx, then rescan.
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) ; lane idx now has extra_blocks blocks to hash
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp ; PTR_SZ (not a literal 8), as in every other access to this array
+ mov dword [lane_data + _extra_blocks], 0 ; mark the extra blocks as scheduled
+ jmp start_loop
+
+ align 16
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len ; p2 = extra_block + 64 - len, so the copy ends at offset 64
+ memcpy_avx2_64_1 p2, p, len, tmp, tmp2, ymm0, ymm1
+ mov unused_lanes, [state + _unused_lanes_sha256] ; reload: p2 aliases unused_lanes (rbx)
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax ; return value = NULL
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane] ; return value = the finished job
+ mov unused_lanes, [state + _unused_lanes_sha256]
+ mov qword [lane_data + _job_in_lane], 0 ; lane idx no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4 ; push lane idx onto the nibble list of unused lanes
+ or unused_lanes, idx
+ mov [state + _unused_lanes_sha256], unused_lanes
+ sub dword [state + _num_lanes_inuse_sha256], 1
+
+ vzeroupper
+
+ mov p, [job_rax + _auth_tag_output] ; p = destination of the tag
+
+%ifdef SHA224
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+ jne copy_full_digest ; any length other than 14 gets the full 28 bytes
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+ jne copy_full_digest ; any length other than 16 gets the full 32 bytes
+%endif
+
+ ;; copy 14 bytes for SHA224 // 16 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] ; tmp4 aliases unused_lanes (rbx), already stored back
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+%ifdef SHA224
+ mov [p + 3*4], WORD(tmp4) ; only 2 bytes of the fourth word: 14 bytes in total
+%else
+ mov [p + 3*4], DWORD(tmp4)
+%endif
+ jmp clear_ret
+copy_full_digest:
+ ;; copy 28 bytes for SHA224 // 32 bytes for SHA256
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ bswap DWORD(tmp4)
+ mov [p + 0*4], DWORD(tmp)
+ mov [p + 1*4], DWORD(tmp2)
+ mov [p + 2*4], DWORD(tmp3)
+ mov [p + 3*4], DWORD(tmp4)
+
+ mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] ; second half: words 4..6, plus word 7 for SHA-256 only
+ mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+ mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+%ifndef SHA224
+ bswap DWORD(tmp4)
+%endif
+ mov [p + 4*4], DWORD(tmp)
+ mov [p + 5*4], DWORD(tmp2)
+ mov [p + 6*4], DWORD(tmp3)
+%ifndef SHA224
+ mov [p + 7*4], DWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B) of returned job
+%assign J 0
+%rep 7
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+ mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+ vpxorq zmm0, zmm0 ; zmm0 = 0, the source of the vector wipes below
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha256 + lane_data] ; recompute lane_data for lane idx
+ ;; Clear first 64 bytes of extra_block
+ vmovdqu64 [lane_data + _extra_block], zmm0
+
+ ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+%ifdef SHA224
+ vmovdqa64 [lane_data + _outer_block], xmm0 ; 16 + 8 + 4 = 28 bytes
+ mov qword [lane_data + _outer_block + 16], 0
+ mov dword [lane_data + _outer_block + 24], 0
+%else
+ vmovdqu64 [lane_data + _outer_block], ymm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+ vzeroupper ; clear upper vector state on every exit path (NULL return and SAFE_DATA wipe included)
+ mov rbx, [rsp + _gpr_save + 8*0] ; restore the registers saved in the prologue
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_flush_avx512.asm
new file mode 100644
index 000000000..698052730
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_flush_avx512.asm
@@ -0,0 +1,29 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define SHA384
+%include "avx512/mb_mgr_hmac_sha_512_flush_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_submit_avx512.asm
new file mode 100644
index 000000000..0e9f611de
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_384_submit_avx512.asm
@@ -0,0 +1,29 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%define SHA384
+%include "avx512/mb_mgr_hmac_sha_512_submit_avx512.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_flush_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_flush_avx512.asm
new file mode 100644
index 000000000..7d7e56b40
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_flush_avx512.asm
@@ -0,0 +1,384 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, R12-R15
+;;
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha512_x8_avx512
+
+section .data
+default rel
+
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100 (vpshufb control: replicate word 0 into all 8 words)
+ dq 0x0100010001000100, 0x0100010001000100
+
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 (vpshufb control: reverse the bytes of each qword)
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+align 16
+len_masks: ; 16-byte entry I has 0xFFFF in word I; OR-ed into the lens vector for each lane that holds no job
+ ;ddq 0x0000000000000000000000000000FFFF
+ dq 0x000000000000FFFF, 0x0000000000000000
+ ;ddq 0x000000000000000000000000FFFF0000
+ dq 0x00000000FFFF0000, 0x0000000000000000
+ ;ddq 0x00000000000000000000FFFF00000000
+ dq 0x0000FFFF00000000, 0x0000000000000000
+ ;ddq 0x0000000000000000FFFF000000000000
+ dq 0xFFFF000000000000, 0x0000000000000000
+ ;ddq 0x000000000000FFFF0000000000000000
+ dq 0x0000000000000000, 0x000000000000FFFF
+ ;ddq 0x00000000FFFF00000000000000000000
+ dq 0x0000000000000000, 0x00000000FFFF0000
+ ;ddq 0x0000FFFF000000000000000000000000
+ dq 0x0000000000000000, 0x0000FFFF00000000
+ ;ddq 0xFFFF0000000000000000000000000000
+ dq 0x0000000000000000, 0xFFFF000000000000
+
+lane_1: dq 1 ; lane numbers kept in memory as cmovne sources (cmov takes no immediate operand)
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; NOTE: several names below alias one register (arg2, rbx, rax), so aliased names cannot be live at the same time
+; idx needs to be in rbp (the code relies on it being intact after the call to sha512_x8_avx512)
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define tmp5 r9
+%define tmp6 r10
+
+struc STACK
+_gpr_save: resq 7 ; rbx, rbp, r12-r15, plus rdi on Windows only
+_rsp_save: resq 1 ; caller's rsp, captured before the frame is aligned
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+%ifndef SHA384
+; JOB* flush_job_hmac_sha_512_avx512(MB_MGR_HMAC_SHA_512_OOO *state)
+; arg 1 : state
+%define SHA_X_DIGEST_SIZE 512
+MKGLOBAL(flush_job_hmac_sha_512_avx512,function,internal)
+align 64
+flush_job_hmac_sha_512_avx512:
+%else
+; JOB* flush_job_hmac_sha_384_avx512(MB_MGR_HMAC_SHA_512_OOO *state)
+; arg 1 : state
+%define SHA_X_DIGEST_SIZE 384
+MKGLOBAL(flush_job_hmac_sha_384_avx512,function,internal)
+align 64
+flush_job_hmac_sha_384_avx512:
+%endif
+ mov rax, rsp ; remember the caller's rsp; the frame below is realigned
+ sub rsp, STACK_size
+ and rsp, -32 ; round the frame down to a 32-byte boundary
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ bt unused_lanes, 32+3 ; bit 35 set -> return NULL (presumably: every lane is on the unused list - confirm)
+ jc return_null
+
+ ; find a lane with a non-null job: idx defaults to 0 and ends as the highest of lanes 1..7 that holds a job
+ xor idx, idx
+%assign I 1
+%rep 7
+ cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+ cmovne idx, [rel APPEND(lane_, I)]
+%assign I (I+1)
+%endrep
+
+copy_lane_data:
+ ; copy good lane (idx) to empty lanes
+ vmovdqa xmm0, [state + _lens_sha512] ; xmm0 = the eight 16-bit per-lane lengths
+ mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] ; tmp = data pointer of the good lane
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 ; does lane I hold a job?
+ jne APPEND(skip_,I)
+ mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp ; no: point it at the good lane's data
+ vpor xmm0, xmm0, [rel len_masks + 16*I] ; and force its length word to 0xFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ vmovdqa [state + _lens_sha512], xmm0
+
+ vphminposuw xmm1, xmm0 ; word 0 = smallest length, word 1 = its lane index
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0 ; nothing left to hash for that lane
+
+ vpshufb xmm1, [rel dupw] ; duplicate words across all 8 lanes
+ vpsubw xmm0, xmm0, xmm1 ; take the minimum off every lane's length
+ vmovdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = &state->ldata[idx]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; padded tail block(s) still to be hashed
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop ; outer hash already done -> job is complete
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1 ; the next completion of this lane finishes the job
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0 ; zero the 8 bytes at size_offset (submit stores the bit length there)
+ mov word [state + _lens_sha512 + 2*idx], 1 ; length 1: the outer hash input is a single block
+ lea tmp, [lane_data + _outer_block_sha512] ; tmp aliases size_offset (rax), which is no longer needed
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; lane now reads its data from outer_block
+
+ ; move digest into data location (two 64-bit digest words per iteration, byte-swapped within each qword)
+ %assign I 0
+ %rep (SHA_X_DIGEST_SIZE / (8*16))
+ vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE]
+ vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ vpshufb xmm0, [rel byteswap]
+ vmovdqa [lane_data + _outer_block_sha512 + I*2*SHA512_DIGEST_WORD_SIZE], xmm0
+ %assign I (I+1)
+ %endrep
+
+ ; move the opad key into digest (4 x 16 bytes = all eight digest rows of lane idx)
+ mov tmp, [job + _auth_key_xor_opad]
+
+ %assign I 0
+ %rep 4
+ vmovdqu xmm0, [tmp + I * 16]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+ %assign I (I+1)
+ %endrep
+
+ jmp copy_lane_data ; re-run the lane fill and minimum search
+
+ align 32
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+ mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) ; lane length = number of extra blocks still to hash
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset] ; tmp aliases start_offset (rax)
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; lane now reads from extra_block + start_offset
+ mov dword [lane_data + _extra_blocks_sha512], 0 ; so this branch is not taken again for the lane
+ jmp copy_lane_data
+
+return_null:
+ xor job_rax, job_rax ; return value = NULL
+ jmp return
+
+ align 32
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512] ; return value = the completed job
+ mov qword [lane_data + _job_in_lane_sha512], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov unused_lanes, [state + _unused_lanes_sha512] ; unused_lanes aliases lane_data (rbx); lane_data is dead from here
+ shl unused_lanes, 4
+ or unused_lanes, idx ; push idx onto the 4-bit-per-entry unused-lane list
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output] ; p = destination of the authentication tag
+
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest ; any length other than 32 gets the full 64 bytes
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest ; any length other than 24 gets the full 48 bytes
+%endif
+ ;; copy 32 bytes for SHA512 / 24 bytes for SHA384
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2) ; each 64-bit digest word is byte-reversed before it is written out
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 0*8], QWORD(tmp2)
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp5)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes for SHA384
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+ bswap QWORD(tmp6)
+ bswap QWORD(tmp5)
+ mov [p + 0*8], QWORD(tmp2) ; digest words 0..3
+ mov [p + 1*8], QWORD(tmp4)
+ mov [p + 2*8], QWORD(tmp6)
+ mov [p + 3*8], QWORD(tmp5)
+
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp6)
+ bswap QWORD(tmp5)
+%endif
+ mov [p + 4*8], QWORD(tmp2) ; digest words 4..5 (and 6..7 for SHA-512 only)
+ mov [p + 5*8], QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp6)
+ mov [p + 7*8], QWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ vpxorq zmm0, zmm0
+
+ ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of every lane that holds no job
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size) + _job_in_lane_sha512], 0
+ jne APPEND(skip_clear_,I)
+
+ ;; Clear digest (48 bytes for SHA-384, 64 bytes for SHA-512)
+%assign J 0
+%rep 6
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 6*SHA512_DIGEST_ROW_SIZE], 0
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+ lea lane_data, [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size)] ; lane_data aliases rbx, which is free at this point
+ ;; Clear first 128 bytes of extra_block (SHA-512 lane field names, matching the rest of this routine)
+ vmovdqu64 [lane_data + _extra_block_sha512], zmm0
+ vmovdqu64 [lane_data + _extra_block_sha512 + 64], zmm0
+
+ ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block
+%if (SHA_X_DIGEST_SIZE == 384)
+ vmovdqu64 [lane_data + _outer_block_sha512], ymm0
+ vmovdqa64 [lane_data + _outer_block_sha512 + 32], xmm0 ; aligned store, same 16-byte alignment proc_outer relies on
+%else
+ vmovdqu64 [lane_data + _outer_block_sha512], zmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+ vzeroupper ; executed on every exit path, including return_null
+
+ mov rbx, [rsp + _gpr_save + 8*0] ; restore the registers saved in the prologue
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rdi, [rsp + _gpr_save + 8*6]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_submit_avx512.asm
new file mode 100644
index 000000000..a2b66e54f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_sha_512_submit_avx512.asm
@@ -0,0 +1,413 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+
+extern sha512_x8_avx512
+
+section .data
+default rel
+
+align 16
+dupw: ;ddq 0x01000100010001000100010001000100 (vpshufb control: replicate word 0 into all 8 words)
+ dq 0x0100010001000100, 0x0100010001000100
+
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 (vpshufb control: reverse the bytes of each qword)
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rdi
+%define arg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; NOTE: several names below alias one register (arg2, rbp, r11, rbx, rax, arg3, arg4, r9); aliased names cannot be live together
+; idx needs to be in rbp, r13, r14, r16 (NOTE(review): r16 is not a register on this target, presumably r15 was meant - confirm)
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset arg3
+%define tmp2 arg3
+
+%define lane arg4
+%define tmp3 arg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+; Define stack usage
+
+; we clobber rbx, rsi, rdi, rbp; called routine also clobbers r12
+struc STACK
+_gpr_save: resq 8 ; rbx, rbp, r12-r15, plus rsi and rdi on Windows only
+_rsp_save: resq 1 ; caller's rsp, captured before the frame is aligned
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx (Windows) / rdi (Linux) : state
+; arg 2 : rdx (Windows) / rsi (Linux) : job
+align 64
+%ifndef SHA384
+MKGLOBAL(submit_job_hmac_sha_512_avx512,function,internal)
+%define SHA_X_DIGEST_SIZE 512
+submit_job_hmac_sha_512_avx512:
+%else
+MKGLOBAL(submit_job_hmac_sha_384_avx512,function,internal)
+%define SHA_X_DIGEST_SIZE 384
+submit_job_hmac_sha_384_avx512:
+%endif
+
+ mov rax, rsp ; remember the caller's rsp; the frame below is realigned
+ sub rsp, STACK_size
+ and rsp, -32 ; round the frame down to a 32-byte boundary
+ mov [rsp + _gpr_save + 8*0], rbx
+ mov [rsp + _gpr_save + 8*1], rbp
+ mov [rsp + _gpr_save + 8*2], r12
+ mov [rsp + _gpr_save + 8*3], r13
+ mov [rsp + _gpr_save + 8*4], r14
+ mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*6], rsi
+ mov [rsp + _gpr_save + 8*7], rdi
+%endif
+ mov [rsp + _rsp_save], rax ; original SP
+
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ mov lane, unused_lanes
+ and lane, 15 ; lane = lowest nibble of the unused-lane list
+ shr unused_lanes, 4 ; and remove that entry from the list
+ imul lane_data, lane, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = &state->ldata[lane]
+ mov [state + _unused_lanes_sha512], unused_lanes
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 7 ; divide by 128, len in terms of blocks
+
+ mov [lane_data + _job_in_lane_sha512], job
+ mov dword [lane_data + _outer_done_sha512], 0
+ mov [state + _lens_sha512 + 2*lane], WORD(tmp) ; 2 is word size in bytes
+
+ mov last_len, len
+ and last_len, 127 ; bytes in the final partial block
+ lea extra_blocks, [last_len + 17 + 127] ; round (last_len + 17) up to whole blocks; 17 is presumably 0x80 pad byte + 16-byte length
+ shr extra_blocks, 7
+ mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes] ; p = first byte to hash
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p
+
+ cmp len, 128
+ jb copy_lt128 ; fewer than 128 bytes: the zmm copy below would read before p
+
+fast_copy:
+ add p, len ; p = one past the last message byte
+ vmovdqu32 zmm0, [p - 128 + 0*64] ; copy the last 128 message bytes ...
+ vmovdqu32 zmm1, [p - 128 + 1*64]
+ vmovdqu32 [lane_data + _extra_block_sha512 + 0*64], zmm0 ; ... to the start of extra_block
+ vmovdqu32 [lane_data + _extra_block_sha512 + 1*64], zmm1
+end_fast_copy:
+
+ mov size_offset, extra_blocks
+ shl size_offset, 7
+ sub size_offset, last_len
+ add size_offset, 128-8 ; size_offset = 128*extra_blocks - last_len + 120
+ mov [lane_data + _size_offset_sha512], DWORD(size_offset)
+ mov start_offset, 128
+ sub start_offset, last_len ; start_offset = 128 - last_len (start_offset aliases p, r11)
+ mov [lane_data + _start_offset_sha512], DWORD(start_offset)
+
+ lea tmp, [8*128 + 8*len] ; bit count of (128 + len) bytes; the extra 128 is presumably the ipad key block
+ bswap tmp
+ mov [lane_data + _extra_block_sha512 + size_offset], tmp ; stored byte-reversed at size_offset
+
+ mov tmp, [job + _auth_key_xor_ipad]
+
+%assign I 0
+%rep 4
+ vmovdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 0)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+ test len, ~127
+ jnz ge128_bytes ; len >= 128: whole blocks are hashed from the source first
+
+lt128_bytes:
+ mov [state + _lens_sha512 + 2*lane], WORD(extra_blocks) ; only the padded extra block(s) need hashing
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; the slot holds a pointer (PTR_SZ bytes) to the UINT8 data
+ mov dword [lane_data + _extra_blocks_sha512], 0
+
+ge128_bytes:
+ cmp unused_lanes, 0xf ; anything other than 0xf -> return NULL (presumably 0xf means no unused lane remains - confirm)
+ jne return_null
+ jmp start_loop
+
+ align 32
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens_sha512] ; xmm0 = the eight 16-bit per-lane lengths
+ vphminposuw xmm1, xmm0 ; word 0 = smallest length, word 1 = its lane index
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+ cmp len2, 0
+ je len_is_0 ; nothing left to hash for that lane
+
+ vpshufb xmm1, [rel dupw] ; duplicate words across all 8 lanes
+ vpsubw xmm0, xmm0, xmm1 ; take the minimum off every lane's length
+ vmovdqa [state + _lens_sha512], xmm0
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = &state->ldata[idx]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks ; padded tail block(s) still to be hashed
+ cmp dword [lane_data + _outer_done_sha512], 0
+ jne end_loop ; outer hash already done -> job is complete
+
+proc_outer:
+ mov dword [lane_data + _outer_done_sha512], 1 ; the next completion of this lane finishes the job
+ mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+ mov qword [lane_data + _extra_block_sha512 + size_offset], 0 ; zero the stored bit length in extra_block
+ mov word [state + _lens_sha512 + 2*idx], 1 ; length 1: the outer hash input is a single block
+ lea tmp, [lane_data + _outer_block_sha512]
+ mov job, [lane_data + _job_in_lane_sha512]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; lane now reads its data from outer_block
+
+%assign I 0
+%rep (SHA_X_DIGEST_SIZE / (8 * 16))
+ vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 0)*SHA512_DIGEST_ROW_SIZE]
+ vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+ vpshufb xmm0, [rel byteswap] ; reverse the bytes of both digest words
+ vmovdqa [lane_data + _outer_block_sha512 + I * 2 * SHA512_DIGEST_WORD_SIZE], xmm0 ; inner digest becomes the outer hash input
+%assign I (I+1)
+%endrep
+
+ mov tmp, [job + _auth_key_xor_opad] ; reload the lane digest from the opad key (4 x 16 bytes)
+%assign I 0
+%rep 4
+ vmovdqu xmm0, [tmp + I * 16]
+ vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I+0)*SHA512_DIGEST_ROW_SIZE], xmm0
+ vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+ jmp start_loop
+
+ align 32
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+ mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) ; lane length = number of extra blocks still to hash
+ lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+ mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message
+ mov dword [lane_data + _extra_blocks_sha512], 0 ; so this branch is not taken again for the lane
+ jmp start_loop
+
+ align 32
+copy_lt128:
+ ;; less than one message block of data
+ ;; destination: the len bytes that end at offset 128 of the SHA-512 extra block (just before the pre-populated 0x80)
+ lea p2, [lane_data + _extra_block_sha512 + 128] ; same field name the fast_copy path stores to
+ sub p2, len
+ memcpy_avx2_128_1 p2, p, len, tmp4, tmp2, ymm0, ymm1, ymm2, ymm3
+ mov unused_lanes, [state + _unused_lanes_sha512] ; tmp4 aliases unused_lanes (rbx) and was handed to the copy macro: reload
+ jmp end_fast_copy
+
+return_null:
+ vzeroupper ; this exit follows the ymm/zmm copy above; the only other vzeroupper is in end_loop
+ xor job_rax, job_rax ; return value = NULL
+ jmp return
+ align 32
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane_sha512] ; return value = the completed job
+ mov unused_lanes, [state + _unused_lanes_sha512]
+ mov qword [lane_data + _job_in_lane_sha512], 0 ; lane no longer holds a job
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ shl unused_lanes, 4
+ or unused_lanes, idx ; push idx onto the 4-bit-per-entry unused-lane list
+ mov [state + _unused_lanes_sha512], unused_lanes
+
+ mov p, [job_rax + _auth_tag_output] ; p = destination of the authentication tag
+
+ vzeroupper
+
+%if (SHA_X_DIGEST_SIZE != 384)
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+ jne copy_full_digest ; any length other than 32 gets the full 64 bytes
+%else
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+ jne copy_full_digest ; any length other than 24 gets the full 48 bytes
+%endif
+
+ ;; copy 32 bytes for SHA512 / 24 bytes for SHA384
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp) ; each 64-bit digest word is byte-reversed before it is written out
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 0*8], QWORD(tmp)
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 3*8], QWORD(tmp4)
+%endif
+ jmp clear_ret
+
+copy_full_digest:
+ ;; copy 64 bytes for SHA512 / 48 bytes for SHA384
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+ mov [p + 0*8], QWORD(tmp) ; digest words 0..3
+ mov [p + 1*8], QWORD(tmp2)
+ mov [p + 2*8], QWORD(tmp3)
+ mov [p + 3*8], QWORD(tmp4)
+
+ mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+ mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+ bswap QWORD(tmp)
+ bswap QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ bswap QWORD(tmp3)
+ bswap QWORD(tmp4)
+%endif
+ mov [p + 4*8], QWORD(tmp) ; digest words 4..5 (and 6..7 for SHA-512 only)
+ mov [p + 5*8], QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov [p + 6*8], QWORD(tmp3)
+ mov [p + 7*8], QWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job
+%assign J 0
+%rep 6
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE], 0 ; rows 6..7 use the SHA-512 row stride like rows 0..5
+ mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+ vpxorq zmm0, zmm0
+ imul lane_data, idx, _SHA512_LANE_DATA_size
+ lea lane_data, [state + _ldata_sha512 + lane_data] ; lane_data = &state->ldata[idx]
+ ;; Clear first 128 bytes of extra_block (SHA-512 lane field names, matching the rest of this routine)
+ vmovdqu64 [lane_data + _extra_block_sha512], zmm0
+ vmovdqu64 [lane_data + _extra_block_sha512 + 64], zmm0
+ ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block
+%if (SHA_X_DIGEST_SIZE == 384)
+ vmovdqu64 [lane_data + _outer_block_sha512], ymm0
+ vmovdqa64 [lane_data + _outer_block_sha512 + 32], xmm0 ; aligned store, same 16-byte alignment proc_outer relies on
+%else
+ vmovdqu64 [lane_data + _outer_block_sha512], zmm0
+%endif
+ vzeroupper ; zmm0 was written again after end_loop's vzeroupper
+%endif ;; SAFE_DATA
+
+return:
+ mov rbx, [rsp + _gpr_save + 8*0] ; restore the registers saved in the prologue
+ mov rbp, [rsp + _gpr_save + 8*1]
+ mov r12, [rsp + _gpr_save + 8*2]
+ mov r13, [rsp + _gpr_save + 8*3]
+ mov r14, [rsp + _gpr_save + 8*4]
+ mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*6]
+ mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+ mov rsp, [rsp + _rsp_save] ; original SP
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_submit_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_submit_avx512.asm
new file mode 100644
index 000000000..2fe8482a9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/mb_mgr_hmac_submit_avx512.asm
@@ -0,0 +1,402 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; In System V AMD64 ABI
+;; callee saves: RBX, RBP, R12-R15
+;; Windows x64 ABI
+;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11
+;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11
+;; Linux preserves: RBX RBP R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+
+;; %define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+extern sha1_x16_avx512
+
+section .data
+default rel
+
+align 16
+byteswap: ; byte-index pattern that reverses the bytes within each dword (its use site is outside this excerpt)
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; NOTE: several names below alias one register (e.g. unused_lanes, tmp4 and num_lanes_inuse are all r12)
+
+; idx needs to be in rbx, rdi, rbp
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes r12
+%define tmp4 r12
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+%define num_lanes_inuse r12
+%define len_upper r13
+%define idx_upper r14
+%endif
+
+; we clobber rsi, rdi, rbp, r12; called routine clobbers also r9-r15
+struc STACK
+_gpr_save: resq 7 ; rbp, r12-r15, plus rsi and rdi on Windows only
+_rsp_save: resq 1 ; caller's rsp, captured before the frame is aligned
+endstruc
+
+; JOB* submit_job_hmac_avx512(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rcx (Windows) / rdi (Linux) : state
+; arg 2 : rdx (Windows) / rsi (Linux) : job
+MKGLOBAL(submit_job_hmac_avx512,function,internal)
+submit_job_hmac_avx512:
+
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32 ; align to 32 byte boundary
+ mov [rsp + _gpr_save + 8*0], rbp
+ mov [rsp + _gpr_save + 8*1], r12
+ mov [rsp + _gpr_save + 8*2], r13
+ mov [rsp + _gpr_save + 8*3], r14
+ mov [rsp + _gpr_save + 8*4], r15
+%ifndef LINUX
+ mov [rsp + _gpr_save + 8*5], rsi
+ mov [rsp + _gpr_save + 8*6], rdi
+%endif
+ mov [rsp + _rsp_save], rax
+ DBGPRINTL "---------- enter sha1 submit -----------"
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF ;; just a nibble
+ shr unused_lanes, 4
+ imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ DBGPRINTL64 "lane", lane
+ DBGPRINTL64 "unused_lanes", unused_lanes
+
+ add dword [state + _num_lanes_inuse_sha1], 1
+
+ mov len, [job + _msg_len_to_hash_in_bytes]
+ mov tmp, len
+ shr tmp, 6 ; divide by 64, len in terms of blocks
+
+ mov [lane_data + _job_in_lane], job
+ mov dword [lane_data + _outer_done], 0
+ mov [state + _lens + 2*lane], WORD(tmp)
+
+ mov last_len, len
+ DBGPRINTL64 "last_len", last_len
+ and last_len, 63
+ lea extra_blocks, [last_len + 9 + 63]
+ shr extra_blocks, 6
+ DBGPRINTL64 "extra_blocks", extra_blocks
+ mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+ mov p, [job + _src]
+ add p, [job + _hash_start_src_offset_in_bytes]
+ mov [state + _args_data_ptr + PTR_SZ*lane], p
+ cmp len, 64
+ jb copy_lt64
+
+fast_copy:
+ vmovdqu32 zmm0, [p - 64 + len]
+ vmovdqu32 [lane_data + _extra_block], zmm0
+
+end_fast_copy:
+ mov size_offset, extra_blocks
+ shl size_offset, 6
+ sub size_offset, last_len
+ add size_offset, 64-8
+ mov [lane_data + _size_offset], DWORD(size_offset)
+ mov start_offset, 64
+ sub start_offset, last_len
+ mov [lane_data + _start_offset], DWORD(start_offset)
+
+ lea tmp, [8*64 + 8*len]
+ bswap tmp
+ mov [lane_data + _extra_block + size_offset], tmp
+
+ mov tmp, [job + _auth_key_xor_ipad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+ test len, ~63
+ jnz ge64_bytes
+
+lt64_bytes:
+ DBGPRINTL64 "lt64_bytes extra_blocks", extra_blocks
+ DBGPRINTL64 "lt64_bytes start_offset", start_offset
+ mov [state + _lens + 2*lane], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*lane], tmp
+ mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse_sha1]
+ cmp num_lanes_inuse, 0x10 ; all 16 lanes used?
+ jne return_null
+ jmp start_loop
+
+ align 16
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens]
+ vphminposuw xmm1, xmm0
+ vpextrw DWORD(len2), xmm1, 0 ; min value
+ vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+
+ vmovdqa xmm2, [state + _lens + 8*2]
+ vphminposuw xmm3, xmm2
+ vpextrw DWORD(len_upper), xmm3, 0 ; min value
+ vpextrw DWORD(idx_upper), xmm3, 1 ; min index (8...F)
+
+ cmp len2, len_upper
+ jle use_min
+
+ vmovdqa xmm1, xmm3
+ mov len2, len_upper
+ mov idx, idx_upper ; idx would be in range 0..7
+ add idx, 8 ; to reflect that index is in 8..F range
+
+use_min:
+ cmp len2, 0
+ je len_is_0
+
+ DBGPRINTL64 "min_length", len2
+ DBGPRINTL64 "min_length index ", idx
+
+ vpbroadcastw xmm1, xmm1
+ DBGPRINTL_XMM "SUBMIT lens after shuffle", xmm1
+
+ vpsubw xmm0, xmm0, xmm1
+ vmovdqa [state + _lens + 0*2], xmm0
+ vpsubw xmm2, xmm2, xmm1
+ vmovdqa [state + _lens + 8*2], xmm2
+ DBGPRINTL_XMM "lengths after subtraction (0..7)", xmm0
+ DBGPRINTL_XMM "lengths after subtraction (8..F)", xmm2
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+ cmp extra_blocks, 0
+ jne proc_extra_blocks
+ cmp dword [lane_data + _outer_done], 0
+ jne end_loop
+
+proc_outer:
+ mov dword [lane_data + _outer_done], 1
+ mov DWORD(size_offset), [lane_data + _size_offset]
+ mov qword [lane_data + _extra_block + size_offset], 0
+ mov word [state + _lens + 2*idx], 1
+ lea tmp, [lane_data + _outer_block]
+ mov job, [lane_data + _job_in_lane]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+
+ vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+ vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+ vpshufb xmm0, xmm0, [rel byteswap]
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ vmovdqa [lane_data + _outer_block], xmm0
+ mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+
+ mov tmp, [job + _auth_key_xor_opad]
+ vmovdqu xmm0, [tmp]
+ mov DWORD(tmp), [tmp + 4*4]
+ vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+ vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+ mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+
+ jmp start_loop
+
+ align 16
+proc_extra_blocks:
+ mov DWORD(start_offset), [lane_data + _start_offset]
+ mov [state + _lens + 2*idx], WORD(extra_blocks)
+ lea tmp, [lane_data + _extra_block + start_offset]
+ mov [state + _args_data_ptr + PTR_SZ*idx], tmp
+ mov dword [lane_data + _extra_blocks], 0
+ jmp start_loop
+
+ align 16
+copy_lt64:
+ ;; less than one message block of data
+ ;; beginning of source block
+ ;; destination extrablock but backwards by len from where 0x80 pre-populated
+ lea p2, [lane_data + _extra_block + 64]
+ sub p2, len
+ memcpy_avx2_64_1 p2, p, len, tmp4, tmp2, ymm0, ymm1
+ mov unused_lanes, [state + _unused_lanes]
+ jmp end_fast_copy
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ align 16
+end_loop:
+ mov job_rax, [lane_data + _job_in_lane]
+ or dword [job_rax + _status], STS_COMPLETED_HMAC
+ mov qword [lane_data + _job_in_lane], 0
+
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse_sha1], 1
+
+ mov p, [job_rax + _auth_tag_output]
+
+ vzeroupper
+
+ ; copy 12 bytes
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ bswap DWORD(tmp3)
+ mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+ mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3)
+
+ cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12
+ je clear_ret
+
+ ;; copy remaining 8 bytes to return 20 byte digest
+ mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+ mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+ bswap DWORD(tmp)
+ bswap DWORD(tmp2)
+ mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp)
+ mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+ ;; Clear digest (20B), outer_block (20B) and extra_block (64B) of returned job
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 0
+ mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+ vpxorq zmm0, zmm0
+ imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ ;; Clear first 64 bytes of extra_block
+ vmovdqu64 [lane_data + _extra_block], zmm0
+
+ ;; Clear first 20 bytes of outer_block
+ vmovdqu64 [lane_data + _outer_block], xmm0
+ mov dword [lane_data + _outer_block + 16], 0
+%endif
+
+return:
+ DBGPRINTL "---------- exit sha1 submit -----------"
+ mov rbp, [rsp + _gpr_save + 8*0]
+ mov r12, [rsp + _gpr_save + 8*1]
+ mov r13, [rsp + _gpr_save + 8*2]
+ mov r14, [rsp + _gpr_save + 8*3]
+ mov r15, [rsp + _gpr_save + 8*4]
+%ifndef LINUX
+ mov rsi, [rsp + _gpr_save + 8*5]
+ mov rdi, [rsp + _gpr_save + 8*6]
+%endif
+ mov rsp, [rsp + _rsp_save]
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/sha1_x16_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/sha1_x16_avx512.asm
new file mode 100644
index 000000000..d67046ce5
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/sha1_x16_avx512.asm
@@ -0,0 +1,439 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RDX R8 R9 R10 R11 R12 R13 R14 R15
+;; Windows preserves: RBX RCX RBP RSI RDI
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RDX RSI R9 R10 R11 R12 R13 R14 R15
+;; Linux preserves: RBX RCX RBP RDI R8
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/transpose_avx512.asm"
+%include "include/reg_sizes.asm"
+
+section .data
+default rel
+align 64 ; every constant below is 64 bytes, so each label stays 64-byte aligned
+K00_19: ;ddq 0x5A8279995A8279995A8279995A827999
+ ;ddq 0x5A8279995A8279995A8279995A827999
+ ;ddq 0x5A8279995A8279995A8279995A827999
+ ;ddq 0x5A8279995A8279995A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999 ; KT value for rounds 0-19, repeated in all 16 dwords
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 ; KT value for rounds 20-39, repeated in all 16 dwords
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC ; KT value for rounds 40-59, repeated in all 16 dwords
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 ; KT value for rounds 60-79, repeated in all 16 dwords
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b ; vpshufb control: reverses the 4 bytes inside every dword
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi ; argument registers used when LINUX is defined
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%else
+%define arg1 rcx ; argument registers used otherwise (Windows build)
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%endif
+
+%define state arg1 ; pointer to SHA1 args structure (digest rows + lane data pointers)
+%define SIZE arg2 ; number of 64-byte blocks to process, counted down to zero
+%define IDX arg3 ; byte offset into every lane's data; not an input, zeroed on entry
+
+%define A zmm0 ; A-E: SHA-1 working variables, one 32-bit value per lane
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define KT zmm5 ; round constant for the current group of rounds
+%define AA zmm6 ; AA-EE: copy of A-E taken at the start of a block
+%define BB zmm7
+%define CC zmm8
+%define DD zmm9
+%define EE zmm10
+%define TMP0 zmm11 ; TMP0-TMP4: scratch registers
+%define TMP1 zmm12
+%define TMP2 zmm13
+%define TMP3 zmm14
+%define TMP4 zmm15
+
+%define W0 zmm16 ; W0-W15: the 16 live message schedule words
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9 ; inp0-inp7: general purpose registers holding lane data pointers
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro ROTATE_ARGS 0 ; rename A-E for the next round; emits no instructions
+%xdefine TMP_ E ; remember which register E currently names
+%xdefine E D ; E = D
+%xdefine D C ; D = C
+%xdefine C B ; C = B
+%xdefine B A ; B = A
+%xdefine A TMP_ ; A = register that was named E before this rotation
+%endm
+
+%macro PROCESS_LOOP 2 ; one SHA-1 round for all 16 lanes; KT must hold Kt; clobbers TMP0, TMP1
+%define %%WT %1 ; [in] zmm register holding the message word Wt for this round
+%define %%F_IMMED %2 ; [in] vpternlogd truth table that implements Ft
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft (the callers pass 0xCA for Ch, 0x96 for Parity, 0xE8 for Maj)
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ vmovdqa32 TMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd TMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold TMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, TMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, TMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79 4 ; replaces Wt, in place, with the schedule word 16 rounds later
+%define %%WT %1 ; [in/out] Wt on entry, Wt+16 on exit
+%define %%WTp2 %2 ; [in] Wt+2
+%define %%WTp8 %3 ; [in] Wt+8
+%define %%WTp13 %4 ; [in] Wt+13
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96 ; WT = Wt ^ Wt+2 ^ Wt+8 (0x96 = three-way XOR)
+ vpxord %%WT, %%WT, %%WTp13 ; WT = WT ^ Wt+13
+ vprold %%WT, %%WT, 1 ; WT = ROTL_1(WT)
+%endmacro
+
+
+; Loads 64 bytes of the next message block into one zmm register: 32 bytes from
+; each of two lanes, in preparation for the transpose that builds the msg schedule.
+; Each register will contain 32 bytes from one lane plus 32 bytes
+; from another lane.
+; The first 8 registers will contain the first 32 bytes of all lanes,
+; where register X (0 <= X <= 7) will contain bytes 0-31 from lane X in the first half
+; and bytes 0-31 from lane X+8 in the second half.
+; The last 8 registers will contain the last 32 bytes of all lanes,
+; where register Y (8 <= Y <= 15) will contain bytes 32-63 from lane Y-8 in the first half
+; and bytes 32-63 from lane Y in the second half.
+; This method helps reduce the number of shuffles required to transpose the data.
+%macro MSG_SCHED_ROUND_00_15 6
+%define %%Wt %1 ; [out] zmm register to load the next block
+%define %%LANE_IDX %2 ; [in] lane index (0-15)
+%define %%BASE_PTR %3 ; [in] base address of the input data
+%define %%OFFSET_PTR %4 ; [in] offset to get next block of data from the lane
+%define %%TMP1 %5 ; [clobbered] temporary gp register
+%define %%TMP2 %6 ; [clobbered] temporary gp register
+%if (%%LANE_IDX < 8)
+ mov %%TMP1, [%%BASE_PTR + %%LANE_IDX*PTR_SZ] ; data pointer of lane X
+ mov %%TMP2, [%%BASE_PTR + (%%LANE_IDX+8)*PTR_SZ] ; data pointer of lane X+8
+ vmovups YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR] ; low half = bytes 0-31 of lane X
+ vinserti64x4 %%Wt, %%Wt, [%%TMP2+%%OFFSET_PTR], 0x01 ; high half = bytes 0-31 of lane X+8
+%else
+ mov %%TMP1, [%%BASE_PTR + (%%LANE_IDX-8)*PTR_SZ] ; data pointer of lane Y-8
+ mov %%TMP2, [%%BASE_PTR + %%LANE_IDX*PTR_SZ] ; data pointer of lane Y
+ vmovups YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR+32] ; low half = bytes 32-63 of lane Y-8
+ vinserti64x4 %%Wt, %%Wt, [%%TMP2+%%OFFSET_PTR+32], 0x01 ; high half = bytes 32-63 of lane Y
+%endif
+%endmacro
+
+align 64
+; sha1_x16_avx512(args, size): runs SHA-1 over `size` 64-byte blocks in 16 lanes at once
+; arg 1 : pointer to SHA1 args structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+MKGLOBAL(sha1_x16_avx512,function,internal)
+sha1_x16_avx512:
+ ;; Initialize digests
+ vmovdqu32 A, [state + 0*SHA1_DIGEST_ROW_SIZE] ; row n holds digest word n of all 16 lanes
+ vmovdqu32 B, [state + 1*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu32 C, [state + 2*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu32 D, [state + 3*SHA1_DIGEST_ROW_SIZE]
+ vmovdqu32 E, [state + 4*SHA1_DIGEST_ROW_SIZE]
+ DBGPRINTL_ZMM "Sha1-AVX512 incoming transposed digest", A, B, C, D, E
+ DBGPRINTL64 "SIZE", SIZE
+
+ xor IDX, IDX ; IDX = bytes consumed so far from every lane
+
+ ;; Load first blocks of data into ZMM registers before
+ ;; performing a 16x16 32-bit transpose.
+ ;; To speed up the transpose, data is loaded in chunks of 32 bytes,
+ ;; interleaving data between lane X and lane X+8.
+ ;; This way, final shuffles between top half and bottom half
+ ;; of the matrix are avoided.
+ mov inp0, [state + _data_ptr_sha1 + 0*PTR_SZ] ; data pointers of lanes 0-7
+ mov inp1, [state + _data_ptr_sha1 + 1*PTR_SZ]
+ mov inp2, [state + _data_ptr_sha1 + 2*PTR_SZ]
+ mov inp3, [state + _data_ptr_sha1 + 3*PTR_SZ]
+ mov inp4, [state + _data_ptr_sha1 + 4*PTR_SZ]
+ mov inp5, [state + _data_ptr_sha1 + 5*PTR_SZ]
+ mov inp6, [state + _data_ptr_sha1 + 6*PTR_SZ]
+ mov inp7, [state + _data_ptr_sha1 + 7*PTR_SZ]
+
+ TRANSPOSE16_U32_LOAD_FIRST8 W0, W1, W2, W3, W4, W5, W6, W7, \
+ W8, W9, W10, W11, W12, W13, W14, W15, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+
+ mov inp0, [state + _data_ptr_sha1 + 8*PTR_SZ] ; data pointers of lanes 8-15
+ mov inp1, [state + _data_ptr_sha1 + 9*PTR_SZ]
+ mov inp2, [state + _data_ptr_sha1 +10*PTR_SZ]
+ mov inp3, [state + _data_ptr_sha1 +11*PTR_SZ]
+ mov inp4, [state + _data_ptr_sha1 +12*PTR_SZ]
+ mov inp5, [state + _data_ptr_sha1 +13*PTR_SZ]
+ mov inp6, [state + _data_ptr_sha1 +14*PTR_SZ]
+ mov inp7, [state + _data_ptr_sha1 +15*PTR_SZ]
+
+ TRANSPOSE16_U32_LOAD_LAST8 W0, W1, W2, W3, W4, W5, W6, W7, \
+ W8, W9, W10, W11, W12, W13, W14, W15, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+lloop:
+ vmovdqa32 TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK] ; mask that byte-swaps every dword
+
+ add IDX, 64 ; advance to the block after the one held in W0-W15
+
+ TRANSPOSE16_U32 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1, TMP3, TMP4
+ DBGPRINTL_ZMM "Sha1-AVX512 incoming transposed input", W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2 ; byte-swap message word I in every lane
+%assign I (I+1)
+%endrep
+
+ ; Save digests for later addition
+ vmovdqa32 AA, A
+ vmovdqa32 BB, B
+ vmovdqa32 CC, C
+ vmovdqa32 DD, D
+ vmovdqa32 EE, E
+
+ vmovdqa32 KT, [rel K00_19] ; constant for rounds 0-19
+%assign I 0xCA ; Ft truth table, starts as Ch
+%assign J 0 ; index of Wt
+%assign K 2 ; index of Wt+2
+%assign L 8 ; index of Wt+8
+%assign M 13 ; index of Wt+13
+%assign N 0 ; round number
+%rep 64
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %if N = 19
+ vmovdqa32 KT, [rel K20_39] ; switch to rounds 20-39: new constant, Parity
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [rel K40_59] ; switch to rounds 40-59: new constant, Maj
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [rel K60_79] ; switch to rounds 60-79: new constant, Parity
+ %assign I 0x96
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I ; rounds 64-79 of the current block ...
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J, state + _data_ptr_sha1, IDX, inp0, inp1 ; ... while W(J) is refilled from the next block
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ jmp lloop
+
+lastLoop:
+; Need to reset argument rotation values to Round 64 values
+%xdefine TMP_ A ; the 16 rounds expanded above left the names in their round-80 state;
+%xdefine A B ; this path is entered after round 63, so rotate the names
+%xdefine B C ; to what they were when round 64 begins
+%xdefine C D
+%xdefine D E
+%xdefine E TMP_
+
+ ; Process last 16 rounds
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I ; rounds 64-79, no further data to load
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ ; Write out digest
+ ; Digests go back to the same rows they were loaded from (layout unchanged)
+ vmovdqu32 [state + 0*SHA1_DIGEST_ROW_SIZE], A
+ vmovdqu32 [state + 1*SHA1_DIGEST_ROW_SIZE], B
+ vmovdqu32 [state + 2*SHA1_DIGEST_ROW_SIZE], C
+ vmovdqu32 [state + 3*SHA1_DIGEST_ROW_SIZE], D
+ vmovdqu32 [state + 4*SHA1_DIGEST_ROW_SIZE], E
+ DBGPRINTL_ZMM "Sha1-AVX512 outgoing transposed digest", A, B, C, D, E
+
+ ;; update input pointers
+ mov inp0, [state + _data_ptr_sha1 + 0*PTR_SZ] ; lanes 0-7: pointer += IDX (bytes consumed)
+ mov inp1, [state + _data_ptr_sha1 + 1*PTR_SZ]
+ mov inp2, [state + _data_ptr_sha1 + 2*PTR_SZ]
+ mov inp3, [state + _data_ptr_sha1 + 3*PTR_SZ]
+ mov inp4, [state + _data_ptr_sha1 + 4*PTR_SZ]
+ mov inp5, [state + _data_ptr_sha1 + 5*PTR_SZ]
+ mov inp6, [state + _data_ptr_sha1 + 6*PTR_SZ]
+ mov inp7, [state + _data_ptr_sha1 + 7*PTR_SZ]
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [state + _data_ptr_sha1 + 0*PTR_SZ], inp0
+ mov [state + _data_ptr_sha1 + 1*PTR_SZ], inp1
+ mov [state + _data_ptr_sha1 + 2*PTR_SZ], inp2
+ mov [state + _data_ptr_sha1 + 3*PTR_SZ], inp3
+ mov [state + _data_ptr_sha1 + 4*PTR_SZ], inp4
+ mov [state + _data_ptr_sha1 + 5*PTR_SZ], inp5
+ mov [state + _data_ptr_sha1 + 6*PTR_SZ], inp6
+ mov [state + _data_ptr_sha1 + 7*PTR_SZ], inp7
+
+ mov inp0, [state + _data_ptr_sha1 + 8*PTR_SZ] ; lanes 8-15: pointer += IDX (bytes consumed)
+ mov inp1, [state + _data_ptr_sha1 + 9*PTR_SZ]
+ mov inp2, [state + _data_ptr_sha1 + 10*PTR_SZ]
+ mov inp3, [state + _data_ptr_sha1 + 11*PTR_SZ]
+ mov inp4, [state + _data_ptr_sha1 + 12*PTR_SZ]
+ mov inp5, [state + _data_ptr_sha1 + 13*PTR_SZ]
+ mov inp6, [state + _data_ptr_sha1 + 14*PTR_SZ]
+ mov inp7, [state + _data_ptr_sha1 + 15*PTR_SZ]
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [state + _data_ptr_sha1 + 8*PTR_SZ], inp0
+ mov [state + _data_ptr_sha1 + 9*PTR_SZ], inp1
+ mov [state + _data_ptr_sha1 + 10*PTR_SZ], inp2
+ mov [state + _data_ptr_sha1 + 11*PTR_SZ], inp3
+ mov [state + _data_ptr_sha1 + 12*PTR_SZ], inp4
+ mov [state + _data_ptr_sha1 + 13*PTR_SZ], inp5
+ mov [state + _data_ptr_sha1 + 14*PTR_SZ], inp6
+ mov [state + _data_ptr_sha1 + 15*PTR_SZ], inp7
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/sha256_x16_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/sha256_x16_avx512.asm
new file mode 100644
index 000000000..cdbb61ea3
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/sha256_x16_avx512.asm
@@ -0,0 +1,758 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RDX RSI RDI R9 R10 R11 R12 R13 R14 R15
+;; Windows preserves: RCX
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RCX RDX RSI R9 R10 R11 R12 R13 R14 R15
+;; Linux preserves: RDI
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/transpose_avx512.asm"
+%include "include/reg_sizes.asm"
+
+; re-use K256 from sha256_oct_avx2.asm
+extern K256
+
+;; code to compute x16 SHA256 using AVX512
+
+%define APPEND(a,b) a %+ b
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, 8*64, 64
+FIELD _rsp, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+%ifdef LINUX
+; Linux register definitions
+ %define arg1 rdi
+ %define arg2 rsi
+ %define arg3 rcx ; NOTE(review): arg3/arg4 are not the platform's 3rd/4th argument registers;
+ %define arg4 rdx ; presumably they only serve as scratch for IDX/TBL - confirm in the function body
+%else
+; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+ %define arg3 rsi ; same remark as above: rsi/rdi are not argument registers here
+ %define arg4 rdi
+%endif
+
+%define STATE arg1
+%define INP_SIZE arg2
+%define IDX arg3
+%define TBL arg4 ; base of the per-round constant table, indexed as [TBL + round*64]
+
+%define A zmm0 ; A-H: SHA-256 working variables
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8 ; T1 accumulator of the round function
+%define TMP0 zmm9 ; TMP0-TMP6: scratch; TMP3 or TMP6 also carries Kt between rounds
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+%define W0 zmm16 ; W0-W15: the 16 live message schedule words
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9 ; inp0-inp7: general purpose register aliases
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro ROTATE_ARGS 0 ; rename A-H for the next round; emits no instructions
+%xdefine TMP_ H ; remember which register H currently names
+%xdefine H G ; H = G
+%xdefine G F ; G = F
+%xdefine F E ; F = E
+%xdefine E D ; E = D
+%xdefine D C ; D = C
+%xdefine C B ; C = B
+%xdefine B A ; B = A
+%xdefine A TMP_ ; A = register that was named H before this rotation
+%endm
+
+
+;; CH(E, F, G) = (E&F) ^ (~E&G)
+;; MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
+;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
+
+; Main processing loop per round (no message scheduling); expects Kt for this round in TMP3
+%macro PROCESS_LOOP 2
+%define %%WT %1 ; [in] zmm register holding the message word Wt
+%define %%ROUND %2 ; [in] round number, used to index the constant table
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddd T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, T1, %%WT ; T1 = T1 + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpaddd D, D, T1 ; D = D + T1
+
+ vprord H, A, 2 ; ROR_2(A)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vmovdqa32 TMP0, A
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt (entry ROUND+1; NOTE(review): confirm one exists after the last round)
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+; This is supposed to be SKL optimized assuming:
+; vpternlog, vpaddd ports 5,8
+; vprord ports 1,8
+; However, vprord is only working on port 8
+;
+; Main processing loop per round; expects Kt for this round in TMP6
+; Get the msg schedule word 16 from the current, now unnecessary word
+%macro PROCESS_LOOP_00_47 5
+%define %%WT %1 ; [in/out] Wt on entry, Wt+16 on exit
+%define %%ROUND %2 ; [in] round number, used to index the constant table
+%define %%WTp1 %3 ; [in] Wt+1
+%define %%WTp9 %4 ; [in] Wt+9
+%define %%WTp14 %5 ; [in] Wt+14
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ ;; For next value in msg schedule
+ ;; Wt+16 = sigma1(Wt+14) + Wt+9 + sigma0(Wt+1) + Wt
+
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, H, %%WT ; T1 = H + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP6 ; T1 = T1 + Kt
+ vprord H, A, 2 ; ROR_2(A)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vmovdqa32 TMP0, A
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpaddd D, D, T1 ; D = D + T1
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2) (Kt already consumed, TMP6 reused as scratch)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+ ; Wt-7 + sigma0(Wt-15)
+
+ vmovdqa32 TMP6, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_63 4 ; replaces Wt, in place, with the schedule word 16 rounds later; clobbers TMP4-TMP6
+%define %%WT %1 ; [in/out] Wt on entry, Wt+16 on exit
+%define %%WTp1 %2 ; [in] Wt+1
+%define %%WTp9 %3 ; [in] Wt+9
+%define %%WTp14 %4 ; [in] Wt+14
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+ ; Wt-7 + sigma0(Wt-15)
+%endmacro
+
+; Loads 64 bytes of the next message block into one zmm register: 32 bytes from
+; each of two lanes, in preparation for the transpose that builds the msg schedule.
+; Each register will contain 32 bytes from one lane plus 32 bytes
+; from another lane.
+; The first 8 registers will contain the first 32 bytes of all lanes,
+; where register X (0 <= X <= 7) will contain bytes 0-31 from lane X in the first half
+; and bytes 0-31 from lane X+8 in the second half.
+; The last 8 registers will contain the last 32 bytes of all lanes,
+; where register Y (8 <= Y <= 15) will contain bytes 32-63 from lane Y-8 in the first half
+; and bytes 32-63 from lane Y in the second half.
+; This method helps reduce the number of shuffles required to transpose the data.
+%macro MSG_SCHED_ROUND_00_15 6
+%define %%Wt %1 ; [out] zmm register to load the next block
+%define %%LANE_IDX %2 ; [in] lane index (0-15)
+%define %%BASE_PTR %3 ; [in] base address of the input data
+%define %%OFFSET_PTR %4 ; [in] offset to get next block of data from the lane
+%define %%TMP1 %5 ; [clobbered] temporary gp register
+%define %%TMP2 %6 ; [clobbered] temporary gp register
+%if (%%LANE_IDX < 8)
+ mov %%TMP1, [%%BASE_PTR + %%LANE_IDX*PTR_SZ] ; data pointer of lane X
+ mov %%TMP2, [%%BASE_PTR + (%%LANE_IDX+8)*PTR_SZ] ; data pointer of lane X+8
+ vmovups YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR] ; low half = bytes 0-31 of lane X
+ vinserti64x4 %%Wt, %%Wt, [%%TMP2+%%OFFSET_PTR], 0x01 ; high half = bytes 0-31 of lane X+8
+%else
+ mov %%TMP1, [%%BASE_PTR + (%%LANE_IDX-8)*PTR_SZ] ; data pointer of lane Y-8
+ mov %%TMP2, [%%BASE_PTR + %%LANE_IDX*PTR_SZ] ; data pointer of lane Y
+ vmovups YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR+32] ; low half = bytes 32-63 of lane Y-8
+ vinserti64x4 %%Wt, %%Wt, [%%TMP2+%%OFFSET_PTR+32], 0x01 ; high half = bytes 32-63 of lane Y
+%endif
+%endmacro
+
+ section .data
+default rel
+align 64
+;; SHA-256 round constants K[0..63].
+;; Every constant occupies one 64-byte row: the 32-bit value repeated sixteen
+;; times, so a single 64-byte load gives the same K in every 32-bit element.
+;; Row t lives at TABLE + t*64.
+TABLE:
+ times 16 dd 0x428a2f98 ; K[0]
+ times 16 dd 0x71374491
+ times 16 dd 0xb5c0fbcf
+ times 16 dd 0xe9b5dba5
+ times 16 dd 0x3956c25b
+ times 16 dd 0x59f111f1
+ times 16 dd 0x923f82a4
+ times 16 dd 0xab1c5ed5
+ times 16 dd 0xd807aa98 ; K[8]
+ times 16 dd 0x12835b01
+ times 16 dd 0x243185be
+ times 16 dd 0x550c7dc3
+ times 16 dd 0x72be5d74
+ times 16 dd 0x80deb1fe
+ times 16 dd 0x9bdc06a7
+ times 16 dd 0xc19bf174
+ times 16 dd 0xe49b69c1 ; K[16]
+ times 16 dd 0xefbe4786
+ times 16 dd 0x0fc19dc6
+ times 16 dd 0x240ca1cc
+ times 16 dd 0x2de92c6f
+ times 16 dd 0x4a7484aa
+ times 16 dd 0x5cb0a9dc
+ times 16 dd 0x76f988da
+ times 16 dd 0x983e5152 ; K[24]
+ times 16 dd 0xa831c66d
+ times 16 dd 0xb00327c8
+ times 16 dd 0xbf597fc7
+ times 16 dd 0xc6e00bf3
+ times 16 dd 0xd5a79147
+ times 16 dd 0x06ca6351
+ times 16 dd 0x14292967
+ times 16 dd 0x27b70a85 ; K[32]
+ times 16 dd 0x2e1b2138
+ times 16 dd 0x4d2c6dfc
+ times 16 dd 0x53380d13
+ times 16 dd 0x650a7354
+ times 16 dd 0x766a0abb
+ times 16 dd 0x81c2c92e
+ times 16 dd 0x92722c85
+ times 16 dd 0xa2bfe8a1 ; K[40]
+ times 16 dd 0xa81a664b
+ times 16 dd 0xc24b8b70
+ times 16 dd 0xc76c51a3
+ times 16 dd 0xd192e819
+ times 16 dd 0xd6990624
+ times 16 dd 0xf40e3585
+ times 16 dd 0x106aa070
+ times 16 dd 0x19a4c116 ; K[48]
+ times 16 dd 0x1e376c08
+ times 16 dd 0x2748774c
+ times 16 dd 0x34b0bcb5
+ times 16 dd 0x391c0cb3
+ times 16 dd 0x4ed8aa4a
+ times 16 dd 0x5b9cca4f
+ times 16 dd 0x682e6ff3
+ times 16 dd 0x748f82ee ; K[56]
+ times 16 dd 0x78a5636f
+ times 16 dd 0x84c87814
+ times 16 dd 0x8cc70208
+ times 16 dd 0x90befffa
+ times 16 dd 0xa4506ceb
+ times 16 dd 0xbef9a3f7
+ times 16 dd 0xc67178f2 ; K[63]
+
+;; vpshufb control used to reverse the byte order inside every 32-bit word.
+;; The same 16-byte pattern is emitted four times to fill 64 bytes.
+PSHUFFLE_BYTE_FLIP_MASK:
+%rep 4
+ ;ddq 0x0c0d0e0f08090a0b0405060700010203
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+%endrep
+
+section .text
+
+;; sha256_x16_avx512
+;; Runs the SHA-256 block function for 16 lanes at once; the working
+;; variables A..H and the schedule words W0..W15 each hold one value per lane.
+;; arg 1 : STATE    : pointer to SHA256 args structure. The code reads and
+;;                    writes eight digest rows at STATE + n*SHA256_DIGEST_ROW_SIZE
+;;                    and 16 lane data pointers at STATE + _data_ptr_sha256.
+;; arg 2 : INP_SIZE : size (in blocks) ;; assumed to be >= 1
+;; On return the digest rows hold the updated state and every lane data
+;; pointer has been advanced by the number of bytes consumed (IDX).
+;; NOTE(review): the previous header also carried the prototype
+;;   void sha256_x16_avx512(void **input_data, UINT128 *digest[16], UINT64 size)
+;; with rcx/rdx/r8 arguments and a size in bytes. The body below does not
+;; match that description - confirm against the callers.
+MKGLOBAL(sha256_x16_avx512,function,internal)
+align 64
+sha256_x16_avx512:
+ ;; Build a 64-byte aligned frame; the caller's rsp is kept in the frame
+ ;; at _rsp so it can be restored just before returning.
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _rsp], rax
+
+ ;; Initialize digests
+ vmovdqu32 A, [STATE + 0*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 B, [STATE + 1*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 C, [STATE + 2*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 D, [STATE + 3*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 E, [STATE + 4*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 F, [STATE + 5*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 G, [STATE + 6*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu32 H, [STATE + 7*SHA256_DIGEST_ROW_SIZE]
+
+ ;; TBL = base of the round-constant table, one 64-byte row per round
+ lea TBL, [rel TABLE]
+
+ ; Do we need to transpose digests???
+ ; SHA1 does not, but SHA256 has been
+ ; NOTE(review): the sentence above is cut off in the original. The digest
+ ; rows are used below exactly as loaded, with no transpose.
+
+ ;; IDX = byte offset into every lane's input buffer
+ xor IDX, IDX
+
+ ;; Load first blocks of data into ZMM registers before
+ ;; performing a 16x16 32-bit transpose.
+ ;; To speed up the transpose, data is loaded in chunks of 32 bytes,
+ ;; interleaving data between lane X and lane X+8.
+ ;; This way, final shuffles between top half and bottom half
+ ;; of the matrix are avoided.
+ ;; inp0-inp7 = data pointers of lanes 0-7
+ mov inp0, [STATE + _data_ptr_sha256 + 0*PTR_SZ]
+ mov inp1, [STATE + _data_ptr_sha256 + 1*PTR_SZ]
+ mov inp2, [STATE + _data_ptr_sha256 + 2*PTR_SZ]
+ mov inp3, [STATE + _data_ptr_sha256 + 3*PTR_SZ]
+ mov inp4, [STATE + _data_ptr_sha256 + 4*PTR_SZ]
+ mov inp5, [STATE + _data_ptr_sha256 + 5*PTR_SZ]
+ mov inp6, [STATE + _data_ptr_sha256 + 6*PTR_SZ]
+ mov inp7, [STATE + _data_ptr_sha256 + 7*PTR_SZ]
+
+ TRANSPOSE16_U32_LOAD_FIRST8 W0, W1, W2, W3, W4, W5, W6, W7, \
+ W8, W9, W10, W11, W12, W13, W14, W15, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+
+ ;; inp0-inp7 are reused for the data pointers of lanes 8-15
+ mov inp0, [STATE + _data_ptr_sha256 + 8*PTR_SZ]
+ mov inp1, [STATE + _data_ptr_sha256 + 9*PTR_SZ]
+ mov inp2, [STATE + _data_ptr_sha256 +10*PTR_SZ]
+ mov inp3, [STATE + _data_ptr_sha256 +11*PTR_SZ]
+ mov inp4, [STATE + _data_ptr_sha256 +12*PTR_SZ]
+ mov inp5, [STATE + _data_ptr_sha256 +13*PTR_SZ]
+ mov inp6, [STATE + _data_ptr_sha256 +14*PTR_SZ]
+ mov inp7, [STATE + _data_ptr_sha256 +15*PTR_SZ]
+
+ TRANSPOSE16_U32_LOAD_LAST8 W0, W1, W2, W3, W4, W5, W6, W7, \
+ W8, W9, W10, W11, W12, W13, W14, W15, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+
+ ;; One iteration of lloop processes one block of every lane.
+ ;; On entry W0-W15 hold the loaded, not yet transposed, block data.
+ align 32
+lloop:
+ ;; TMP2 = byte-flip shuffle mask, used by the vpshufb loop below
+ vmovdqa32 TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ ;; Step the input offset past the block that is already in W0-W15, so
+ ;; IDX addresses the following block of every lane.
+ add IDX, 64
+
+ TRANSPOSE16_U32 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1, TMP4, TMP5
+
+ ;; Reverse the byte order of every 32-bit message word in W0-W15
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first 48 rounds
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ ; I is the round number. W[J] is consumed by round I and then rewritten
+ ; in place from W[J], W[J+1], W[J+9] and W[J+14] (indices modulo 16).
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 48
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+ ; Check if this is the last block
+ ; (the last block must not trigger a load of data beyond it)
+ sub INP_SIZE, 1
+ je lastLoop
+
+ ; Process last 16 rounds
+ ; Read in next block msg data for use in first 16 words of msg sched
+ ; (each W[J] is reloaded as soon as its round has consumed it; inp0 and
+ ; inp1 are passed to the load macro as scratch registers)
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J, STATE + _data_ptr_sha256, IDX, inp0, inp1
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop
+
+ ;; Final block: same 16 rounds as above but without loading more input
+lastLoop:
+ ; Process last 16 rounds
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ ; Write out digest
+ ; Do we need to untranspose digests???
+ vmovdqu32 [STATE + 0*SHA256_DIGEST_ROW_SIZE], A
+ vmovdqu32 [STATE + 1*SHA256_DIGEST_ROW_SIZE], B
+ vmovdqu32 [STATE + 2*SHA256_DIGEST_ROW_SIZE], C
+ vmovdqu32 [STATE + 3*SHA256_DIGEST_ROW_SIZE], D
+ vmovdqu32 [STATE + 4*SHA256_DIGEST_ROW_SIZE], E
+ vmovdqu32 [STATE + 5*SHA256_DIGEST_ROW_SIZE], F
+ vmovdqu32 [STATE + 6*SHA256_DIGEST_ROW_SIZE], G
+ vmovdqu32 [STATE + 7*SHA256_DIGEST_ROW_SIZE], H
+
+ ; update input pointers
+ ; (IDX grew by 64 per processed block, so this moves every lane pointer
+ ; past the data consumed by this call)
+%assign I 0
+%rep 16
+ add [STATE + _data_ptr_sha256 + I*PTR_SZ], IDX
+%assign I (I+1)
+%endrep
+
+%ifdef SAFE_DATA
+ ;; Clear stack frame (8*64 bytes)
+ ;; i.e. overwrite the eight 64-byte slots starting at rsp with zeros
+ vpxorq zmm0, zmm0
+%assign i 0
+%rep 8
+ vmovdqa64 [rsp + i*64], zmm0
+%assign i (i+1)
+%endrep
+%endif
+
+ ;; Restore the caller's stack pointer saved in the prologue
+ mov rsp, [rsp + _rsp]
+ ret
+
+;; Linux builds only: emit an empty .note.GNU-stack section flagged noexec.
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx512/sha512_x8_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/sha512_x8_avx512.asm
new file mode 100644
index 000000000..48532c3fb
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx512/sha512_x8_avx512.asm
@@ -0,0 +1,595 @@
+;;
+;; Copyright (c) 2017-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Stack must be aligned to 32 bytes before call
+;;
+;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; -----------------------------------------------------------
+;; Windows clobbers: RAX RDX RDI R8 R9 R10 R11 R12 R13 R14 R15
+;; Windows preserves: RBX RCX RBP RSI
+;; -----------------------------------------------------------
+;; Linux clobbers: RAX RDX RSI R8 R9 R10 R11 R12 R13 R14 R15
+;; Linux preserves: RBX RCX RBP RDI
+;; -----------------------------------------------------------
+;; Clobbers ZMM0-31
+
+;; code to compute 8-lane (x8) SHA512 using AVX512
+
+%include "include/os.asm"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/transpose_avx512.asm"
+
+;; Token-pasting helper: APPEND(W,3) expands to the single identifier W3.
+%define APPEND(a,b) a %+ b
+
+;; Per-OS register names. arg1 and arg2 are the first two integer argument
+;; registers of the respective calling convention.
+;; NOTE(review): arg3/arg4 are NOT the ABI's third/fourth argument registers
+;; on either OS (SysV uses rdx, rcx; Windows uses r8, r9). In the code visible
+;; here only arg4 is used, as the IDX scratch register - confirm nothing
+;; relies on them as real arguments.
+%ifdef LINUX
+; Linux register definitions
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rcx
+%define arg4 rdx
+%else
+; Windows definitions
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 rsi
+%define arg4 rdi
+%endif
+
+;; Function inputs
+%define STATE arg1
+%define INP_SIZE arg2
+
+;; Scratch registers: IDX is presumably the running input offset and TBL the
+;; round-constant table base (PROCESS_LOOP addresses Kt through TBL)
+%define IDX arg4
+%define TBL r8
+
+;; XMM_SAVE is retained: only the low (XMM) half of the YMM registers would
+;; need saving, not the upper half.
+;; XMM_SAVE = (15-5)*16 = 160 bytes, i.e. room for ten 16-byte registers.
+;; NOTE(review): SZ8, DIGEST_SZ and DIGEST_SAVE expand without surrounding
+;; parentheses; that is only safe where they are used as plain products.
+%define NUM_LANES 8
+%define XMM_SAVE (15-5)*16
+%define SZ 8
+%define SZ8 8 * SZ
+%define DIGEST_SZ 8 * SZ8
+%define DIGEST_SAVE NUM_LANES * DIGEST_SZ
+%define RSP_SAVE 1*8
+
+
+; Define Stack Layout
+; Each FIELD line gives name, size in bytes and alignment; STACK_SPACE ends
+; up as the total size of the frame.
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, NUM_LANES*8*64, 64
+FIELD _XMM_SAVE, XMM_SAVE, 16
+FIELD _RSP, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+;; General-purpose registers named inp0-inp7 (presumably per-lane input
+;; pointers, as in the x16 SHA-256 code). Note that inp7 is rax.
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+;; SHA-512 working variables A..H. These names are re-bound after every
+;; round by ROTATE_ARGS, so the mapping below is only the initial one.
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+;; Round temporaries. PROCESS_LOOP uses T1 and TMP0-TMP3 (TMP3 carries Kt
+;; between rounds); MSG_SCHED_ROUND_16_79 uses TMP4-TMP6.
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+
+;; Sixteen message-schedule words, one ZMM register each
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+; from sha256_fips180-2.pdf
+; define rotates for Sigma function for main loop steps
+; (all three values of each group are right-rotate counts, used with vprorq)
+%define BIG_SIGMA_0_0 28 ; Sigma0
+%define BIG_SIGMA_0_1 34
+%define BIG_SIGMA_0_2 39
+%define BIG_SIGMA_1_0 14 ; Sigma1
+%define BIG_SIGMA_1_1 18
+%define BIG_SIGMA_1_2 41
+
+; define rotates for Sigma function for scheduling steps
+; (the _0 and _1 values are right-rotate counts; the _2 value is a logical
+; right SHIFT count, used with vpsrlq)
+%define SMALL_SIGMA_0_0 1 ; sigma0
+%define SMALL_SIGMA_0_1 8
+%define SMALL_SIGMA_0_2 7
+%define SMALL_SIGMA_1_0 19 ; sigma1
+%define SMALL_SIGMA_1_1 61
+%define SMALL_SIGMA_1_2 6
+
+; Total number of rounds, and the number of rounds left once 16 are set aside
+%define SHA_MAX_ROUNDS 80
+%define SHA_ROUNDS_LESS_16 (SHA_MAX_ROUNDS - 16)
+
+;; Re-binds the names A..H by one position without moving any data:
+;; the register that was called H becomes A, the old A becomes B, and so on
+;; down to the old G becoming H. TMP_ only carries the old H binding across
+;; the chain; %xdefine is required so each name is expanded when it is
+;; assigned rather than when it is later used.
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+;; CH(A, B, C) = (A&B) ^ (~A&C)
+;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G)
+;; SIGMA0 = ROR_28 ^ ROR_34 ^ ROR_39
+;; SIGMA1 = ROR_14 ^ ROR_18 ^ ROR_41
+;; sigma0 = ROR_1 ^ ROR_8 ^ SHR_7
+;; sigma1 = ROR_19 ^ ROR_61 ^ SHR_6
+
+;; One SHA-512 round applied to every lane (same role as a ROUND_00_15 macro).
+;; %1 : ZMM register holding the message word Wt of this round (read only)
+;; %2 : round number t; only used to address the constant of round t+1
+;; In    : TMP3 = Kt
+;; Out   : TMP3 = K(t+1), names A..H rotated by one position
+;; Scratch: T1, TMP0, TMP1, TMP2
+;; NOTE(review): for the highest round number the final load reads the
+;; 64-byte row just past the last constant - confirm that data is readable.
+%macro PROCESS_LOOP 2
+%define %%MSG_WORD %1
+%define %%RND %2
+ ;; Round equations:
+ ;; T1 = H + BIG_SIGMA_1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = BIG_SIGMA_0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+ ;; No data is shuffled for the renaming: D is updated in place to become
+ ;; the next E, H is overwritten with the next A, and ROTATE_ARGS then
+ ;; re-binds the names.
+
+ ;; ---- T1: constant and message word first (Kt must be used before TMP3 is recycled)
+ vpaddq T1, H, TMP3 ; T1 = H + Kt
+ vpaddq T1, T1, %%MSG_WORD ; T1 += Wt
+
+ ;; ---- T1: CH(E,F,G), bitwise select of F or G under control of E
+ vmovdqa32 TMP0, E
+ vpternlogq TMP0, F, G, 0xCA ; TMP0 = (E & F) | (~E & G)
+ vpaddq T1, T1, TMP0 ; T1 += CH(E,F,G)
+
+ ;; ---- T1: BIG_SIGMA_1(E), three rotations folded with a 3-way XOR
+ vprorq TMP1, E, BIG_SIGMA_1_0 ; E ror 14
+ vprorq TMP2, E, BIG_SIGMA_1_1 ; E ror 18
+ vprorq TMP3, E, BIG_SIGMA_1_2 ; E ror 41
+ vpternlogq TMP1, TMP2, TMP3, 0x96 ; TMP1 = TMP1 ^ TMP2 ^ TMP3
+ vpaddq T1, T1, TMP1 ; T1 complete
+
+ ;; ---- next E
+ vpaddq D, D, T1 ; D = D + T1
+
+ ;; ---- T2, assembled directly in H
+ vmovdqa32 TMP0, A
+ vpternlogq TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C), bitwise majority
+ vprorq H, A, BIG_SIGMA_0_0 ; A ror 28
+ vprorq TMP2, A, BIG_SIGMA_0_1 ; A ror 34
+ vprorq TMP3, A, BIG_SIGMA_0_2 ; A ror 39
+ vpternlogq H, TMP2, TMP3, 0x96 ; H = BIG_SIGMA_0(A)
+ vpaddq H, H, TMP0 ; H = T2
+ vpaddq H, H, T1 ; H = T1 + T2 = next A
+
+ ;; ---- fetch the constant for the following round
+ vmovdqa32 TMP3, [TBL + ((%%RND+1)*64)] ; K(t+1)
+
+ ;; ---- rename the working variables for the next round
+ ROTATE_ARGS
+%endmacro
+
+;; Message-schedule step: replaces W(t-16) in place with
+;;   Wt = W(t-16) + sigma_1(W(t-2)) + W(t-7) + sigma_0(W(t-15))
+;; %1 : register holding W(t-16) on entry, Wt on exit
+;; %2 : register holding W(t-15) (read only)
+;; %3 : register holding W(t-7)  (read only)
+;; %4 : register holding W(t-2)  (read only)
+;; Scratch: TMP4, TMP5, TMP6
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%W_NEW %1
+%define %%W_M15 %2
+%define %%W_M7 %3
+%define %%W_M2 %4
+ ;; sigma_1(W(t-2)) = ror19 ^ ror61 ^ shr6
+ vpsrlq TMP6, %%W_M2, SMALL_SIGMA_1_2 ; logical shift right by 6
+ vprorq TMP5, %%W_M2, SMALL_SIGMA_1_1 ; rotate right by 61
+ vprorq TMP4, %%W_M2, SMALL_SIGMA_1_0 ; rotate right by 19
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = TMP4 ^ TMP5 ^ TMP6
+
+ vpaddq %%W_NEW, %%W_NEW, TMP4 ; W(t-16) + sigma_1(W(t-2))
+ vpaddq %%W_NEW, %%W_NEW, %%W_M7 ; ... + W(t-7)
+
+ ;; sigma_0(W(t-15)) = ror1 ^ ror8 ^ shr7
+ vpsrlq TMP6, %%W_M15, SMALL_SIGMA_0_2 ; logical shift right by 7
+ vprorq TMP5, %%W_M15, SMALL_SIGMA_0_1 ; rotate right by 8
+ vprorq TMP4, %%W_M15, SMALL_SIGMA_0_0 ; rotate right by 1
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = TMP4 ^ TMP5 ^ TMP6
+
+ vpaddq %%W_NEW, %%W_NEW, TMP4 ; ... + sigma_0(W(t-15)) = Wt
+%endmacro
+
+section .data
+default rel
+
+align 64
+; 80 constants for SHA512
+; replicating for each lane, thus 8*80
+; to aid in SIMD .. space tradeoff for time!
+; local to asm file, used nowhere else
+TABLE:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 64
+; vpshufb control mask (64 bytes): reverses the byte order inside every quad word, i.e. big endian <-> little endian conversion over a quad word, for a whole ZMM
+;; shuffle on ZMM is shuffle on 4 XMM size chunks, 128 bits; each dq pair below is the mask for one such chunk
+PSHUFFLE_BYTE_FLIP_MASK:
+ ;ddq 0x08090a0b0c0d0e0f0001020304050607
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f ; chunk 0, low qword first; the ;ddq line above is the same value written as one 128-bit constant
+ ;ddq 0x18191a1b1c1d1e1f1011121314151617
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f ; chunk 1; NOTE(review): per the VPSHUFB definition only the low 4 index bits select within a chunk, so 0x1x acts like 0x0x - confirm
+ ;ddq 0x28292a2b2c2d2e2f2021222324252627
+ dq 0x2021222324252627, 0x28292a2b2c2d2e2f ; chunk 2
+ ;ddq 0x38393a3b3c3d3e3f3031323334353637
+ dq 0x3031323334353637, 0x38393a3b3c3d3e3f ; chunk 3
+
+section .text
+
+;; void sha512_x8_avx512(void *input_data, UINT64 *digest[NUM_LANES], const int size)
+;; In : STATE    : base of the digest rows (STATE + k*SHA512_DIGEST_ROW_SIZE) and of the 8 lane data pointers (STATE + _data_ptr_sha512 + k*PTR_SZ)
+;; In : INP_SIZE : size in message block lengths (= 128 bytes); must be >= 1 because it is decremented before it is tested
+;; NOTE(review): the prototype above and the former "arg 1 : rcx" / "arg 2 : rdx" notes look stale; STATE and INP_SIZE are defined outside this section - confirm their registers
+MKGLOBAL(sha512_x8_avx512,function,internal)
+align 64
+sha512_x8_avx512:
+ mov rax, rsp ; keep the caller's rsp so it can be restored on exit
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _RSP], rax ; caller's rsp is parked in the frame's _RSP slot
+
+ ;; Load working digests A..H; organized uint64 digest[8][num_lanes], one row per load; no transpose required
+ ;; NOTE(review): an earlier comment called the digest "an array of pointers"; the loads below read the rows directly from STATE
+ vmovdqu32 A, [STATE + 0*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 B, [STATE + 1*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 C, [STATE + 2*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 D, [STATE + 3*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 E, [STATE + 4*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 F, [STATE + 5*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 G, [STATE + 6*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu32 H, [STATE + 7*SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[rel TABLE] ; TBL -> TABLE; its first entry is fetched as "First K" in lloop
+ xor IDX, IDX ; IDX = 0; it grows by 128 for every block processed
+ ;; Read the lane data pointers into inp0..inp7; they are handed to the load macro
+ ;; together with IDX, and IDX is what the visible code advances from block to block
+ mov inp0, [STATE + _data_ptr_sha512 + 0*PTR_SZ]
+ mov inp1, [STATE + _data_ptr_sha512 + 1*PTR_SZ]
+ mov inp2, [STATE + _data_ptr_sha512 + 2*PTR_SZ]
+ mov inp3, [STATE + _data_ptr_sha512 + 3*PTR_SZ]
+ mov inp4, [STATE + _data_ptr_sha512 + 4*PTR_SZ]
+ mov inp5, [STATE + _data_ptr_sha512 + 5*PTR_SZ]
+ mov inp6, [STATE + _data_ptr_sha512 + 6*PTR_SZ]
+ mov inp7, [STATE + _data_ptr_sha512 + 7*PTR_SZ]
+ jmp lloop ; lloop follows directly; presumably this only steps over padding from the "align 32" below
+
+align 32
+lloop:
+ ;; Load 64-byte blocks of data into ZMM registers before
+ ;; performing a 8x8 64-bit transpose.
+ ;; To speed up the transpose, data is loaded in chunks of 32 bytes,
+ ;; interleaving data between lane X and lane X+4.
+ ;; This way, final shuffles between top half and bottom half
+ ;; of the matrix are avoided.
+ TRANSPOSE8_U64_LOAD8 W0, W1, W2, W3, W4, W5, W6, W7, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX
+
+ TRANSPOSE8_U64 W0, W1, W2, W3, W4, W5, W6, W7, TMP0, TMP1, TMP2, TMP3
+ ;; Load next 512 bytes (same lane pointers, offset IDX+SZ8) into W8..W15
+ TRANSPOSE8_U64_LOAD8 W8, W9, W10, W11, W12, W13, W14, W15, \
+ inp0, inp1, inp2, inp3, inp4, inp5, \
+ inp6, inp7, IDX+SZ8
+
+ TRANSPOSE8_U64 W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1, TMP2, TMP3
+
+ vmovdqa32 TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK] ; TMP2 = byte-flip mask for the vpshufb pass below
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition (aligned stores; rsp was aligned to 64 above, presumably _DIGEST_SAVE is a multiple of 64 too)
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 128 ; increment by message block length in bytes
+
+%assign I 0
+%rep 16
+;;; byte-swap the message words in each of W0..W15 with the flip mask (little endian to big endian)
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process the first SHA_ROUNDS_LESS_16 rounds (max-rounds - 16)
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+ ; PROCESS_LOOP_00_79 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ ; I = round number; J = W register used this round; K/L/M = J+1, J+9, J+14 (mod 16), presumably the FIPS 180-4 schedule inputs (macro not shown)
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep SHA_ROUNDS_LESS_16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+ ; Check if this is the last block (INP_SIZE reaches zero)
+ sub INP_SIZE, 1
+ je lastLoop
+
+ ; Not the last block: process last 16 rounds (no message scheduling here), then loop
+ ; The next block's message data is read at the top of lloop after the jump back
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+ ; Add old digest (the copy saved at the top of this pass)
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop ; go hash the next block
+
+align 32
+lastLoop:
+ ; Last block: process last 16 rounds (same sequence as the non-final path above)
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest (the copy saved at the top of this pass)
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ ; Write out digest to the same STATE rows it was loaded from
+ ;; results in A, B, C, D, E, F, G, H
+ vmovdqu32 [STATE + 0*SHA512_DIGEST_ROW_SIZE], A
+ vmovdqu32 [STATE + 1*SHA512_DIGEST_ROW_SIZE], B
+ vmovdqu32 [STATE + 2*SHA512_DIGEST_ROW_SIZE], C
+ vmovdqu32 [STATE + 3*SHA512_DIGEST_ROW_SIZE], D
+ vmovdqu32 [STATE + 4*SHA512_DIGEST_ROW_SIZE], E
+ vmovdqu32 [STATE + 5*SHA512_DIGEST_ROW_SIZE], F
+ vmovdqu32 [STATE + 6*SHA512_DIGEST_ROW_SIZE], G
+ vmovdqu32 [STATE + 7*SHA512_DIGEST_ROW_SIZE], H
+
+ ; update input pointers: advance each stored lane pointer by IDX (128 bytes per block processed)
+%assign I 0
+%rep 8
+ add [STATE + _data_ptr_sha512 + I*PTR_SZ], IDX
+%assign I (I+1)
+%endrep
+
+
+%ifdef SAFE_DATA
+ ;; Clear stack frame ((NUM_LANES*8)*64 bytes) starting at rsp+0; NOTE(review): confirm this range covers _DIGEST_SAVE and stops before the _RSP slot read below
+ vpxorq zmm0, zmm0
+%assign i 0
+%rep (NUM_LANES*8)
+ vmovdqa64 [rsp + i*64], zmm0
+%assign i (i+1)
+%endrep
+%endif
+ mov rsp, [rsp + _RSP] ; restore the caller's rsp saved at entry
+;hash_done:
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif