diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /src/spdk/intel-ipsec-mb/avx | |
parent | Initial commit. (diff) | |
download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2. upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx')
55 files changed, 20721 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm new file mode 100644 index 000000000..a4de936ff --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm @@ -0,0 +1,306 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +; routine to do AES128 CBC decrypt "by8" + +;; clobbers xmm0-15 + +%include "include/os.asm" + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xIV xmm8 +%define xkey0 xmm9 +%define xkey2 xmm10 +%define xkey4 xmm11 +%define xkey6 xmm12 +%define xkey8 xmm13 +%define xkey10 xmm14 +%define xkeytmp xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes rax +%endif + +%define tmp r10 + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + +%assign i 0 +%rep %%by + VMOVDQ CONCAT(xdata,i), [p_in + i*16] +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey2, [p_keys + 2*16] +%endif +%assign i 0 +%rep %%by + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + + vmovdqa xkeytmp, [p_keys + 1*16] +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey4, [p_keys + 4*16] +%endif +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey2 +%assign i (i+1) +%endrep + + vmovdqa xkeytmp, [p_keys + 3*16] +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey6, [p_keys + 6*16] +%endif +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey4 +%assign i (i+1) +%endrep + + vmovdqa xkeytmp, [p_keys + 5*16] +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), 
CONCAT(xdata,i), xkeytmp +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey8, [p_keys + 8*16] +%endif +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 +%assign i (i+1) +%endrep + + vmovdqa xkeytmp, [p_keys + 7*16] +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey10, [p_keys + 10*16] +%endif +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey8 +%assign i (i+1) +%endrep + + vmovdqa xkeytmp, [p_keys + 9*16] +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkey10 +%assign i (i+1) +%endrep + + vpxor xdata0, xdata0, xIV +%assign i 1 +%if (%%by > 1) +%rep (%%by - 1) + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV +%assign i (i+1) +%endrep +%endif + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text + +;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +MKGLOBAL(aes_cbc_dec_128_avx,function,internal) +aes_cbc_dec_128_avx: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + vmovdqu xIV, [p_IV] + + mov tmp, num_bytes + and tmp, 7*16 + jz mult_of_8_blks + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg gt4 + je eq4 + +lt4: + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq2: + do_aes_load 2 + add p_out, 2*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq3: + do_aes_load 3 + add p_out, 3*16 + and num_bytes, ~7*16 + 
jz do_return2 + jmp main_loop2 + +eq4: + do_aes_load 4 + add p_out, 4*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +gt4: + cmp tmp, 6*16 + jg eq7 + je eq6 + +eq5: + do_aes_load 5 + add p_out, 5*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq6: + do_aes_load 6 + add p_out, 6*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq7: + do_aes_load 7 + add p_out, 7*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +mult_of_8_blks: + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey2, [p_keys + 2*16] + vmovdqa xkey4, [p_keys + 4*16] + vmovdqa xkey6, [p_keys + 6*16] + vmovdqa xkey8, [p_keys + 8*16] + vmovdqa xkey10, [p_keys + 10*16] + +main_loop2: + ; num_bytes is a multiple of 8 and >0 + do_aes_noload 8 + add p_out, 8*16 + sub num_bytes, 8*16 + jne main_loop2 + +do_return2: +; Don't write back IV +; vmovdqu [p_IV], xIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm new file mode 100644 index 000000000..4d08bfde5 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2017-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; Routine to compute CBC-MAC. It is based on 128 bit CBC AES encrypt code. + +%define CBC_MAC 1 +%include "avx/aes_cbc_enc_128_x8.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm new file mode 100644 index 000000000..d46a29192 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm @@ -0,0 +1,606 @@ +;; +;; Copyright (c) 2012-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "include/memcpy.asm" +%include "include/const.inc" +%include "include/reg_sizes.asm" + +; routine to do AES128 CNTR enc/decrypt "by8" +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level +section .data +default rel + +%ifndef CNTR_CCM_AVX +MKGLOBAL(byteswap_const,data,internal) +MKGLOBAL(set_byte15,data,internal) +MKGLOBAL(ddq_add_1,data,internal) +MKGLOBAL(ddq_add_2,data,internal) +MKGLOBAL(ddq_add_3,data,internal) +MKGLOBAL(ddq_add_4,data,internal) +MKGLOBAL(ddq_add_5,data,internal) +MKGLOBAL(ddq_add_6,data,internal) +MKGLOBAL(ddq_add_7,data,internal) +MKGLOBAL(ddq_add_8,data,internal) +%endif ;; CNTR_CCM_AVX + +align 16 +byteswap_const: ;DDQ 0x000102030405060708090A0B0C0D0E0F + DQ 0x08090A0B0C0D0E0F, 0x0001020304050607 +set_byte15: DQ 0x0000000000000000, 0x0100000000000000 + +ddq_add_1: ;DDQ 0x00000000000000000000000000000001 + DQ 0x0000000000000001, 0x0000000000000000 +ddq_add_2: ;DDQ 0x00000000000000000000000000000002 + DQ 0x0000000000000002, 0x0000000000000000 +ddq_add_3: ;DDQ 0x00000000000000000000000000000003 + DQ 0x0000000000000003, 0x0000000000000000 +ddq_add_4: ;DDQ 0x00000000000000000000000000000004 + DQ 0x0000000000000004, 0x0000000000000000 +ddq_add_5: ;DDQ 0x00000000000000000000000000000005 + DQ 0x0000000000000005, 0x0000000000000000 +ddq_add_6: ;DDQ 0x00000000000000000000000000000006 + DQ 0x0000000000000006, 0x0000000000000000 +ddq_add_7: ;DDQ 0x00000000000000000000000000000007 + DQ 0x0000000000000007, 0x0000000000000000 +ddq_add_8: ;DDQ 0x00000000000000000000000000000008 + DQ 0x0000000000000008, 0x0000000000000000 + +section .text + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xpart xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xtmp xmm8 +%define xbyteswap xmm9 +%define xtmp2 xmm9 +%define xkey0 xmm10 +%define xtmp3 xmm10 +%define xkey3 xmm11 +%define xkey6 xmm12 +%define xkey9 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef CNTR_CCM_AVX +%ifdef LINUX +%define job rdi +%define p_in rsi +%define p_keys 
rdx +%define p_out rcx +%define num_bytes r8 +%define p_ivlen r9 +%else ;; LINUX +%define job rcx +%define p_in rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define p_ivlen rax +%endif ;; LINUX +%define p_IV r11 +%else ;; CNTR_CCM_AVX +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define num_bits r8 +%define p_ivlen r9 +%else ;; LINUX +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define num_bits r10 +%define p_ivlen qword [rsp + 8*6] +%endif ;; LINUX +%endif ;; CNTR_CCM_AVX + +%define tmp r11 +%define flags r11 + +%define r_bits r12 +%define tmp2 r13 +%define mask r14 + +%macro do_aes_load 2 + do_aes %1, %2, 1 +%endmacro + +%macro do_aes_noload 2 + do_aes %1, %2, 0 +%endmacro + +; do_aes num_in_par cntr_type load_keys +; This increments p_in, but not p_out +%macro do_aes 3 +%define %%by %1 +%define %%cntr_type %2 +%define %%load_keys %3 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + + vpshufb xdata0, xcounter, xbyteswap +%assign i 1 +%rep (%%by - 1) + vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)] + vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + vmovdqa xkeyA, [p_keys + 1*16] + + vpxor xdata0, xkey0 +%ifidn %%cntr_type, CNTR_BIT + vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] +%else + vpaddq xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] +%endif + +%assign i 1 +%rep (%%by - 1) + vpxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + vmovdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey3, [p_keys + 3*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + + vmovdqa xkeyB, [p_keys + 4*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), 
xkey3 ; key 3 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 4 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey6, [p_keys + 6*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey6 ; key 6 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 8*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey9, [p_keys + 9*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 8 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey9 ; key 9 +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + VMOVDQ xkeyB, [p_in + j*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA + vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%endif + +%ifidn %%cntr_type, CNTR_BIT + ;; check if this is the end of the message + mov tmp, num_bytes + and tmp, ~(%%by*16) + jnz %%skip_preserve + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%skip_preserve + +%assign idx (%%by - 1) + ;; Load output to get last partial byte + vmovdqu xtmp, [p_out + idx * 16] + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 
3 remaining bits -> mask = 00011111 + mov rcx, tmp + + vmovq xtmp2, mask + vpslldq xtmp2, 15 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + vpand xtmp, xtmp, xtmp2 + + ;; Clear all bits from the input that are not to be ciphered + vpandn CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx) + vpor CONCAT(xdata,idx), xtmp + +%%skip_preserve: +%endif + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Macro performing AES-CTR. +;; +%macro DO_CNTR 1 +%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM) + +%ifidn %%CNTR_TYPE, CCM + mov p_in, [job + _src] + add p_in, [job + _cipher_start_src_offset_in_bytes] + mov p_ivlen, [job + _iv_len_in_bytes] + mov num_bytes, [job + _msg_len_to_cipher_in_bytes] + mov p_keys, [job + _aes_enc_key_expanded] + mov p_out, [job + _dst] + + vmovdqa xbyteswap, [rel byteswap_const] + ;; Prepare IV ;; + + ;; Byte 0: flags with L' + ;; Calculate L' = 15 - Nonce length - 1 = 14 - IV length + mov flags, 14 + sub flags, p_ivlen + vmovd xcounter, DWORD(flags) + ;; Bytes 1 - 13: Nonce (7 - 13 bytes long) + + ;; Bytes 1 - 7 are always copied (first 7 bytes) + mov p_IV, [job + _iv] + vpinsrb xcounter, [p_IV], 1 + vpinsrw xcounter, [p_IV + 1], 1 + vpinsrd xcounter, [p_IV + 3], 1 + + cmp p_ivlen, 7 + je _finish_nonce_move + + cmp p_ivlen, 8 + je _iv_length_8 + cmp p_ivlen, 9 + je _iv_length_9 + cmp p_ivlen, 10 + je _iv_length_10 + cmp p_ivlen, 11 + je _iv_length_11 + cmp p_ivlen, 12 + je _iv_length_12 + + ;; Bytes 8 - 13 +_iv_length_13: + vpinsrb xcounter, [p_IV + 12], 13 +_iv_length_12: + vpinsrb xcounter, [p_IV + 11], 12 
+_iv_length_11: + vpinsrd xcounter, [p_IV + 7], 2 + jmp _finish_nonce_move +_iv_length_10: + vpinsrb xcounter, [p_IV + 9], 10 +_iv_length_9: + vpinsrb xcounter, [p_IV + 8], 9 +_iv_length_8: + vpinsrb xcounter, [p_IV + 7], 8 + +_finish_nonce_move: + ; last byte = 1 + vpor xcounter, [rel set_byte15] +%else ;; CNTR/CNTR_BIT +%ifndef LINUX + mov num_bytes, [rsp + 8*5] ; arg5 +%endif + +%ifidn %%CNTR_TYPE, CNTR_BIT + push r12 + push r13 + push r14 +%endif + + vmovdqa xbyteswap, [rel byteswap_const] +%ifidn %%CNTR_TYPE, CNTR + test p_ivlen, 16 + jnz %%iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + vpinsrq xcounter, [p_IV], 0 + vpinsrd xcounter, [p_IV + 8], 2 + vpinsrd xcounter, DWORD(tmp), 3 + +%else ;; CNTR_BIT + ; Read 16 byte IV: Nonce + 8-byte block counter (BE) + vmovdqu xcounter, [p_IV] +%endif +%endif ;; CNTR/CNTR_BIT/CCM +%%bswap_iv: + vpshufb xcounter, xbyteswap + + ;; calculate len + ;; convert bits to bytes (message length in bits for CNTR_BIT) +%ifidn %%CNTR_TYPE, CNTR_BIT + mov r_bits, num_bits + add num_bits, 7 + shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same + and r_bits, 7 ; Check if there are remainder bits (0-7) +%endif + + mov tmp, num_bytes + and tmp, 7*16 + jz %%chk ; no 1-7 leftover whole blocks (bits 4-6 of num_bytes are zero) + + ; 1 <= tmp/16 <= 7 (tmp is the byte count of 1-7 whole blocks) + cmp tmp, 4*16 + jg %%gt4 + je %%eq4 + +%%lt4: + cmp tmp, 2*16 + jg %%eq3 + je %%eq2 +%%eq1: + do_aes_load 1, %%CNTR_TYPE + add p_out, 1*16 + jmp %%chk + +%%eq2: + do_aes_load 2, %%CNTR_TYPE + add p_out, 2*16 + jmp %%chk + +%%eq3: + do_aes_load 3, %%CNTR_TYPE + add p_out, 3*16 + jmp %%chk + +%%eq4: + do_aes_load 4, %%CNTR_TYPE + add p_out, 4*16 + jmp %%chk + +%%gt4: + cmp tmp, 6*16 + jg %%eq7 + je %%eq6 + +%%eq5: + do_aes_load 5, %%CNTR_TYPE + add p_out, 5*16 + jmp %%chk + +%%eq6: + do_aes_load 6, %%CNTR_TYPE + add p_out, 6*16 + jmp %%chk + +%%eq7: + do_aes_load 7, %%CNTR_TYPE + add p_out, 7*16 + ; fall through to chk +%%chk: + and num_bytes, 
~(7*16) + jz %%do_return2 + + cmp num_bytes, 16 + jb %%last + + ; process multiples of 8 blocks + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey3, [p_keys + 3*16] + vmovdqa xkey6, [p_keys + 6*16] + vmovdqa xkey9, [p_keys + 9*16] + jmp %%main_loop2 + +align 32 +%%main_loop2: + ; num_bytes is a multiple of 8 blocks + partial bytes + do_aes_noload 8, %%CNTR_TYPE + add p_out, 8*16 + sub num_bytes, 8*16 + cmp num_bytes, 8*16 + jae %%main_loop2 + + ; Check if there is a partial block + or num_bytes, num_bytes + jnz %%last + +%%do_return2: +%ifidn %%CNTR_TYPE, CCM + mov rax, job + or dword [rax + _status], STS_COMPLETED_AES +%endif + +%ifidn %%CNTR_TYPE, CNTR_BIT + pop r14 + pop r13 + pop r12 +%endif + + ret + +%%last: + + ; load partial block into XMM register + simd_load_avx_15_1 xpart, p_in, num_bytes + +%%final_ctr_enc: + ; Encryption of a single partial block + vpshufb xcounter, xbyteswap + vmovdqa xdata0, xcounter + vpxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 9 + vaesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + vaesenclast xdata0, [p_keys + 16*i] + + ; xor keystream with the message (scratch) + vpxor xdata0, xpart + +%ifidn %%CNTR_TYPE, CNTR_BIT + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%store_output + + ;; Load output to get last partial byte + simd_load_avx_15_1 xtmp, p_out, num_bytes + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff +%ifidn r_bits, rcx +%error "r_bits cannot be mapped to rcx!" +%endif + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 
3 remaining bits -> mask = 00011111 + mov rcx, tmp + + vmovq xtmp2, mask + + ;; Get number of full bytes in last block of 16 bytes + mov tmp, num_bytes + dec tmp + XVPSLLB xtmp2, tmp, xtmp3, tmp2 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + vpand xtmp, xtmp, xtmp2 + + ;; Clear the bits from the input that are not to be ciphered + vpandn xdata0, xtmp2, xdata0 + vpor xdata0, xtmp +%endif + +%%store_output: + ; copy result into the output buffer + simd_store_avx_15 p_out, xdata0, num_bytes, tmp, rax + + jmp %%do_return2 + +%%iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + vmovdqu xcounter, [p_IV] + jmp %%bswap_iv +%endmacro + +align 32 +%ifdef CNTR_CCM_AVX +; JOB_AES_HMAC * aes_cntr_ccm_128_avx(JOB_AES_HMAC *job) +; arg 1 : job +MKGLOBAL(aes_cntr_ccm_128_avx,function,internal) +aes_cntr_ccm_128_avx: + DO_CNTR CCM +%else +;; aes_cntr_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, +;; UINT64 iv_len) +MKGLOBAL(aes_cntr_128_avx,function,internal) +aes_cntr_128_avx: + DO_CNTR CNTR + +;; aes_cntr_bit_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits, +;; UINT64 iv_len) +MKGLOBAL(aes_cntr_bit_128_avx,function,internal) +aes_cntr_bit_128_avx: + DO_CNTR CNTR_BIT +%endif ;; CNTR_CCM_AVX + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm new file mode 100644 index 000000000..1a4c11602 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_ccm_by8_avx.asm @@ -0,0 +1,32 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the 
above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%define CNTR_CCM_AVX +%ifndef AES_CNTR_CCM_128 +%define AES_CNTR_CCM_128 aes_cntr_ccm_128_avx +%endif +%include "avx/aes128_cntr_by8_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm new file mode 100644 index 000000000..9952c2552 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm @@ -0,0 +1,328 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +; routine to do AES192 CBC decrypt "by8" + +; XMM registers are clobbered. Saving/restoring must be done at a higher level +%include "include/os.asm" + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xIV xmm8 +%define xkey0 xmm9 +%define xkey3 xmm10 +%define xkey6 xmm11 +%define xkey9 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes rax +%endif + +%define tmp r10 + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + +; do_aes num_in_par load_keys +; %1 = number of blocks handled in parallel (callers use 1..8) +; %2 = 1 to also (re)load the round keys cached in xkey0/3/6/9/12; +; all other round keys are streamed through xkeyA/xkeyB on every use +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + +%assign i 0 +%rep %%by + VMOVDQ CONCAT(xdata,i), [p_in + i*16] +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 1*16] + +%assign i 0 +%rep %%by + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 2*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + vmovdqa xkey3, [p_keys + 3*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 4*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey3 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 5*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey6, [p_keys + 6*16] +%endif + +%assign i 0 +%rep %%by + 
vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 7*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 8*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey9, [p_keys + 9*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 10*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey9 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 11*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey12, [p_keys + 12*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkey12 +%assign i (i+1) +%endrep + + ; CBC chaining: block 0 is XORed with xIV, block i with ciphertext block i-1 + ; re-read from the input (p_in was already advanced, hence the - 16*%%by). + ; The last ciphertext block is left in xIV for the next invocation. + vpxor xdata0, xdata0, xIV +%assign i 1 +%if (%%by > 1) +%rep (%%by - 1) + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV +%assign i (i+1) +%endrep +%endif + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text + +;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +MKGLOBAL(aes_cbc_dec_192_avx,function,internal) +aes_cbc_dec_192_avx: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + vmovdqu xIV, [p_IV] + + ; tmp = (num_bytes & 7*16) selects the 1..7 leftover blocks decrypted first; + ; the rest of the input is then consumed 8 blocks per main_loop2 iteration + mov tmp, num_bytes + and tmp, 7*16 + jz mult_of_8_blks + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg gt4 + je eq4 + 
+lt4: + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq2: + do_aes_load 2 + add p_out, 2*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq3: + do_aes_load 3 + add p_out, 3*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq4: + do_aes_load 4 + add p_out, 4*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +gt4: + cmp tmp, 6*16 + jg eq7 + je eq6 + +eq5: + do_aes_load 5 + add p_out, 5*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq6: + do_aes_load 6 + add p_out, 6*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq7: + do_aes_load 7 + add p_out, 7*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +mult_of_8_blks: + ; No leftover blocks were processed, so the cached round keys are loaded here. + ; NOTE(review): num_bytes == 0 also arrives here and falls into main_loop2; + ; callers presumably guarantee a non-zero length - confirm + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey3, [p_keys + 3*16] + vmovdqa xkey6, [p_keys + 6*16] + vmovdqa xkey9, [p_keys + 9*16] + vmovdqa xkey12, [p_keys + 12*16] + +main_loop2: + ; num_bytes is a multiple of 8 and >0 + do_aes_noload 8 + add p_out, 8*16 + sub num_bytes, 8*16 + jne main_loop2 + +do_return2: +; Don't write back IV +; vmovdqu [p_IV], xIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm new file mode 100644 index 000000000..e926b4413 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm @@ -0,0 +1,504 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/memcpy.asm" +%include "include/const.inc" +%include "include/reg_sizes.asm" + +; routine to do AES192 CNTR enc/decrypt "by8" +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +extern byteswap_const +extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 +extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xpart xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xtmp xmm8 +%define xbyteswap xmm9 +%define xtmp2 xmm9 +%define xkey0 xmm10 +%define xtmp3 xmm10 +%define xkey4 xmm11 +%define xkey8 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define num_bits r8 +%define p_ivlen r9 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define num_bits r10 +%define p_ivlen qword [rsp + 8*6] +%endif + +%define tmp r11 + +%define r_bits r12 +%define tmp2 r13 +%define mask r14 + +%macro do_aes_load 2 + do_aes %1, %2, 1 +%endmacro + +%macro do_aes_noload 2 + do_aes %1, %2, 0 +%endmacro + +; do_aes num_in_par cntr_type load_keys +; %1 = number of counter blocks generated in parallel (callers use 1..8) +; %2 = counter mode; CNTR_BIT enables the partial-byte preserve logic below +; %3 = 1 to also (re)load the round keys cached in xkey0/4/8/12 +; This increments p_in, but not p_out +%macro do_aes 3 +%define %%by %1 +%define %%cntr_type %2 +%define %%load_keys %3 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + + vpshufb xdata0, xcounter, xbyteswap +%assign i 1 +%rep (%%by - 1) + vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)] + vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + vmovdqa xkeyA, [p_keys + 1*16] + + vpxor xdata0, xkey0 +%ifidn %%cntr_type, CNTR_BIT + vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] +%else + vpaddq xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] +%endif + +%assign i 1 +%rep (%%by - 1) + vpxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + vmovdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), 
CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 3*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + vmovdqa xkey4, [p_keys + 4*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 3 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 6*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 6 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey8, [p_keys + 8*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 9*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 9 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 11*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey12, [p_keys + 12*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 11 +%assign i (i+1) +%endrep + + +%assign i 0 +%rep %%by + vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; key 12 +%assign i (i+1) +%endrep + + + ; XOR the keystream with the input, two blocks per step (xkeyA/xkeyB are + ; reused as scratch); p_in was already advanced, hence the - 16*%%by +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + VMOVDQ xkeyB, [p_in + j*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA + vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + VMOVDQ xkeyA, 
[p_in + i*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%endif + +%ifidn %%cntr_type, CNTR_BIT + ;; check if this is the end of the message + mov tmp, num_bytes + and tmp, ~(%%by*16) + jnz %%skip_preserve + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%skip_preserve + +%assign idx (%%by - 1) + ;; Load output to get last partial byte + vmovdqu xtmp, [p_out + idx * 16] + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111 + mov rcx, tmp + + vmovq xtmp2, mask + vpslldq xtmp2, 15 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + vpand xtmp, xtmp, xtmp2 + + ;; Clear all bits from the input that are not to be ciphered + vpandn CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx) + vpor CONCAT(xdata,idx), xtmp + +%%skip_preserve: +%endif + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text +;; Macro performing AES-CTR. +;; +%macro DO_CNTR 1 +%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM) + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + +%ifidn %%CNTR_TYPE, CNTR_BIT + push r12 + push r13 + push r14 +%endif + + vmovdqa xbyteswap, [rel byteswap_const] +%ifidn %%CNTR_TYPE, CNTR + test p_ivlen, 16 + jnz %%iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. 
Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + vpinsrq xcounter, [p_IV], 0 + vpinsrd xcounter, [p_IV + 8], 2 + vpinsrd xcounter, DWORD(tmp), 3 + +%else ;; CNTR_BIT + ; Read 16 byte IV: Nonce + 8-byte block counter (BE) + vmovdqu xcounter, [p_IV] +%endif +%%bswap_iv: + vpshufb xcounter, xbyteswap + + ;; calculate len + ;; convert bits to bytes (message length in bits for CNTR_BIT) +%ifidn %%CNTR_TYPE, CNTR_BIT + mov r_bits, num_bits + add num_bits, 7 + shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same + and r_bits, 7 ; Check if there are remainder bits (0-7) +%endif + + mov tmp, num_bytes + and tmp, 7*16 + jz %%chk ; no leftover 16-byte blocks: only 8-block groups and/or a partial block + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg %%gt4 + je %%eq4 + +%%lt4: + cmp tmp, 2*16 + jg %%eq3 + je %%eq2 +%%eq1: + do_aes_load 1, %%CNTR_TYPE + add p_out, 1*16 + jmp %%chk + +%%eq2: + do_aes_load 2, %%CNTR_TYPE + add p_out, 2*16 + jmp %%chk + +%%eq3: + do_aes_load 3, %%CNTR_TYPE + add p_out, 3*16 + jmp %%chk + +%%eq4: + do_aes_load 4, %%CNTR_TYPE + add p_out, 4*16 + jmp %%chk + +%%gt4: + cmp tmp, 6*16 + jg %%eq7 + je %%eq6 + +%%eq5: + do_aes_load 5, %%CNTR_TYPE + add p_out, 5*16 + jmp %%chk + +%%eq6: + do_aes_load 6, %%CNTR_TYPE + add p_out, 6*16 + jmp %%chk + +%%eq7: + do_aes_load 7, %%CNTR_TYPE + add p_out, 7*16 + ; fall through to chk +%%chk: + ; drop the leftover blocks counted in bits 4-6; what remains is whole + ; 8-block groups plus any final partial block of fewer than 16 bytes + and num_bytes, ~(7*16) + jz %%do_return2 + + cmp num_bytes, 16 + jb %%last + + ; process multiples of 8 blocks + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey4, [p_keys + 4*16] + vmovdqa xkey8, [p_keys + 8*16] + vmovdqa xkey12, [p_keys + 12*16] + jmp %%main_loop2 + +align 32 +%%main_loop2: + ; num_bytes is a multiple of 8 blocks + partial bytes + do_aes_noload 8, %%CNTR_TYPE + add p_out, 8*16 + sub num_bytes, 8*16 + cmp num_bytes, 8*16 + jae %%main_loop2 + + ; Check if there is a partial block + or num_bytes, num_bytes + jnz %%last + +%%do_return2: +%ifidn %%CNTR_TYPE, CNTR_BIT + pop r14 + pop r13 + pop r12 +%endif + + ret + +%%last: + + ; 
load partial block into XMM register + simd_load_avx_15_1 xpart, p_in, num_bytes + +%%final_ctr_enc: + ; Encryption of a single partial block + vpshufb xcounter, xbyteswap + vmovdqa xdata0, xcounter + vpxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 11 + vaesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + vaesenclast xdata0, [p_keys + 16*i] + + ; xor keystream with the message (scratch) + vpxor xdata0, xpart + +%ifidn %%CNTR_TYPE, CNTR_BIT + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%store_output + + ;; Load output to get last partial byte + simd_load_avx_15_1 xtmp, p_out, num_bytes + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff +%ifidn r_bits, rcx +%error "r_bits cannot be mapped to rcx!" +%endif + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111 + mov rcx, tmp + + vmovq xtmp2, mask + + ;; Get number of full bytes in last block of 16 bytes + mov tmp, num_bytes + dec tmp + XVPSLLB xtmp2, tmp, xtmp3, tmp2 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + vpand xtmp, xtmp, xtmp2 + + ;; Clear the bits from the input that are not to be ciphered + vpandn xdata0, xtmp2, xdata0 + vpor xdata0, xtmp +%endif + +%%store_output: + ; copy result into the output buffer + simd_store_avx_15 p_out, xdata0, num_bytes, tmp, rax + + jmp %%do_return2 + +%%iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + vmovdqu xcounter, [p_IV] + jmp %%bswap_iv +%endmacro + +align 32 +%ifdef CNTR_CCM_AVX +; JOB_AES_HMAC * aes_cntr_ccm_192_avx(JOB_AES_HMAC *job) +; arg 1 : job +MKGLOBAL(aes_cntr_ccm_192_avx,function,internal) +aes_cntr_ccm_192_avx: + DO_CNTR CCM +%else +;; aes_cntr_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, +;; UINT64 iv_len) +MKGLOBAL(aes_cntr_192_avx,function,internal) +aes_cntr_192_avx: + DO_CNTR CNTR + 
+;; aes_cntr_bit_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits, +;; UINT64 iv_len) +MKGLOBAL(aes_cntr_bit_192_avx,function,internal) +aes_cntr_bit_192_avx: + DO_CNTR CNTR_BIT +%endif ;; CNTR_CCM_AVX + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm new file mode 100644 index 000000000..6a8f100ec --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm @@ -0,0 +1,344 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES256 CBC decrypt "by8" + +; XMM registers are clobbered. Saving/restoring must be done at a higher level +%include "include/os.asm" + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xIV xmm8 +%define xkey0 xmm9 +%define xkey3 xmm10 +%define xkey6 xmm11 +%define xkey9 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes rax +%endif + +%define tmp r10 + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + +; do_aes num_in_par load_keys +; %1 = number of blocks handled in parallel (callers use 1..8) +; %2 = 1 to also (re)load the round keys cached in xkey0/3/6/9/12; +; all other round keys are streamed through xkeyA/xkeyB on every use +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + +%assign i 0 +%rep %%by + VMOVDQ CONCAT(xdata,i), [p_in + i*16] +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 1*16] + +%assign i 0 +%rep %%by + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 2*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + + add p_in, 
16*%%by + +%if (%%load_keys) + vmovdqa xkey3, [p_keys + 3*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 4*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey3 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 5*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey6, [p_keys + 6*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 7*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 8*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey9, [p_keys + 9*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 10*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey9 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 11*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey12, [p_keys + 12*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 13*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey12 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 14*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + ; CBC chaining: block 0 is XORed with xIV, block i with ciphertext block i-1 + ; re-read from the input (p_in was already advanced, hence the - 16*%%by). + ; The last ciphertext block is left in xIV for the next invocation. + vpxor xdata0, xdata0, xIV +%assign i 1 +%if (%%by > 1) +%rep (%%by - 1) + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + vpxor 
CONCAT(xdata,i), CONCAT(xdata,i), xIV +%assign i (i+1) +%endrep +%endif + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text + +;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +MKGLOBAL(aes_cbc_dec_256_avx,function,internal) +aes_cbc_dec_256_avx: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + vmovdqu xIV, [p_IV] + + ; tmp = (num_bytes & 7*16) selects the 1..7 leftover blocks decrypted first; + ; the rest of the input is then consumed 8 blocks per main_loop2 iteration + mov tmp, num_bytes + and tmp, 7*16 + jz mult_of_8_blks + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg gt4 + je eq4 + +lt4: + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq2: + do_aes_load 2 + add p_out, 2*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq3: + do_aes_load 3 + add p_out, 3*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq4: + do_aes_load 4 + add p_out, 4*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +gt4: + cmp tmp, 6*16 + jg eq7 + je eq6 + +eq5: + do_aes_load 5 + add p_out, 5*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq6: + do_aes_load 6 + add p_out, 6*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq7: + do_aes_load 7 + add p_out, 7*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +mult_of_8_blks: + ; No leftover blocks were processed, so the cached round keys are loaded here. + ; NOTE(review): num_bytes == 0 also arrives here and falls into main_loop2; + ; callers presumably guarantee a non-zero length - confirm + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey3, [p_keys + 3*16] + vmovdqa xkey6, [p_keys + 6*16] + vmovdqa xkey9, [p_keys + 9*16] + vmovdqa xkey12, [p_keys + 12*16] + +main_loop2: + ; num_bytes is a multiple of 8 and >0 + do_aes_noload 8 + add p_out, 8*16 + sub num_bytes, 8*16 + jne main_loop2 + +do_return2: +; Don't write back IV +; vmovdqu [p_IV], xIV + + ret + +%ifdef LINUX +section 
.note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm new file mode 100644 index 000000000..e201339da --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm @@ -0,0 +1,516 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.asm" +%include "include/memcpy.asm" +%include "include/const.inc" +%include "include/reg_sizes.asm" + +; routine to do AES256 CNTR enc/decrypt "by8" +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +extern byteswap_const +extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 +extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xpart xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xtmp xmm8 +%define xbyteswap xmm9 +%define xtmp2 xmm9 +%define xkey0 xmm10 +%define xtmp3 xmm10 +%define xkey4 xmm11 +%define xkey8 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define num_bits r8 +%define p_ivlen r9 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define num_bits r10 +%define p_ivlen qword [rsp + 8*6] +%endif + +%define tmp r11 + +%define r_bits r12 +%define tmp2 r13 +%define mask r14 + +%macro do_aes_load 2 + do_aes %1, %2, 1 +%endmacro + +%macro do_aes_noload 2 + do_aes %1, %2, 0 +%endmacro + +; do_aes num_in_par cntr_type load_keys +; %1 = number of counter blocks generated in parallel (callers use 1..8) +; %2 = counter mode; CNTR_BIT enables the partial-byte preserve logic below +; %3 = 1 to also (re)load the round keys cached in xkey0/4/8/12 +; This increments p_in, but not p_out +%macro do_aes 3 +%define %%by %1 +%define %%cntr_type %2 +%define %%load_keys %3 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + + vpshufb xdata0, xcounter, xbyteswap +%assign i 1 +%rep (%%by - 1) + vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)] + vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + vmovdqa xkeyA, [p_keys + 1*16] + + vpxor xdata0, xkey0 +%ifidn %%cntr_type, CNTR_BIT + vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] +%else + vpaddq xcounter, xcounter, [rel 
CONCAT(ddq_add_,%%by)] +%endif + +%assign i 1 +%rep (%%by - 1) + vpxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + vmovdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 3*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + vmovdqa xkey4, [p_keys + 4*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 3 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 6*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 6 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey8, [p_keys + 8*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 9*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 9 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 11*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey12, [p_keys + 12*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 11 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 13*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; key 12 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 14*16] +%assign i 0 
+%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 13 +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 14 +%assign i (i+1) +%endrep + + ; XOR the keystream with the input, two blocks per step (xkeyA/xkeyB are + ; reused as scratch); p_in was already advanced, hence the - 16*%%by +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + VMOVDQ xkeyB, [p_in + j*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA + vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%endif + +%ifidn %%cntr_type, CNTR_BIT + ;; check if this is the end of the message + mov tmp, num_bytes + and tmp, ~(%%by*16) + jnz %%skip_preserve + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%skip_preserve + +%assign idx (%%by - 1) + ;; Load output to get last partial byte + vmovdqu xtmp, [p_out + idx * 16] + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 3 remaining bits -> mask = 00011111 + mov rcx, tmp + + vmovq xtmp2, mask + vpslldq xtmp2, 15 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + vpand xtmp, xtmp, xtmp2 + + ;; Clear all bits from the input that are not to be ciphered + vpandn CONCAT(xdata,idx), xtmp2, CONCAT(xdata,idx) + vpor CONCAT(xdata,idx), xtmp + +%%skip_preserve: +%endif + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text +;; Macro performing AES-CTR. 
+;; +%macro DO_CNTR 1 +%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT/CCM) + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + +%ifidn %%CNTR_TYPE, CNTR_BIT + push r12 + push r13 + push r14 +%endif + + vmovdqa xbyteswap, [rel byteswap_const] +%ifidn %%CNTR_TYPE, CNTR + test p_ivlen, 16 + jnz %%iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + vpinsrq xcounter, [p_IV], 0 + vpinsrd xcounter, [p_IV + 8], 2 + vpinsrd xcounter, DWORD(tmp), 3 + +%else ;; CNTR_BIT + ; Read 16 byte IV: Nonce + 8-byte block counter (BE) + vmovdqu xcounter, [p_IV] +%endif +%%bswap_iv: + vpshufb xcounter, xbyteswap + + ;; calculate len + ;; convert bits to bytes (message length in bits for CNTR_BIT) +%ifidn %%CNTR_TYPE, CNTR_BIT + mov r_bits, num_bits + add num_bits, 7 + shr num_bits, 3 ; "num_bits" and "num_bytes" registers are the same + and r_bits, 7 ; Check if there are remainder bits (0-7) +%endif + + mov tmp, num_bytes + and tmp, 7*16 + jz %%chk ; no leftover 16-byte blocks: only 8-block groups and/or a partial block + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg %%gt4 + je %%eq4 + +%%lt4: + cmp tmp, 2*16 + jg %%eq3 + je %%eq2 +%%eq1: + do_aes_load 1, %%CNTR_TYPE + add p_out, 1*16 + jmp %%chk + +%%eq2: + do_aes_load 2, %%CNTR_TYPE + add p_out, 2*16 + jmp %%chk + +%%eq3: + do_aes_load 3, %%CNTR_TYPE + add p_out, 3*16 + jmp %%chk + +%%eq4: + do_aes_load 4, %%CNTR_TYPE + add p_out, 4*16 + jmp %%chk + +%%gt4: + cmp tmp, 6*16 + jg %%eq7 + je %%eq6 + +%%eq5: + do_aes_load 5, %%CNTR_TYPE + add p_out, 5*16 + jmp %%chk + +%%eq6: + do_aes_load 6, %%CNTR_TYPE + add p_out, 6*16 + jmp %%chk + +%%eq7: + do_aes_load 7, %%CNTR_TYPE + add p_out, 7*16 + ; fall through to chk +%%chk: + ; drop the leftover blocks counted in bits 4-6; what remains is whole + ; 8-block groups plus any final partial block of fewer than 16 bytes + and num_bytes, ~(7*16) + jz %%do_return2 + + cmp num_bytes, 16 + jb %%last + + ; process multiples of 8 blocks + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey4, [p_keys + 4*16] + vmovdqa xkey8, [p_keys + 8*16] + vmovdqa xkey12, [p_keys + 12*16] + jmp %%main_loop2 + +align 
32 +%%main_loop2: + ; num_bytes is a multiple of 8 blocks + partial bytes + do_aes_noload 8, %%CNTR_TYPE + add p_out, 8*16 + sub num_bytes, 8*16 + cmp num_bytes, 8*16 + jae %%main_loop2 + + ; Check if there is a partial block + or num_bytes, num_bytes + jnz %%last + +%%do_return2: +%ifidn %%CNTR_TYPE, CNTR_BIT + pop r14 + pop r13 + pop r12 +%endif + + ret + +%%last: + + ; load partial block into XMM register + simd_load_avx_15_1 xpart, p_in, num_bytes + +%%final_ctr_enc: + ; Encryption of a single partial block + vpshufb xcounter, xbyteswap + vmovdqa xdata0, xcounter + vpxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 13 + vaesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + vaesenclast xdata0, [p_keys + 16*i] + + ; xor keystream with the message (scratch) + vpxor xdata0, xpart + +%ifidn %%CNTR_TYPE, CNTR_BIT + ;; Check if there is a partial byte + or r_bits, r_bits + jz %%store_output + + ;; Load output to get last partial byte + simd_load_avx_15_1 xtmp, p_out, num_bytes + + ;; Save RCX in temporary GP register + mov tmp, rcx + mov mask, 0xff +%ifidn r_bits, rcx +%error "r_bits cannot be mapped to rcx!" +%endif + mov cl, BYTE(r_bits) + shr mask, cl ;; e.g. 
3 remaining bits -> mask = 00011111 + mov rcx, tmp + + vmovq xtmp2, mask + + ;; Get number of full bytes in last block of 16 bytes + mov tmp, num_bytes + dec tmp + XVPSLLB xtmp2, tmp, xtmp3, tmp2 + ;; At this point, xtmp2 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; Clear all the bits that do not need to be preserved from the output + vpand xtmp, xtmp, xtmp2 + + ;; Clear the bits from the input that are not to be ciphered + vpandn xdata0, xtmp2, xdata0 + vpor xdata0, xtmp +%endif + +%%store_output: + ; copy result into the output buffer + simd_store_avx_15 p_out, xdata0, num_bytes, tmp, rax + + jmp %%do_return2 + +%%iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + vmovdqu xcounter, [p_IV] + jmp %%bswap_iv +%endmacro + +align 32 +%ifdef CNTR_CCM_AVX +; JOB_AES_HMAC * aes_cntr_ccm_256_avx(JOB_AES_HMAC *job) +; arg 1 : job +MKGLOBAL(aes_cntr_ccm_256_avx,function,internal) +aes_cntr_ccm_256_avx: + DO_CNTR CCM +%else +;; aes_cntr_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, +;; UINT64 iv_len) +MKGLOBAL(aes_cntr_256_avx,function,internal) +aes_cntr_256_avx: + DO_CNTR CNTR + +;; aes_cntr_bit_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bits, +;; UINT64 iv_len) +MKGLOBAL(aes_cntr_bit_256_avx,function,internal) +aes_cntr_bit_256_avx: + DO_CNTR CNTR_BIT +%endif ;; CNTR_CCM_AVX + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm new file mode 100644 index 000000000..745a8e4d4 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm @@ -0,0 +1,494 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above 
copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +;;; routine to do a 128 bit CBC AES encrypt and CBC MAC + +;; clobbers all registers except for ARG1 and rbp + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +%macro VPXOR2 2 + vpxor %1, %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_128_x8(AES_ARGS *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS structure +;; arg 2: LEN : len (in units of bytes) + +struc STACK +_gpr_save: resq 8 +_len: resq 1 +endstruc + +%define GPR_SAVE_AREA rsp + _gpr_save +%define LEN_AREA rsp + _len + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rcx +%define arg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 rdi +%define arg4 rsi +%endif + +%define ARG arg1 +%define LEN arg2 + +%define IDX rax +%define TMP rbx + +%define KEYS0 arg3 +%define KEYS1 arg4 +%define KEYS2 rbp +%define KEYS3 r8 +%define KEYS4 r9 +%define KEYS5 r10 +%define KEYS6 r11 +%define KEYS7 r12 + +%define IN0 r13 +%define IN2 r14 +%define IN4 r15 +%define IN6 LEN + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XDATA4 xmm4 +%define XDATA5 xmm5 +%define XDATA6 xmm6 +%define XDATA7 xmm7 + +%define XKEY0_3 xmm8 +%define XKEY1_4 xmm9 +%define XKEY2_5 xmm10 +%define XKEY3_6 xmm11 +%define XKEY4_7 xmm12 +%define XKEY5_8 xmm13 +%define XKEY6_9 xmm14 +%define XTMP xmm15 + +section .text +%ifdef CBC_MAC +MKGLOBAL(aes128_cbc_mac_x8,function,internal) +aes128_cbc_mac_x8: +%else +MKGLOBAL(aes_cbc_enc_128_x8,function,internal) +aes_cbc_enc_128_x8: +%endif + sub rsp, STACK_size + mov [GPR_SAVE_AREA + 8*0], rbp +%ifdef CBC_MAC + mov [GPR_SAVE_AREA + 8*1], rbx + mov [GPR_SAVE_AREA + 8*2], r12 + mov [GPR_SAVE_AREA + 8*3], r13 + mov 
[GPR_SAVE_AREA + 8*4], r14 + mov [GPR_SAVE_AREA + 8*5], r15 +%ifndef LINUX + mov [GPR_SAVE_AREA + 8*6], rsi + mov [GPR_SAVE_AREA + 8*7], rdi +%endif +%endif + + mov IDX, 16 + mov [LEN_AREA], LEN + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN4, [ARG + _aesarg_in + 8*4] + mov IN6, [ARG + _aesarg_in + 8*6] + + mov TMP, [ARG + _aesarg_in + 8*1] + VMOVDQ XDATA0, [IN0] ; load first block of plain text + VMOVDQ XDATA1, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VMOVDQ XDATA2, [IN2] ; load first block of plain text + VMOVDQ XDATA3, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VMOVDQ XDATA4, [IN4] ; load first block of plain text + VMOVDQ XDATA5, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*7] + VMOVDQ XDATA6, [IN6] ; load first block of plain text + VMOVDQ XDATA7, [TMP] ; load first block of plain text + + + VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV + VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV + VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV + VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + mov KEYS4, [ARG + _aesarg_keys + 8*4] + mov KEYS5, [ARG + _aesarg_keys + 8*5] + mov KEYS6, [ARG + _aesarg_keys + 8*6] + mov KEYS7, [ARG + _aesarg_keys + 8*7] + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. 
ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. 
ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. 
ENC + +%ifndef CBC_MAC + VMOVDQ [TMP], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP], XDATA7 ; write back ciphertext +%endif + cmp [LEN_AREA], IDX + je done + +main_loop: + mov TMP, [ARG + _aesarg_in + 8*1] + VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text + VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text + VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text + VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*7] + VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text + VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. 
ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. 
ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. 
ENC + +%ifndef CBC_MAC + ;; no ciphertext write back for CBC-MAC + VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext +%endif + add IDX, 16 + cmp [LEN_AREA], IDX + jne main_loop + +done: + ;; update IV for AES128-CBC / store digest for CBC-MAC + vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0 + vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1 + vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2 + vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3 + vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4 + vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5 + vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6 + vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7 + + ;; update IN and OUT + vmovd xmm0, [LEN_AREA] + vpshufd xmm0, xmm0, 0x44 + vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0] + vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1] + vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2] + vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3] + vmovdqa [ARG + _aesarg_in + 16*0], xmm1 + vmovdqa [ARG + _aesarg_in + 16*1], xmm2 + vmovdqa [ARG + _aesarg_in + 16*2], xmm3 + vmovdqa [ARG + _aesarg_in + 16*3], xmm4 +%ifndef CBC_MAC + vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0] + vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1] + vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2] + vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3] + vmovdqa [ARG + _aesarg_out + 16*0], xmm5 + vmovdqa [ARG + _aesarg_out + 16*1], xmm6 + vmovdqa [ARG + _aesarg_out + 16*2], xmm7 + 
vmovdqa [ARG + _aesarg_out + 16*3], xmm8 +%endif + + ;; XMMs are saved at a higher level + mov rbp, [GPR_SAVE_AREA + 8*0] +%ifdef CBC_MAC + mov rbx, [GPR_SAVE_AREA + 8*1] + mov r12, [GPR_SAVE_AREA + 8*2] + mov r13, [GPR_SAVE_AREA + 8*3] + mov r14, [GPR_SAVE_AREA + 8*4] + mov r15, [GPR_SAVE_AREA + 8*5] +%ifndef LINUX + mov rsi, [GPR_SAVE_AREA + 8*6] + mov rdi, [GPR_SAVE_AREA + 8*7] +%endif +%endif + + add rsp, STACK_size + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm new file mode 100644 index 000000000..e446f13c3 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm @@ -0,0 +1,501 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; routine to do a 192 bit CBC AES encrypt + +;; clobbers all registers except for ARG1 and rbp + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +%macro VPXOR2 2 + vpxor %1, %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_192_x8(AES_ARGS *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS structure +;; arg 2: LEN : len (in units of bytes) + +struc STACK +_gpr_save: resq 1 +_len: resq 1 +endstruc + +%define GPR_SAVE_AREA rsp + _gpr_save +%define LEN_AREA rsp + _len + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax +%define TMP rbx + +%define KEYS0 REG3 +%define KEYS1 REG4 +%define KEYS2 rbp +%define KEYS3 r8 +%define KEYS4 r9 +%define KEYS5 r10 +%define KEYS6 r11 +%define KEYS7 r12 + +%define IN0 r13 +%define IN2 r14 +%define IN4 r15 +%define IN6 LEN + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XDATA4 xmm4 +%define XDATA5 xmm5 +%define XDATA6 xmm6 +%define XDATA7 xmm7 + +%define XKEY0_3 xmm8 +%define XKEY1_4 xmm9 +%define XKEY2_5 xmm10 
+%define XKEY3_6 xmm11 +%define XKEY4_7 xmm12 +%define XKEY5_8 xmm13 +%define XKEY6_9 xmm14 +%define XTMP xmm15 + +section .text + +MKGLOBAL(aes_cbc_enc_192_x8,function,internal) +aes_cbc_enc_192_x8: + + sub rsp, STACK_size + mov [GPR_SAVE_AREA + 8*0], rbp + + mov IDX, 16 + mov [LEN_AREA], LEN + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN4, [ARG + _aesarg_in + 8*4] + mov IN6, [ARG + _aesarg_in + 8*6] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov TMP, [ARG + _aesarg_in + 8*1] + VMOVDQ XDATA0, [IN0] ; load first block of plain text + VMOVDQ XDATA1, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VMOVDQ XDATA2, [IN2] ; load first block of plain text + VMOVDQ XDATA3, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VMOVDQ XDATA4, [IN4] ; load first block of plain text + VMOVDQ XDATA5, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*7] + VMOVDQ XDATA6, [IN6] ; load first block of plain text + VMOVDQ XDATA7, [TMP] ; load first block of plain text + + + VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV + VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV + VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV + VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + mov KEYS4, [ARG + _aesarg_keys + 8*4] + mov KEYS5, [ARG + _aesarg_keys + 8*5] + mov KEYS6, [ARG + _aesarg_keys + 8*6] + mov KEYS7, [ARG + _aesarg_keys + 8*7] + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. 
ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + vaesenc XDATA2, XKEY2_5 ; 5. 
ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenc XDATA5, [KEYS5 + 16*10] ; 10. 
ENC + vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC + + vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC + vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC + vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC + vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC + + + vaesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC + vaesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC + vaesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC + vaesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC + vaesenclast XDATA4, [KEYS4 + 16*12] ; 12. ENC + vaesenclast XDATA5, [KEYS5 + 16*12] ; 12. ENC + vaesenclast XDATA6, [KEYS6 + 16*12] ; 12. ENC + vaesenclast XDATA7, [KEYS7 + 16*12] ; 12. ENC + + VMOVDQ [TMP], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP], XDATA7 ; write back ciphertext + + cmp [LEN_AREA], IDX + je done + +main_loop: + mov TMP, [ARG + _aesarg_in + 8*1] + VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text + VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text + VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text + VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text + mov 
TMP, [ARG + _aesarg_in + 8*7] + VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text + VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text + + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. 
ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC + + vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + vaesenc XDATA1, [KEYS1 + 16*11] ; 11. 
ENC + vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC + vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC + vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC + vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC + + vaesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC + vaesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC + vaesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC + vaesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC + vaesenclast XDATA4, [KEYS4 + 16*12] ; 12. ENC + vaesenclast XDATA5, [KEYS5 + 16*12] ; 12. ENC + vaesenclast XDATA6, [KEYS6 + 16*12] ; 12. ENC + vaesenclast XDATA7, [KEYS7 + 16*12] ; 12. ENC + + + VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext + + add IDX, 16 + cmp [LEN_AREA], IDX + jne main_loop + +done: + ;; update IV + vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0 + vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1 + vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2 + vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3 + vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4 + vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5 + vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6 + vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7 + + ;; update IN and OUT + vmovd xmm0, [LEN_AREA] + vpshufd xmm0, xmm0, 0x44 + vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0] + vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1] + vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2] + vpaddq 
xmm4, xmm0, [ARG + _aesarg_in + 16*3] + vmovdqa [ARG + _aesarg_in + 16*0], xmm1 + vmovdqa [ARG + _aesarg_in + 16*1], xmm2 + vmovdqa [ARG + _aesarg_in + 16*2], xmm3 + vmovdqa [ARG + _aesarg_in + 16*3], xmm4 + vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0] + vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1] + vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2] + vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3] + vmovdqa [ARG + _aesarg_out + 16*0], xmm5 + vmovdqa [ARG + _aesarg_out + 16*1], xmm6 + vmovdqa [ARG + _aesarg_out + 16*2], xmm7 + vmovdqa [ARG + _aesarg_out + 16*3], xmm8 + +;; XMMs are saved at a higher level + mov rbp, [GPR_SAVE_AREA + 8*0] + + add rsp, STACK_size + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm new file mode 100644 index 000000000..75cf285d9 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm @@ -0,0 +1,536 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; routine to do a 256 bit CBC AES encrypt +;;; Eight independent buffers (lanes) are encrypted in parallel, one 16-byte block +;;; per lane per iteration; each lane has its own in/out pointer, key schedule and IV. + +;; clobbers all registers except for ARG1 and rbp + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +;; two-operand XOR helper: %1 = %1 XOR %2 +%macro VPXOR2 2 + vpxor %1, %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_256_x8(AES_ARGS *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS structure +;; arg 2: LEN : len (in units of bytes) +;; The same len applies to all 8 lanes. +;; NOTE(review): the first block is processed unconditionally and the loop exits only +;; when the byte offset equals len, so len is presumably a non-zero multiple of 16 - +;; confirm with callers. + +;; stack frame: one slot for the saved rbp, one slot for the spilled len +struc STACK +_gpr_save: resq 1 +_len: resq 1 +endstruc + +%define GPR_SAVE_AREA rsp + _gpr_save +%define LEN_AREA rsp + _len + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax +%define TMP rbx + +;; KEYSn: pointer to lane n's expanded key schedule (KEYS2 lives in rbp, hence the rbp save) +%define KEYS0 REG3 +%define KEYS1 REG4 +%define KEYS2 rbp +%define KEYS3 r8 +%define KEYS4 r9 +%define KEYS5 r10 +%define KEYS6 r11 +%define KEYS7 r12 + +;; only the even lanes keep their input pointer in a register; odd lanes reload it through TMP +%define IN0 r13 +%define IN2 r14 +%define 
IN4 r15 +;; IN6 reuses the LEN register: len is spilled to LEN_AREA before IN6 is loaded +%define IN6 LEN + +;; XDATAn: current block of lane n (plain text going in, cipher text coming out) +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XDATA4 xmm4 +%define XDATA5 xmm5 +%define XDATA6 xmm6 +%define XDATA7 xmm7 + +;; XKEYn_r: round-r key of lane n, loaded into a register while the first block is +;; encrypted and reused by main_loop in place of a memory operand +%define XKEY0_3 xmm8 +%define XKEY1_4 xmm9 +%define XKEY2_5 xmm10 +%define XKEY3_6 xmm11 +%define XKEY4_7 xmm12 +%define XKEY5_8 xmm13 +%define XKEY6_9 xmm14 +%define XTMP xmm15 + +section .text +MKGLOBAL(aes_cbc_enc_256_x8,function,internal) +aes_cbc_enc_256_x8: + + sub rsp, STACK_size + mov [GPR_SAVE_AREA + 8*0], rbp ; rbp is used as KEYS2 below + + mov IDX, 16 ; byte offset of the next block; block 0 is handled before main_loop + mov [LEN_AREA], LEN ; spill len: its register is about to become IN6 + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN4, [ARG + _aesarg_in + 8*4] + mov IN6, [ARG + _aesarg_in + 8*6] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov TMP, [ARG + _aesarg_in + 8*1] + VMOVDQ XDATA0, [IN0] ; load first block of plain text + VMOVDQ XDATA1, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VMOVDQ XDATA2, [IN2] ; load first block of plain text + VMOVDQ XDATA3, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VMOVDQ XDATA4, [IN4] ; load first block of plain text + VMOVDQ XDATA5, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*7] + VMOVDQ XDATA6, [IN6] ; load first block of plain text + VMOVDQ XDATA7, [TMP] ; load first block of plain text + + + VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV + VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV + VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV + VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov 
KEYS3, [ARG + _aesarg_keys + 8*3] + mov KEYS4, [ARG + _aesarg_keys + 8*4] + mov KEYS5, [ARG + _aesarg_keys + 8*5] + mov KEYS6, [ARG + _aesarg_keys + 8*6] + mov KEYS7, [ARG + _aesarg_keys + 8*7] + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. 
ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenc XDATA0, [KEYS0 + 16*10] ; 10. 
ENC + vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC + + vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC + vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC + vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC + vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC + + + vaesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC + vaesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC + vaesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC + vaesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC + vaesenc XDATA4, [KEYS4 + 16*12] ; 12. ENC + vaesenc XDATA5, [KEYS5 + 16*12] ; 12. ENC + vaesenc XDATA6, [KEYS6 + 16*12] ; 12. ENC + vaesenc XDATA7, [KEYS7 + 16*12] ; 12. ENC + + vaesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC + vaesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC + vaesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC + vaesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC + vaesenc XDATA4, [KEYS4 + 16*13] ; 13. ENC + vaesenc XDATA5, [KEYS5 + 16*13] ; 13. ENC + vaesenc XDATA6, [KEYS6 + 16*13] ; 13. ENC + vaesenc XDATA7, [KEYS7 + 16*13] ; 13. ENC + + vaesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC + vaesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC + vaesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC + vaesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC + vaesenclast XDATA4, [KEYS4 + 16*14] ; 14. ENC + vaesenclast XDATA5, [KEYS5 + 16*14] ; 14. ENC + vaesenclast XDATA6, [KEYS6 + 16*14] ; 14. ENC + vaesenclast XDATA7, [KEYS7 + 16*14] ; 14. 
ENC + + VMOVDQ [TMP], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP], XDATA7 ; write back ciphertext + + cmp [LEN_AREA], IDX ; IDX is 16 here: was the first block the only one? + je done + +main_loop: + ;; XDATAn still holds lane n's previous cipher text block, so XORing the next plain + ;; text block into it performs the CBC chaining (there is no separate load) + mov TMP, [ARG + _aesarg_in + 8*1] + VPXOR2 XDATA0, [IN0 + IDX] ; next plain text XOR previous cipher text + VPXOR2 XDATA1, [TMP + IDX] ; next plain text XOR previous cipher text + mov TMP, [ARG + _aesarg_in + 8*3] + VPXOR2 XDATA2, [IN2 + IDX] ; next plain text XOR previous cipher text + VPXOR2 XDATA3, [TMP + IDX] ; next plain text XOR previous cipher text + mov TMP, [ARG + _aesarg_in + 8*5] + VPXOR2 XDATA4, [IN4 + IDX] ; next plain text XOR previous cipher text + VPXOR2 XDATA5, [TMP + IDX] ; next plain text XOR previous cipher text + mov TMP, [ARG + _aesarg_in + 8*7] + VPXOR2 XDATA6, [IN6 + IDX] ; next plain text XOR previous cipher text + VPXOR2 XDATA7, [TMP + IDX] ; next plain text XOR previous cipher text + + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. 
ENC + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. 
ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC + + vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC + vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC + vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC + vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC + + vaesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC + vaesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC + vaesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC + vaesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC + vaesenc XDATA4, [KEYS4 + 16*12] ; 12. ENC + vaesenc XDATA5, [KEYS5 + 16*12] ; 12. ENC + vaesenc XDATA6, [KEYS6 + 16*12] ; 12. ENC + vaesenc XDATA7, [KEYS7 + 16*12] ; 12. ENC + + vaesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC + vaesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC + vaesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC + vaesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC + vaesenc XDATA4, [KEYS4 + 16*13] ; 13. 
ENC + vaesenc XDATA5, [KEYS5 + 16*13] ; 13. ENC + vaesenc XDATA6, [KEYS6 + 16*13] ; 13. ENC + vaesenc XDATA7, [KEYS7 + 16*13] ; 13. ENC + + vaesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC + vaesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC + vaesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC + vaesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC + vaesenclast XDATA4, [KEYS4 + 16*14] ; 14. ENC + vaesenclast XDATA5, [KEYS5 + 16*14] ; 14. ENC + vaesenclast XDATA6, [KEYS6 + 16*14] ; 14. ENC + vaesenclast XDATA7, [KEYS7 + 16*14] ; 14. ENC + + + VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext + + add IDX, 16 + cmp [LEN_AREA], IDX + jne main_loop ; loop until the byte offset equals len + +done: + ;; update IV: the last cipher text block of each lane is stored as that lane's IV + vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0 + vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1 + vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2 + vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3 + vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4 + vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5 + vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6 + vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7 + + ;; update IN and OUT: advance every in/out pointer by len, two 64-bit pointers per vpaddq + ;; (the XDATA registers are free for reuse now that the IVs have been stored) + vmovd xmm0, [LEN_AREA] ; NOTE(review): reads only the low 32 bits of the stored 64-bit len + vpshufd xmm0, xmm0, 0x44 ; copy the low qword to both halves: xmm0 = {len, len} + vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0] + vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1] + vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2] + vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3] + vmovdqa [ARG + _aesarg_in + 16*0], xmm1 + vmovdqa [ARG + _aesarg_in + 16*1], xmm2 + vmovdqa 
[ARG + _aesarg_in + 16*2], xmm3 + vmovdqa [ARG + _aesarg_in + 16*3], xmm4 + vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0] + vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1] + vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2] + vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3] + vmovdqa [ARG + _aesarg_out + 16*0], xmm5 + vmovdqa [ARG + _aesarg_out + 16*1], xmm6 + vmovdqa [ARG + _aesarg_out + 16*2], xmm7 + vmovdqa [ARG + _aesarg_out + 16*3], xmm8 + +;; XMMs are saved at a higher level + mov rbp, [GPR_SAVE_AREA + 8*0] + + add rsp, STACK_size + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm new file mode 100644 index 000000000..34d03bb99 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm @@ -0,0 +1,165 @@ +;; +;; Copyright (c) 2018-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/memcpy.asm" +%include "include/clear_regs.asm" + +;;; Routine to do 128 bit CFB AES encrypt/decrypt operations on one block only. +;;; It processes only one buffer at a time. +;;; It is designed to manage partial blocks of DOCSIS 3.1 SEC BPI + +;; In System V AMD64 ABI +;; callee saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RAX R9 R10 R11 +;; Windows preserves: RBX RCX RDX RBP RSI RDI R8 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RAX R9 R10 +;; Linux preserves: RBX RCX RDX RBP RSI RDI R8 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0, xmm1 (XDATA, XIN) +;; + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%define arg5 r8 +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +;; Windows passes the 5th argument on the stack: at function entry it sits above the +;; return address and the four 8-byte register home slots +%define arg5 [rsp + 5*8] +%endif + +%define OUT arg1 +%define IN arg2 +%define IV arg3 +%define KEYS arg4 +%ifdef LINUX +%define LEN arg5 +%else +;; LEN2 is the stack slot holding len; the entry code copies it into r11 (LEN) +%define LEN2 arg5 +%define LEN r11 +%endif + +%define TMP0 rax +%define TMP1 r10 + +%define XDATA xmm0 +%define XIN xmm1 + +section .text + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cfb_128_one(void *out, void *in, void *iv, void *keys, uint64_t len) +;; (exported below as aes_cfb_128_one_avx, aes_cfb_128_one_avx2 and aes_cfb_128_one_avx512) +;; arg 1: OUT : addr to put clear/cipher text out +;; arg 2: IN : addr to take cipher/clear text from +;; arg 3: IV : initialization vector +;; arg 4: KEYS: pointer to expanded keys structure (16 byte aligned) +;; arg 5: LEN: length of the text to encrypt/decrypt (valid range is 0 to 16) +;; +;; AES CFB128 one block encrypt/decrypt implementation. +;; The function doesn't update IV. The result of operation can be found in OUT. +;; Both directions use the same computation: OUT = AES-128-encrypt(KEYS, IV) XOR IN. +;; +;; It is primarily designed to process partial block of +;; DOCSIS 3.1 AES Packet PDU Encryption (I.10) +;; +;; It processes up to one block only (up to 16 bytes). +;; +;; It makes sure not to read more than LEN bytes from IN and +;; not to store more than LEN bytes to OUT. +MKGLOBAL(aes_cfb_128_one_avx,function,) +MKGLOBAL(aes_cfb_128_one_avx2,function,) +MKGLOBAL(aes_cfb_128_one_avx512,function,) +align 32 +aes_cfb_128_one_avx: +aes_cfb_128_one_avx2: +aes_cfb_128_one_avx512: +%ifndef LINUX + mov LEN, LEN2 ; Windows: fetch len from its stack slot into a register +%endif +%ifdef SAFE_PARAM + ;; NULL IV or KEYS: return without doing anything. + ;; IN and OUT are only checked when LEN is non-zero. + cmp IV, 0 + jz exit_cfb + + cmp KEYS, 0 + jz exit_cfb + + cmp LEN, 0 + jz skip_in_out_check + + cmp OUT, 0 + jz exit_cfb + + cmp IN, 0 + jz exit_cfb + +skip_in_out_check: +%endif + ;; macro from include/memcpy.asm; presumably loads LEN (<= 16) bytes from IN into XIN - confirm there + simd_load_avx_16 XIN, IN, LEN + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu XDATA, [IV] ; IV (or next to last block) + vpxor XDATA, XDATA, [KEYS + 16*0] ; 0. ARK + vaesenc XDATA, XDATA, [KEYS + 16*1] ; 1. ENC + vaesenc XDATA, XDATA, [KEYS + 16*2] ; 2. ENC + vaesenc XDATA, XDATA, [KEYS + 16*3] ; 3. ENC + vaesenc XDATA, XDATA, [KEYS + 16*4] ; 4. ENC + vaesenc XDATA, XDATA, [KEYS + 16*5] ; 5. ENC + vaesenc XDATA, XDATA, [KEYS + 16*6] ; 6. ENC + vaesenc XDATA, XDATA, [KEYS + 16*7] ; 7. ENC + vaesenc XDATA, XDATA, [KEYS + 16*8] ; 8. ENC + vaesenc XDATA, XDATA, [KEYS + 16*9] ; 9. ENC + vaesenclast XDATA, XDATA, [KEYS + 16*10] ; 10. 
ENC + + vpxor XDATA, XIN ; plaintext/ciphertext XOR block cipher encryption + + simd_store_avx OUT, XDATA, LEN, TMP0, TMP1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%ifdef SAFE_DATA + ;; XDATA and XIN are the only scratch SIMD registers used + clear_xmms_avx XDATA, XIN + clear_scratch_gps_asm +%endif +exit_cfb: + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm new file mode 100644 index 000000000..d71bd8c46 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_ecb_by4_avx.asm @@ -0,0 +1,654 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES ECB encrypt/decrypt on 16n bytes doing AES by 4 + +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +; void aes_ecb_x_y_avx(void *in, +; UINT128 keys[], +; void *out, +; UINT64 len_bytes); +; +; x = direction (enc/dec) +; y = key size (128/192/256) +; arg 1: IN: pointer to input (cipher text) +; arg 2: KEYS: pointer to keys +; arg 3: OUT: pointer to output (plain text) +; arg 4: LEN: length in bytes (multiple of 16) +; + +%include "include/os.asm" + +%ifndef AES_ECB_ENC_128 +%define AES_ECB_ENC_128 aes_ecb_enc_128_avx +%define AES_ECB_ENC_192 aes_ecb_enc_192_avx +%define AES_ECB_ENC_256 aes_ecb_enc_256_avx +%define AES_ECB_DEC_128 aes_ecb_dec_128_avx +%define AES_ECB_DEC_192 aes_ecb_dec_192_avx +%define AES_ECB_DEC_256 aes_ecb_dec_256_avx +%endif + +%ifdef LINUX +%define IN rdi +%define KEYS rsi +%define OUT rdx +%define LEN rcx +%else +%define IN rcx +%define KEYS rdx +%define OUT r8 +%define LEN r9 +%endif + +%define IDX rax +%define TMP IDX +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XKEY0 xmm4 +%define XKEY2 xmm5 +%define XKEY4 xmm6 +%define XKEY6 xmm7 +%define XKEY10 xmm8 +%define XKEY_A xmm9 +%define XKEY_B xmm10 + +section .text + +%macro AES_ECB 2 +%define %%NROUNDS %1 ; [in] Number of AES rounds, numerical value +%define %%DIR %2 ; [in] Direction (encrypt/decrypt) + +%ifidn %%DIR, ENC +%define AES vaesenc +%define AES_LAST 
vaesenclast +%else ; DIR = DEC +%define AES vaesdec +%define AES_LAST vaesdeclast +%endif + mov TMP, LEN + and TMP, 3*16 + jz %%initial_4 + cmp TMP, 2*16 + jb %%initial_1 + ja %%initial_3 + +%%initial_2: + ; load plain/cipher text + vmovdqu XDATA0, [IN + 0*16] + vmovdqu XDATA1, [IN + 1*16] + + vmovdqa XKEY0, [KEYS + 0*16] + + vpxor XDATA0, XKEY0 ; 0. ARK + vpxor XDATA1, XKEY0 + + vmovdqa XKEY2, [KEYS + 2*16] + + AES XDATA0, [KEYS + 1*16] ; 1. ENC + AES XDATA1, [KEYS + 1*16] + + mov IDX, 2*16 + + AES XDATA0, XKEY2 ; 2. ENC + AES XDATA1, XKEY2 + + vmovdqa XKEY4, [KEYS + 4*16] + + AES XDATA0, [KEYS + 3*16] ; 3. ENC + AES XDATA1, [KEYS + 3*16] + + AES XDATA0, XKEY4 ; 4. ENC + AES XDATA1, XKEY4 + + vmovdqa XKEY6, [KEYS + 6*16] + + AES XDATA0, [KEYS + 5*16] ; 5. ENC + AES XDATA1, [KEYS + 5*16] + + AES XDATA0, XKEY6 ; 6. ENC + AES XDATA1, XKEY6 + + vmovdqa XKEY_B, [KEYS + 8*16] + + AES XDATA0, [KEYS + 7*16] ; 7. ENC + AES XDATA1, [KEYS + 7*16] + + AES XDATA0, XKEY_B ; 8. ENC + AES XDATA1, XKEY_B + + vmovdqa XKEY10, [KEYS + 10*16] + + AES XDATA0, [KEYS + 9*16] ; 9. ENC + AES XDATA1, [KEYS + 9*16] + +%if %%NROUNDS >= 12 + AES XDATA0, XKEY10 ; 10. ENC + AES XDATA1, XKEY10 + + AES XDATA0, [KEYS + 11*16] ; 11. ENC + AES XDATA1, [KEYS + 11*16] +%endif + +%if %%NROUNDS == 14 + AES XDATA0, [KEYS + 12*16] ; 12. ENC + AES XDATA1, [KEYS + 12*16] + + AES XDATA0, [KEYS + 13*16] ; 13. ENC + AES XDATA1, [KEYS + 13*16] +%endif + +%if %%NROUNDS == 10 + AES_LAST XDATA0, XKEY10 ; 10. ENC + AES_LAST XDATA1, XKEY10 +%elif %%NROUNDS == 12 + AES_LAST XDATA0, [KEYS + 12*16] ; 12. ENC + AES_LAST XDATA1, [KEYS + 12*16] +%else + AES_LAST XDATA0, [KEYS + 14*16] ; 14. ENC + AES_LAST XDATA1, [KEYS + 14*16] +%endif + vmovdqu [OUT + 0*16], XDATA0 + vmovdqu [OUT + 1*16], XDATA1 + + cmp LEN, 2*16 + je %%done + jmp %%main_loop + + + align 16 +%%initial_1: + ; load plain/cipher text + vmovdqu XDATA0, [IN + 0*16] + + vmovdqa XKEY0, [KEYS + 0*16] + + vpxor XDATA0, XKEY0 ; 0. 
ARK + + vmovdqa XKEY2, [KEYS + 2*16] + + AES XDATA0, [KEYS + 1*16] ; 1. ENC + + mov IDX, 1*16 + + AES XDATA0, XKEY2 ; 2. ENC + + vmovdqa XKEY4, [KEYS + 4*16] + + AES XDATA0, [KEYS + 3*16] ; 3. ENC + + AES XDATA0, XKEY4 ; 4. ENC + + vmovdqa XKEY6, [KEYS + 6*16] + + AES XDATA0, [KEYS + 5*16] ; 5. ENC + + AES XDATA0, XKEY6 ; 6. ENC + + vmovdqa XKEY_B, [KEYS + 8*16] + + AES XDATA0, [KEYS + 7*16] ; 7. ENC + + AES XDATA0, XKEY_B ; 8. ENC + + vmovdqa XKEY10, [KEYS + 10*16] + + AES XDATA0, [KEYS + 9*16] ; 9. ENC + +%if %%NROUNDS >= 12 + AES XDATA0, XKEY10 ; 10. ENC + + AES XDATA0, [KEYS + 11*16] ; 11. ENC +%endif + +%if %%NROUNDS == 14 + AES XDATA0, [KEYS + 12*16] ; 12. ENC + + AES XDATA0, [KEYS + 13*16] ; 13. ENC +%endif + +%if %%NROUNDS == 10 + + AES_LAST XDATA0, XKEY10 ; 10. ENC +%elif %%NROUNDS == 12 + AES_LAST XDATA0, [KEYS + 12*16] ; 12. ENC +%else + AES_LAST XDATA0, [KEYS + 14*16] ; 14. ENC +%endif + + vmovdqu [OUT + 0*16], XDATA0 + + cmp LEN, 1*16 + je %%done + jmp %%main_loop + + +%%initial_3: + ; load plain/cipher text + vmovdqu XDATA0, [IN + 0*16] + vmovdqu XDATA1, [IN + 1*16] + vmovdqu XDATA2, [IN + 2*16] + + vmovdqa XKEY0, [KEYS + 0*16] + + vmovdqa XKEY_A, [KEYS + 1*16] + + vpxor XDATA0, XKEY0 ; 0. ARK + vpxor XDATA1, XKEY0 + vpxor XDATA2, XKEY0 + + vmovdqa XKEY2, [KEYS + 2*16] + + AES XDATA0, XKEY_A ; 1. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + + vmovdqa XKEY_A, [KEYS + 3*16] + mov IDX, 3*16 + + AES XDATA0, XKEY2 ; 2. ENC + AES XDATA1, XKEY2 + AES XDATA2, XKEY2 + + vmovdqa XKEY4, [KEYS + 4*16] + + AES XDATA0, XKEY_A ; 3. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + + vmovdqa XKEY_A, [KEYS + 5*16] + + AES XDATA0, XKEY4 ; 4. ENC + AES XDATA1, XKEY4 + AES XDATA2, XKEY4 + + vmovdqa XKEY6, [KEYS + 6*16] + + AES XDATA0, XKEY_A ; 5. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + + vmovdqa XKEY_A, [KEYS + 7*16] + + AES XDATA0, XKEY6 ; 6. ENC + AES XDATA1, XKEY6 + AES XDATA2, XKEY6 + + vmovdqa XKEY_B, [KEYS + 8*16] + + AES XDATA0, XKEY_A ; 7. 
ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + + vmovdqa XKEY_A, [KEYS + 9*16] + + AES XDATA0, XKEY_B ; 8. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + + vmovdqa XKEY_B, [KEYS + 10*16] + + AES XDATA0, XKEY_A ; 9. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + +%if %%NROUNDS >= 12 + vmovdqa XKEY_A, [KEYS + 11*16] + + AES XDATA0, XKEY_B ; 10. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + + vmovdqa XKEY_B, [KEYS + 12*16] + + AES XDATA0, XKEY_A ; 11. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + +%endif + +%if %%NROUNDS == 14 + vmovdqa XKEY_A, [KEYS + 13*16] + + AES XDATA0, XKEY_B ; 12. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + + vmovdqa XKEY_B, [KEYS + 14*16] + + AES XDATA0, XKEY_A ; 13. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A +%endif + + AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size) + AES_LAST XDATA1, XKEY_B + AES_LAST XDATA2, XKEY_B + + vmovdqu [OUT + 0*16], XDATA0 + vmovdqu [OUT + 1*16], XDATA1 + vmovdqu [OUT + 2*16], XDATA2 + + cmp LEN, 3*16 + je %%done + jmp %%main_loop + + + align 16 +%%initial_4: + ; load plain/cipher text + vmovdqu XDATA0, [IN + 0*16] + vmovdqu XDATA1, [IN + 1*16] + vmovdqu XDATA2, [IN + 2*16] + vmovdqu XDATA3, [IN + 3*16] + + vmovdqa XKEY0, [KEYS + 0*16] + + vmovdqa XKEY_A, [KEYS + 1*16] + + vpxor XDATA0, XKEY0 ; 0. ARK + vpxor XDATA1, XKEY0 + vpxor XDATA2, XKEY0 + vpxor XDATA3, XKEY0 + + vmovdqa XKEY2, [KEYS + 2*16] + + AES XDATA0, XKEY_A ; 1. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + vmovdqa XKEY_A, [KEYS + 3*16] + + mov IDX, 4*16 + + AES XDATA0, XKEY2 ; 2. ENC + AES XDATA1, XKEY2 + AES XDATA2, XKEY2 + AES XDATA3, XKEY2 + + vmovdqa XKEY4, [KEYS + 4*16] + + AES XDATA0, XKEY_A ; 3. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + vmovdqa XKEY_A, [KEYS + 5*16] + + AES XDATA0, XKEY4 ; 4. ENC + AES XDATA1, XKEY4 + AES XDATA2, XKEY4 + AES XDATA3, XKEY4 + + vmovdqa XKEY6, [KEYS + 6*16] + + AES XDATA0, XKEY_A ; 5. 
ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + vmovdqa XKEY_A, [KEYS + 7*16] + + AES XDATA0, XKEY6 ; 6. ENC + AES XDATA1, XKEY6 + AES XDATA2, XKEY6 + AES XDATA3, XKEY6 + + vmovdqa XKEY_B, [KEYS + 8*16] + + AES XDATA0, XKEY_A ; 7. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + vmovdqa XKEY_A, [KEYS + 9*16] + + AES XDATA0, XKEY_B ; 8. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + vmovdqa XKEY_B, [KEYS + 10*16] + + AES XDATA0, XKEY_A ; 9. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + +%if %%NROUNDS >= 12 + vmovdqa XKEY_A, [KEYS + 11*16] + + AES XDATA0, XKEY_B ; 10. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + vmovdqa XKEY_B, [KEYS + 12*16] + + AES XDATA0, XKEY_A ; 11. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A +%endif + +%if %%NROUNDS == 14 + vmovdqa XKEY_A, [KEYS + 13*16] + + AES XDATA0, XKEY_B ; 12. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + vmovdqa XKEY_B, [KEYS + 14*16] + + AES XDATA0, XKEY_A ; 13. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A +%endif + + AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size) + AES_LAST XDATA1, XKEY_B + AES_LAST XDATA2, XKEY_B + AES_LAST XDATA3, XKEY_B + + vmovdqu [OUT + 0*16], XDATA0 + vmovdqu [OUT + 1*16], XDATA1 + vmovdqu [OUT + 2*16], XDATA2 + vmovdqu [OUT + 3*16], XDATA3 + + cmp LEN, 4*16 + jz %%done + jmp %%main_loop + + align 16 +%%main_loop: + ; load plain/cipher text + vmovdqu XDATA0, [IN + IDX + 0*16] + vmovdqu XDATA1, [IN + IDX + 1*16] + vmovdqu XDATA2, [IN + IDX + 2*16] + vmovdqu XDATA3, [IN + IDX + 3*16] + + vmovdqa XKEY_A, [KEYS + 1*16] + + vpxor XDATA0, XKEY0 ; 0. ARK + vpxor XDATA1, XKEY0 + vpxor XDATA2, XKEY0 + vpxor XDATA3, XKEY0 + + add IDX, 4*16 + + AES XDATA0, XKEY_A ; 1. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + vmovdqa XKEY_A, [KEYS + 3*16] + + AES XDATA0, XKEY2 ; 2. 
ENC + AES XDATA1, XKEY2 + AES XDATA2, XKEY2 + AES XDATA3, XKEY2 + + AES XDATA0, XKEY_A ; 3. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + vmovdqa XKEY_A, [KEYS + 5*16] + + AES XDATA0, XKEY4 ; 4. ENC + AES XDATA1, XKEY4 + AES XDATA2, XKEY4 + AES XDATA3, XKEY4 + + AES XDATA0, XKEY_A ; 5. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + vmovdqa XKEY_A, [KEYS + 7*16] + + AES XDATA0, XKEY6 ; 6. ENC + AES XDATA1, XKEY6 + AES XDATA2, XKEY6 + AES XDATA3, XKEY6 + + vmovdqa XKEY_B, [KEYS + 8*16] + + AES XDATA0, XKEY_A ; 7. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + + vmovdqa XKEY_A, [KEYS + 9*16] + + AES XDATA0, XKEY_B ; 8. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + vmovdqa XKEY_B, [KEYS + 10*16] + + AES XDATA0, XKEY_A ; 9. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A + +%if %%NROUNDS >= 12 + vmovdqa XKEY_A, [KEYS + 11*16] + + AES XDATA0, XKEY_B ; 10. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + vmovdqa XKEY_B, [KEYS + 12*16] + + AES XDATA0, XKEY_A ; 11. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A +%endif + +%if %%NROUNDS == 14 + vmovdqa XKEY_A, [KEYS + 13*16] + + AES XDATA0, XKEY_B ; 12. ENC + AES XDATA1, XKEY_B + AES XDATA2, XKEY_B + AES XDATA3, XKEY_B + + vmovdqa XKEY_B, [KEYS + 14*16] + + AES XDATA0, XKEY_A ; 13. ENC + AES XDATA1, XKEY_A + AES XDATA2, XKEY_A + AES XDATA3, XKEY_A +%endif + + AES_LAST XDATA0, XKEY_B ; 10/12/14. 
ENC (depending on key size) + AES_LAST XDATA1, XKEY_B + AES_LAST XDATA2, XKEY_B + AES_LAST XDATA3, XKEY_B + + vmovdqu [OUT + IDX + 0*16 - 4*16], XDATA0 + vmovdqu [OUT + IDX + 1*16 - 4*16], XDATA1 + vmovdqu [OUT + IDX + 2*16 - 4*16], XDATA2 + vmovdqu [OUT + IDX + 3*16 - 4*16], XDATA3 + + cmp IDX, LEN + jne %%main_loop + +%%done: + + ret + +%endmacro + +align 16 +MKGLOBAL(AES_ECB_ENC_128,function,internal) +AES_ECB_ENC_128: + + AES_ECB 10, ENC + +align 16 +MKGLOBAL(AES_ECB_ENC_192,function,internal) +AES_ECB_ENC_192: + + AES_ECB 12, ENC + +align 16 +MKGLOBAL(AES_ECB_ENC_256,function,internal) +AES_ECB_ENC_256: + + AES_ECB 14, ENC + +align 16 +MKGLOBAL(AES_ECB_DEC_128,function,internal) +AES_ECB_DEC_128: + + AES_ECB 10, DEC + +align 16 +MKGLOBAL(AES_ECB_DEC_192,function,internal) +AES_ECB_DEC_192: + + AES_ECB 12, DEC + +align 16 +MKGLOBAL(AES_ECB_DEC_256,function,internal) +AES_ECB_DEC_256: + + AES_ECB 14, DEC + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm new file mode 100644 index 000000000..615e19050 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm @@ -0,0 +1,418 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; routine to do 128 bit AES XCBC +;;; +;;; Runs 8 independent lanes in lock-step, one 16-byte block per lane per +;;; iteration. For each lane i the running value starts as ICV[i]; every +;;; input block is XORed into it and the result is encrypted with the round +;;; keys found at keys[i] + 16*round (rounds 0..10). On exit the final value +;;; of each lane is written back to ICV[i] and each in[i] is advanced by len. + +;; clobbers all registers except for ARG1 and rbp +;; (rbp is the only register saved and restored below; rbx and r12-r15 are +;; overwritten without being preserved) + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +;; VPXOR2 dst, src : dst = dst XOR src (two-operand wrapper around vpxor) +%macro VPXOR2 2 + vpxor %1, %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_XCBC_ARGS_x8 { +;; void* in[8]; +;; UINT128* keys[8]; +;; UINT128 ICV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_xcbc_mac_128_x8(AES_XCBC_ARGS_x8 *args, UINT64 len); +;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure +;; arg 2: LEN : len (in units of bytes) +;; +;; The same len applies to all 8 lanes. The first block is processed +;; unconditionally and the loop is left only when the byte index equals len, +;; so len is expected to be a non-zero multiple of 16. + +struc STACK +_gpr_save: resq 1 ; slot for the caller's rbp +_len: resq 1 ; slot for LEN (its register is reused as IN6) +endstruc + +%define GPR_SAVE_AREA rsp + _gpr_save +%define LEN_AREA rsp + _len + +;; argument registers: rdi/rsi when LINUX is defined, rcx/rdx otherwise; +;; REG3/REG4 are the remaining two of rcx/rdx/rsi/rdi +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax ; byte offset of the next block inside every input buffer +%define TMP rbx ; scratch: input pointer of the odd-numbered lanes +
+%define KEYS0 REG3 +%define KEYS1 REG4 +%define KEYS2 rbp +%define KEYS3 r8 +%define KEYS4 r9 +%define KEYS5 r10 +%define KEYS6 r11 +%define KEYS7 r12 + +%define IN0 r13 +%define IN2 r14 +%define IN4 r15 +%define IN6 LEN + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XDATA4 xmm4 +%define XDATA5 xmm5 +%define XDATA6 xmm6 +%define XDATA7 xmm7 + +%define XKEY0_3 xmm8 +%define XKEY1_4 xmm9 +%define XKEY2_5 xmm10 +%define XKEY3_6 xmm11 +%define XKEY4_7 xmm12 +%define XKEY5_8 xmm13 +%define XKEY6_9 xmm14 +%define XTMP xmm15 + +section .text +MKGLOBAL(aes_xcbc_mac_128_x8,function,internal) +aes_xcbc_mac_128_x8: + + sub rsp, STACK_size + mov [GPR_SAVE_AREA + 8*0], rbp + + mov IDX, 16 + mov [LEN_AREA], LEN + + mov IN0, [ARG + _aesxcbcarg_in + 8*0] + mov IN2, [ARG + _aesxcbcarg_in + 8*2] + mov IN4, [ARG + _aesxcbcarg_in + 8*4] + mov IN6, [ARG + _aesxcbcarg_in + 8*6] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Only the even lanes keep their input pointer in a register + ;; (IN0/IN2/IN4/IN6); the odd lanes' pointers are read from the args + ;; structure into TMP on demand. IN6 is an alias of the LEN register, + ;; which is why LEN was stored to LEN_AREA before IN6 was loaded. + + mov TMP, [ARG + _aesxcbcarg_in + 8*1] + VMOVDQ XDATA0, [IN0] ; load first block of plain text + VMOVDQ XDATA1, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*3] + VMOVDQ XDATA2, [IN2] ; load first block of plain text + VMOVDQ XDATA3, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*5] + VMOVDQ XDATA4, [IN4] ; load first block of plain text + VMOVDQ XDATA5, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*7] + VMOVDQ XDATA6, [IN6] ; load first block of plain text + VMOVDQ XDATA7, [TMP] ; load first block of plain text + + + VPXOR2 XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV + VPXOR2 XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV + VPXOR2 XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV + VPXOR2 XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV + VPXOR2 XDATA4, [ARG + _aesxcbcarg_ICV + 16*4] ; plaintext XOR ICV + VPXOR2 XDATA5, [ARG + _aesxcbcarg_ICV + 16*5] ;
plaintext XOR ICV + VPXOR2 XDATA6, [ARG + _aesxcbcarg_ICV + 16*6] ; plaintext XOR ICV + VPXOR2 XDATA7, [ARG + _aesxcbcarg_ICV + 16*7] ; plaintext XOR ICV + + ;; load the 8 per-lane key-schedule pointers + mov KEYS0, [ARG + _aesxcbcarg_keys + 8*0] + mov KEYS1, [ARG + _aesxcbcarg_keys + 8*1] + mov KEYS2, [ARG + _aesxcbcarg_keys + 8*2] + mov KEYS3, [ARG + _aesxcbcarg_keys + 8*3] + mov KEYS4, [ARG + _aesxcbcarg_keys + 8*4] + mov KEYS5, [ARG + _aesxcbcarg_keys + 8*5] + mov KEYS6, [ARG + _aesxcbcarg_keys + 8*6] + mov KEYS7, [ARG + _aesxcbcarg_keys + 8*7] + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + ;; XKEYn_r caches round key r of lane n in a register; each one is loaded + ;; once during this first block and reused by main_loop without reloading + vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4.
ENC + vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9.
ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC + + ;; first block done; IDX is still 16, so a 16-byte message finishes here + cmp [LEN_AREA], IDX + je done + +main_loop: + ;; Chaining step: XOR the next input block of every lane into its running + ;; value, then repeat the same 10-round encryption as above. IDX advances + ;; by 16 per pass and the loop is left only when IDX equals the saved + ;; length exactly. + mov TMP, [ARG + _aesxcbcarg_in + 8*1] + VPXOR2 XDATA0, [IN0 + IDX] ; XOR in next block of plain text + VPXOR2 XDATA1, [TMP + IDX] ; XOR in next block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*3] + VPXOR2 XDATA2, [IN2 + IDX] ; XOR in next block of plain text + VPXOR2 XDATA3, [TMP + IDX] ; XOR in next block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*5] + VPXOR2 XDATA4, [IN4 + IDX] ; XOR in next block of plain text + VPXOR2 XDATA5, [TMP + IDX] ; XOR in next block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*7] + VPXOR2 XDATA6, [IN6 + IDX] ; XOR in next block of plain text + VPXOR2 XDATA7, [TMP + IDX] ; XOR in next block of plain text + + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1.
ENC + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7.
ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenclast XDATA7, [KEYS7 + 16*10] ; 10.
ENC + + add IDX, 16 + cmp [LEN_AREA], IDX + jne main_loop + +done: + ;; update ICV: write each lane's final value back to ICV[i] + ;; (vmovdqa is the aligned store form, so the ICV slots must be 16-byte aligned) + vmovdqa [ARG + _aesxcbcarg_ICV + 16*0], XDATA0 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*1], XDATA1 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*2], XDATA2 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*3], XDATA3 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*4], XDATA4 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*5], XDATA5 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*6], XDATA6 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*7], XDATA7 + + ;; update IN: advance all 8 input pointers by len, two pointers per vpaddq + ;; NOTE(review): vmovd reads only the low 32 bits of the saved length + ;; (zero-extended), so the advance assumes len < 4 GiB - confirm with callers + vmovd xmm0, [LEN_AREA] + vpshufd xmm0, xmm0, 0x44 ; replicate the low qword into both qword halves + vpaddq xmm1, xmm0, [ARG + _aesxcbcarg_in + 16*0] + vpaddq xmm2, xmm0, [ARG + _aesxcbcarg_in + 16*1] + vpaddq xmm3, xmm0, [ARG + _aesxcbcarg_in + 16*2] + vpaddq xmm4, xmm0, [ARG + _aesxcbcarg_in + 16*3] + vmovdqa [ARG + _aesxcbcarg_in + 16*0], xmm1 + vmovdqa [ARG + _aesxcbcarg_in + 16*1], xmm2 + vmovdqa [ARG + _aesxcbcarg_in + 16*2], xmm3 + vmovdqa [ARG + _aesxcbcarg_in + 16*3], xmm4 + +;; XMMs are saved at a higher level + mov rbp, [GPR_SAVE_AREA + 8*0] ; restore rbp (it served as KEYS2) + + add rsp, STACK_size + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm new file mode 100644 index 000000000..1bb601e4f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%include "avx/gcm_avx_gen2.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm new file mode 100644 index 000000000..4de59d5bf --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM192_MODE 1 +%include "avx/gcm_avx_gen2.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm new file mode 100644 index 000000000..de8eadf4c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm @@ -0,0 +1,30 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define GCM256_MODE 1 +%include "avx/gcm_avx_gen2.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm new file mode 100644 index 000000000..2aa3a162d --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm @@ -0,0 +1,2515 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2019 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in the paper: +; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; +; For the shift-based reductions used in this code, we used the method described in the paper: +; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16-byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; By the definition of the spec, aadLen must be a multiple of 4 bytes. +; The code additionally supports any aadLen.
+; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "include/os.asm" +%include "include/reg_sizes.asm" +%include "include/clear_regs.asm" +%include "include/gcm_defines.asm" +%include "include/gcm_keys_sse_avx.asm" +%include "include/memcpy.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_avx_gen2.asm!" +%endif +%endif +%endif + +%ifdef GCM128_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2 +%define NROUNDS 9 +%endif + +%ifdef GCM192_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2 +%define NROUNDS 11 +%endif + +%ifdef GCM256_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2 +%define NROUNDS 13 +%endif + +default rel +; need to push 4 registers into stack to maintain +%define STACK_OFFSET 8*4 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +section .text +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. 
>>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;
+; Arguments:
+;   %1 GH      - in/out: multiplicand on entry, reduced product on exit
+;   %2 HK      - in: hash key operand (only read, never written)
+;   %3..%7 T1..T5 - xmm scratch registers, clobbered
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;; Karatsuba: three carry-less multiplies instead of four
+        vpshufd %%T2, %%GH, 01001110b           ; swap the two qwords of GH
+        vpshufd %%T3, %%HK, 01001110b           ; swap the two qwords of HK
+        vpxor %%T2, %%T2, %%GH                  ; %%T2 = (a1+a0)
+        vpxor %%T3, %%T3, %%HK                  ; %%T3 = (b1+b0)
+
+        vpclmulqdq %%T1, %%GH, %%HK, 0x11       ; %%T1 = a1*b1
+        vpclmulqdq %%GH, %%HK, 0x00             ; %%GH = a0*b0
+        vpclmulqdq %%T2, %%T3, 0x00             ; %%T2 = (a1+a0)*(b1+b0)
+        vpxor %%T2, %%T2, %%GH
+        vpxor %%T2, %%T2, %%T1                  ; %%T2 = a0*b1+a1*b0
+
+        vpslldq %%T3, %%T2, 8                   ; %%T3 = middle term shifted left 2 DWs
+        vpsrldq %%T2, %%T2, 8                   ; %%T2 = middle term shifted right 2 DWs
+        vpxor %%GH, %%GH, %%T3
+        vpxor %%T1, %%T1, %%T2                  ; <%%T1:%%GH> = %%GH x %%HK (256-bit product)
+
+        ;first phase of the reduction
+        vpslld %%T2, %%GH, 31                   ; packed (per-dword) left shift << 31
+        vpslld %%T3, %%GH, 30                   ; packed (per-dword) left shift << 30
+        vpslld %%T4, %%GH, 25                   ; packed (per-dword) left shift << 25
+
+        vpxor %%T2, %%T2, %%T3                  ; xor the shifted versions
+        vpxor %%T2, %%T2, %%T4
+
+        vpsrldq %%T5, %%T2, 4                   ; %%T5 = %%T2 shifted right 1 DW (kept for phase 2)
+
+        vpslldq %%T2, %%T2, 12                  ; shift-L %%T2 3 DWs
+        vpxor %%GH, %%GH, %%T2                  ; first phase of the reduction complete
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        ;second phase of the reduction
+
+        vpsrld %%T2,%%GH,1                      ; packed (per-dword) right shift >> 1
+        vpsrld %%T3,%%GH,2                      ; packed (per-dword) right shift >> 2
+        vpsrld %%T4,%%GH,7                      ; packed (per-dword) right shift >> 7
+        vpxor %%T2, %%T2, %%T3                  ; xor the shifted versions
+        vpxor %%T2, %%T2, %%T4
+
+        vpxor %%T2, %%T2, %%T5                  ; fold in the bits saved during phase 1
+        vpxor %%GH, %%GH, %%T2
+        vpxor %%GH, %%GH, %%T1                  ; the result is in %%GH
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PRECOMPUTE: starting from %%HK, repeatedly multiplies by %%HK with GHASH_MUL and
+; stores the successive results at HashKey_2 .. HashKey_8 in %%GDATA. For HK itself
+; and for each stored power it also stores, at HashKey_k / HashKey_i_k, the XOR of
+; the value with its qword-swapped copy (the Karatsuba helper operand).
+; The HashKey slot itself is not written by this macro.
+;   %1 GDATA      - base pointer of the key data structure
+;   %2 HK         - xmm register holding the hash key (only read)
+;   %3..%8 T1..T6 - xmm scratch registers, clobbered
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds XORed values of the low and high parts of HashKey_i
+        vmovdqa %%T5, %%HK                      ; %%T5 is the running power of the key
+
+        vpshufd %%T1, %%T5, 01001110b
+        vpxor %%T1, %%T5
+        vmovdqu [%%GDATA + HashKey_k], %%T1
+
+        ; Note: the caller's %%T2 is handed to GHASH_MUL as its fifth scratch register.
+        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+        vmovdqu [%%GDATA + HashKey_2], %%T5     ; [HashKey_2] = HashKey^2<<1 mod poly
+        vpshufd %%T1, %%T5, 01001110b
+        vpxor %%T1, %%T5
+        vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+        vmovdqu [%%GDATA + HashKey_3], %%T5
+        vpshufd %%T1, %%T5, 01001110b
+        vpxor %%T1, %%T5
+        vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+        vmovdqu [%%GDATA + HashKey_4], %%T5
+        vpshufd %%T1, %%T5, 01001110b
+        vpxor %%T1, %%T5
+        vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+        vmovdqu [%%GDATA + HashKey_5], %%T5
+        vpshufd %%T1, %%T5, 01001110b
+        vpxor %%T1, %%T5
+        vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+        vmovdqu [%%GDATA + HashKey_6], %%T5
+        vpshufd %%T1, %%T5, 01001110b
+        vpxor %%T1, %%T5
+        vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+        vmovdqu [%%GDATA + HashKey_7], %%T5
+        vpshufd %%T1, %%T5, 01001110b
+        vpxor %%T1, %%T5
+        vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+        GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+        vmovdqu [%%GDATA + HashKey_8], %%T5
+        vpshufd %%T1, %%T5, 01001110b
+        vpxor %%T1, %%T5
+        vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+; Bytes beyond LENGTH are left zero in OUTPUT; memory at or past INPUT+LENGTH is never read.
+; END_READ_LOCATION, COUNTER and TMP1 are general-purpose scratch registers (clobbered).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+        vpxor %%OUTPUT, %%OUTPUT                ; start from an all-zero block
+        mov %%COUNTER, %%LENGTH
+        mov %%END_READ_LOCATION, %%INPUT
+        add %%END_READ_LOCATION, %%LENGTH       ; one past the last input byte
+        xor %%TMP1, %%TMP1
+
+
+        cmp %%COUNTER, 8
+        jl %%_byte_loop_2
+        vpinsrq %%OUTPUT, [%%INPUT],0           ;Read in 8 bytes if they exist
+        je %%_done                              ; flags are still those of the cmp above: exactly 8 bytes
+
+        sub %%COUNTER, 8
+
+%%_byte_loop_1:                                 ;Read in data 1 byte at a time while data is left
+        shl %%TMP1, 8                           ;This loop handles when 8 bytes were already read in
+        dec %%END_READ_LOCATION                 ; walk backwards so the last byte ends up most significant
+        mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+        dec %%COUNTER
+        jg %%_byte_loop_1
+        vpinsrq %%OUTPUT, %%TMP1, 1             ; remaining bytes go into the high qword
+        jmp %%_done
+
+%%_byte_loop_2:                                 ;Read in data 1 byte at a time while data is left
+        cmp %%COUNTER, 0
+        je %%_done                              ; zero-length input: OUTPUT stays 0
+        shl %%TMP1, 8                           ;This loop handles when no bytes were already read in
+        dec %%END_READ_LOCATION
+        mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+        dec %%COUNTER
+        jg %%_byte_loop_2
+        vpinsrq %%OUTPUT, %%TMP1, 0             ; fewer than 8 bytes: they go into the low qword
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Arguments:
+;   %1  A_IN       - pointer to the data to hash
+;   %2  A_LEN      - its length in bytes
+;   %3  AAD_HASH   - xmm register receiving the hash (zeroed on entry)
+;   %4  GDATA_KEY  - base pointer of the key data (HashKey_i table)
+;   %5..%10  XTMP0..XTMP5 - xmm scratch registers, clobbered
+;   %11..%15 T1..T5       - general-purpose scratch registers, clobbered
+; Flow: full 128-byte chunks are hashed 8 blocks at a time with one reduction per
+; chunk; then any remaining whole 16-byte blocks (1..7) with one reduction; then a
+; trailing partial block (< 16 bytes) is zero-padded and hashed with GHASH_MUL.
+%macro CALC_AAD_HASH 15
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%GDATA_KEY %4
+%define %%XTMP0 %5 ; xmm temp reg 0
+%define %%XTMP1 %6 ; xmm temp reg 1
+%define %%XTMP2 %7
+%define %%XTMP3 %8
+%define %%XTMP4 %9
+%define %%XTMP5 %10 ; xmm temp reg 5
+%define %%T1 %11 ; temp reg 1
+%define %%T2 %12
+%define %%T3 %13
+%define %%T4 %14
+%define %%T5 %15 ; temp reg 5
+
+
+        mov %%T1, %%A_IN                        ; T1 = AAD
+        mov %%T2, %%A_LEN                       ; T2 = aadLen
+        vpxor %%AAD_HASH, %%AAD_HASH
+
+%%_get_AAD_loop128:
+        cmp %%T2, 128
+        jl %%_exit_AAD_loop128
+
+        vmovdqu %%XTMP0, [%%T1 + 16*0]
+        vpshufb %%XTMP0, [rel SHUF_MASK]        ; byte-reflect the block
+
+        vpxor %%XTMP0, %%AAD_HASH               ; fold the running hash into the first block
+
+        vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_8]
+        vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = a1*b1
+        vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = a0*b0
+        vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = a1*b0
+        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%XTMP4 = a0*b1
+        vpxor %%XTMP3, %%XTMP3, %%XTMP4         ; %%XTMP3 = a1*b0 + a0*b1
+
+; blocks 1..7 of the chunk are multiplied by HashKey_7 .. HashKey_1 respectively
+%assign i 1
+%assign j 7
+%rep 7
+        vmovdqu %%XTMP0, [%%T1 + 16*i]
+        vpshufb %%XTMP0, [rel SHUF_MASK]
+
+        vmovdqu %%XTMP5, [%%GDATA_KEY + HashKey_ %+ j]
+        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = XTMP1 + a1*b1
+        vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = XTMP2 + a0*b0
+        vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = XTMP3 + a1*b0 + a0*b1
+        vpxor %%XTMP3, %%XTMP3, %%XTMP4
+        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+        vpxor %%XTMP3, %%XTMP3, %%XTMP4
+%assign i (i + 1)
+%assign j (j - 1)
+%endrep
+
+        vpslldq %%XTMP4, %%XTMP3, 8             ; shift-L 2 DWs
+        vpsrldq %%XTMP3, %%XTMP3, 8             ; shift-R 2 DWs
+        vpxor %%XTMP2, %%XTMP2, %%XTMP4
+        vpxor %%XTMP1, %%XTMP1, %%XTMP3         ; accumulate the results in %%XTMP1(high):%%XTMP2(low)
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;first phase of the reduction
+        vmovdqa %%XTMP5, [rel POLY2]            ; aligned load of the reduction constant
+        vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+        vpslldq %%XTMP0, %%XTMP0, 8             ; shift-L %%XTMP0 2 DWs
+        vpxor %%XTMP2, %%XTMP2, %%XTMP0         ; first phase of the reduction complete
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;second phase of the reduction
+        vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+        vpsrldq %%XTMP3, %%XTMP3, 4             ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+        vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+        vpslldq %%XTMP4, %%XTMP4, 4             ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+        vpxor %%XTMP4, %%XTMP4, %%XTMP3         ; second phase of the reduction complete
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        vpxor %%AAD_HASH, %%XTMP1, %%XTMP4      ; the result is in %%AAD_HASH
+
+        sub %%T2, 128
+        je %%_CALC_AAD_done                     ; length was an exact multiple of 128
+
+        add %%T1, 128
+        jmp %%_get_AAD_loop128
+
+%%_exit_AAD_loop128:
+        cmp %%T2, 16
+        jl %%_get_small_AAD_block
+
+        ;; calculate hash_key position to start with:
+        ;; T3 = GDATA_KEY + HashKey_1 + 16 - 16*(number of whole blocks left)
+        ;; NOTE(review): this relies on the HashKey_i slots being consecutive and
+        ;; 16 bytes apart with HashKey_1 at the highest offset - the layout is
+        ;; defined in an included header, confirm there.
+        mov %%T3, %%T2
+        and %%T3, -16                           ; 1 to 7 blocks possible here
+        neg %%T3
+        add %%T3, HashKey_1 + 16
+        lea %%T3, [%%GDATA_KEY + %%T3]
+
+        vmovdqu %%XTMP0, [%%T1]
+        vpshufb %%XTMP0, [rel SHUF_MASK]
+
+        vpxor %%XTMP0, %%AAD_HASH               ; fold the running hash into the first block
+
+        vmovdqu %%XTMP5, [%%T3]
+        vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = a1*b1
+        vpclmulqdq %%XTMP2, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = a0*b0
+        vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = a1*b0
+        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10 ; %%XTMP4 = a0*b1
+        vpxor %%XTMP3, %%XTMP3, %%XTMP4         ; %%XTMP3 = a1*b0 + a0*b1
+
+        add %%T3, 16                            ; move to next hashkey
+        add %%T1, 16                            ; move to next data block
+        sub %%T2, 16
+        cmp %%T2, 16
+        jl %%_AAD_reduce
+
+%%_AAD_blocks:
+        vmovdqu %%XTMP0, [%%T1]
+        vpshufb %%XTMP0, [rel SHUF_MASK]
+
+        vmovdqu %%XTMP5, [%%T3]
+        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x11 ; %%XTMP1 = XTMP1 + a1*b1
+        vpxor %%XTMP1, %%XTMP1, %%XTMP4
+
+        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x00 ; %%XTMP2 = XTMP2 + a0*b0
+        vpxor %%XTMP2, %%XTMP2, %%XTMP4
+
+        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x01 ; %%XTMP3 = XTMP3 + a1*b0 + a0*b1
+        vpxor %%XTMP3, %%XTMP3, %%XTMP4
+        vpclmulqdq %%XTMP4, %%XTMP0, %%XTMP5, 0x10
+        vpxor %%XTMP3, %%XTMP3, %%XTMP4
+
+        add %%T3, 16                            ; move to next hashkey
+        add %%T1, 16
+        sub %%T2, 16
+        cmp %%T2, 16
+        jl %%_AAD_reduce
+        jmp %%_AAD_blocks
+
+%%_AAD_reduce:
+        vpslldq %%XTMP4, %%XTMP3, 8             ; shift-L 2 DWs
+        vpsrldq %%XTMP3, %%XTMP3, 8             ; shift-R 2 DWs
+        vpxor %%XTMP2, %%XTMP2, %%XTMP4
+        vpxor %%XTMP1, %%XTMP1, %%XTMP3         ; accumulate the results in %%XTMP1(high):%%XTMP2(low)
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;first phase of the reduction
+        vmovdqa %%XTMP5, [rel POLY2]
+        vpclmulqdq %%XTMP0, %%XTMP5, %%XTMP2, 0x01
+        vpslldq %%XTMP0, %%XTMP0, 8             ; shift-L %%XTMP0 2 DWs
+        vpxor %%XTMP2, %%XTMP2, %%XTMP0         ; first phase of the reduction complete
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ;second phase of the reduction
+        vpclmulqdq %%XTMP3, %%XTMP5, %%XTMP2, 0x00
+        vpsrldq %%XTMP3, %%XTMP3, 4             ; shift-R 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+        vpclmulqdq %%XTMP4, %%XTMP5, %%XTMP2, 0x10
+        vpslldq %%XTMP4, %%XTMP4, 4             ; shift-L 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+        vpxor %%XTMP4, %%XTMP4, %%XTMP3         ; second phase of the reduction complete
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        vpxor %%AAD_HASH, %%XTMP1, %%XTMP4      ; the result is in %%AAD_HASH
+
+        or %%T2, %%T2                           ; any bytes left after the whole blocks?
+        je %%_CALC_AAD_done
+
+%%_get_small_AAD_block:
+        ; Trailing 0..15 bytes: zero-padded read, then one ordinary GHASH multiply.
+        vmovdqu %%XTMP0, [%%GDATA_KEY + HashKey]
+        READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+        ;byte-reflect the AAD data
+        vpshufb %%XTMP1, [rel SHUF_MASK]
+        vpxor %%AAD_HASH, %%XTMP1
+        GHASH_MUL %%AAD_HASH, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input:
+; GDATA_KEY - struct gcm_key_data *
+; GDATA_CTX - struct gcm_context_data *
+; PLAIN_CYPH_IN - input text
+; PLAIN_CYPH_LEN - input text length
+; DATA_OFFSET - the current data offset (applied to both input and output;
+;               advanced by the number of bytes written)
+; AAD_HASH - xmm register holding the running GHASH value (updated)
+; ENC_DEC - whether encoding or decoding
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+;         (PBlockLen and AadHash are written back)
+; Does nothing when the context's PBlockLen is 0.
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+        mov r13, [%%GDATA_CTX + PBlockLen]      ; r13 = bytes already held in the partial block
+        cmp r13, 0
+        je %%_partial_block_done                ;Leave Macro if no partial blocks
+
+        cmp %%PLAIN_CYPH_LEN, 16                ;Read in input data without over reading
+        jl %%_fewer_than_16_bytes
+        ; 16 or more bytes of data: just fill the xmm register. The offset is
+        ; applied here as well so both read paths and the stores below agree.
+        VXLDR xmm1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+        jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+        lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+        READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read:                                   ;Finished reading in data
+
+
+        vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+        lea r12, [SHIFT_MASK]
+
+        add r12, r13                            ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+        vmovdqu xmm2, [r12]                     ; get the appropriate shuffle mask
+        vpshufb xmm9, xmm2                      ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+        vmovdqa xmm3, xmm1                      ; keep the ciphertext for GHASH
+        vpxor xmm9, xmm1                        ; Cyphertext XOR E(K, Yn)
+
+        mov r15, %%PLAIN_CYPH_LEN
+        add r15, r13
+        sub r15, 16                             ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge %%_no_extra_mask_1                  ;Determine if partial block is not being filled and shift mask accordingly
+        sub r12, r15                            ; r15 < 0 here: block stays incomplete, move the mask pointer further
+%%_no_extra_mask_1:
+
+        vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK]  ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+        vpand xmm9, xmm1                        ; mask out bottom r13 bytes of xmm9
+
+        vpand xmm3, xmm1
+        vpshufb xmm3, [SHUF_MASK]
+        vpshufb xmm3, xmm2
+        vpxor %%AAD_HASH, xmm3
+
+
+        cmp r15,0
+        jl %%_partial_incomplete_1
+
+        GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+        xor rax,rax
+        mov [%%GDATA_CTX + PBlockLen], rax      ; block completed: no partial data pending
+        jmp %%_dec_done
+%%_partial_incomplete_1:
+%ifidn __OUTPUT_FORMAT__, win64
+        mov rax, %%PLAIN_CYPH_LEN               ; go through rax on win64 (no direct add of the length operand)
+        add [%%GDATA_CTX + PBlockLen], rax
+%else
+        add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_dec_done:
+        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+        vpxor xmm9, xmm1                        ; Plaintext XOR E(K, Yn)
+
+        mov r15, %%PLAIN_CYPH_LEN
+        add r15, r13
+        sub r15, 16                             ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge %%_no_extra_mask_2                  ;Determine if partial block is not being filled and shift mask accordingly
+        sub r12, r15                            ; r15 < 0 here: block stays incomplete, move the mask pointer further
+%%_no_extra_mask_2:
+
+        vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK]  ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+        vpand xmm9, xmm1                        ; mask out bottom r13 bytes of xmm9
+
+        vpshufb xmm9, [SHUF_MASK]
+        vpshufb xmm9, xmm2
+        vpxor %%AAD_HASH, xmm9
+
+        cmp r15,0
+        jl %%_partial_incomplete_2
+
+        GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+        xor rax,rax
+        mov [%%GDATA_CTX + PBlockLen], rax      ; block completed: no partial data pending
+        jmp %%_encode_done
+%%_partial_incomplete_2:
+%ifidn __OUTPUT_FORMAT__, win64
+        mov rax, %%PLAIN_CYPH_LEN               ; go through rax on win64 (no direct add of the length operand)
+        add [%%GDATA_CTX + PBlockLen], rax
+%else
+        add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+%%_encode_done:
+        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+        vpshufb xmm9, [SHUF_MASK]               ; shuffle xmm9 back to output as ciphertext
+        vpshufb xmm9, xmm2
+%endif
+
+
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+        ; output encrypted Bytes
+        cmp r15,0
+        jl %%_partial_fill
+        mov r12, r13
+        mov r13, 16
+        sub r13, r12                            ; Set r13 to be the number of bytes to write out
+        jmp %%_count_set
+%%_partial_fill:
+        mov r13, %%PLAIN_CYPH_LEN               ; block not completed: write exactly the bytes supplied
+%%_count_set:
+        vmovq rax, xmm9
+        cmp r13, 8
+        jle %%_less_than_8_bytes_left
+
+        mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax ; more than 8 bytes: store the low qword whole
+        add %%DATA_OFFSET, 8
+        vpsrldq xmm9, xmm9, 8
+        vmovq rax, xmm9
+        sub r13, 8
+%%_less_than_8_bytes_left:
+        mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al ; remaining 1..8 bytes go out one at a time
+        add %%DATA_OFFSET, 1
+        shr rax, 8
+        sub r13, 1
+        jne %%_less_than_8_bytes_left
+        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; Updated AAD_HASH is returned in %%T3
+;
+; The incoming hash value is taken from %%XMM8. %%DATA_OFFSET is advanced by
+; 16 per initial block, and by a further 128 when the 8-block stage below runs
+; (only when %%LENGTH >= 128). reg(n) is a register-by-index helper defined
+; outside this file section (presumably reg(n) = xmm<n> - confirm in the include).
+
+%macro INITIAL_BLOCKS 24
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %8
+%define %%HASH_KEY %9
+%define %%T3 %10
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+
+%assign i (8-%%num_initial_blocks)
+                vmovdqu reg(i), %%XMM8          ; move AAD_HASH to temp reg
+                ; start AES for %%num_initial_blocks blocks
+                vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+
+; counter blocks for the initial blocks live in reg(9-n) .. reg(8)
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+                vpaddd %%CTR, [ONE]             ; INCR Y0
+                vmovdqa reg(i), %%CTR
+                vpshufb reg(i), [SHUF_MASK]     ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+                vmovdqu %%T_key, [%%GDATA_KEY+16*0] ; round key 0: plain XOR
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+                vpxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+; round keys 1..NROUNDS: one vaesenc per block per round
+%assign j 1
+%rep NROUNDS
+                vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+                vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep ; NROUNDS
+
+
+                ; j == NROUNDS+1 here: the final round key
+                vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+                vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+                VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+                vpxor reg(i), %%T1
+                VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+                add %%DATA_OFFSET, 16
+                %ifidn %%ENC_DEC, DEC
+                vmovdqa reg(i), %%T1            ; when decrypting, GHASH runs over the input (ciphertext)
+                %endif
+                vpshufb reg(i), [SHUF_MASK]     ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+; chain the hash: reg(j) = (reg(j) xor reg(j-1)) * HASH_KEY, starting from the saved AAD_HASH
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+        vpxor reg(j), reg(i)
+        GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+        ; %%XMM8 has the current Hash Value
+        vmovdqa %%T3, %%XMM8
+
+        cmp %%LENGTH, 128
+        jl %%_initial_blocks_done               ; fewer than 8 whole blocks remain: skip the 8-block stage
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Prepare 8 counter blocks and perform rounds of AES cipher on them, load plain/cipher text and
+; store cipher/plain text.
+; Keep 8 cipher text blocks for further GHASH computations (XMM1 - XMM8)
+; - combine current GHASH value into block 0 (XMM1)
+
+                vpaddd %%CTR, [ONE]             ; INCR Y0
+                vmovdqa %%XMM1, %%CTR
+                vpshufb %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
+
+                vpaddd %%CTR, [ONE]             ; INCR Y0
+                vmovdqa %%XMM2, %%CTR
+                vpshufb %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
+
+                vpaddd %%CTR, [ONE]             ; INCR Y0
+                vmovdqa %%XMM3, %%CTR
+                vpshufb %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
+
+                vpaddd %%CTR, [ONE]             ; INCR Y0
+                vmovdqa %%XMM4, %%CTR
+                vpshufb %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
+
+                vpaddd %%CTR, [ONE]             ; INCR Y0
+                vmovdqa %%XMM5, %%CTR
+                vpshufb %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
+
+                vpaddd %%CTR, [ONE]             ; INCR Y0
+                vmovdqa %%XMM6, %%CTR
+                vpshufb %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
+
+                vpaddd %%CTR, [ONE]             ; INCR Y0
+                vmovdqa %%XMM7, %%CTR
+                vpshufb %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
+
+                vpaddd %%CTR, [ONE]             ; INCR Y0
+                vmovdqa %%XMM8, %%CTR
+                vpshufb %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap
+
+                vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+                vpxor %%XMM1, %%T_key
+                vpxor %%XMM2, %%T_key
+                vpxor %%XMM3, %%T_key
+                vpxor %%XMM4, %%T_key
+                vpxor %%XMM5, %%T_key
+                vpxor %%XMM6, %%T_key
+                vpxor %%XMM7, %%T_key
+                vpxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep NROUNDS
+                vmovdqu %%T_key, [%%GDATA_KEY+16*i]
+                vaesenc %%XMM1, %%T_key
+                vaesenc %%XMM2, %%T_key
+                vaesenc %%XMM3, %%T_key
+                vaesenc %%XMM4, %%T_key
+                vaesenc %%XMM5, %%T_key
+                vaesenc %%XMM6, %%T_key
+                vaesenc %%XMM7, %%T_key
+                vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+                ; i == NROUNDS+1 here: the final round key
+                vmovdqu %%T_key, [%%GDATA_KEY+16*i]
+                vaesenclast %%XMM1, %%T_key
+                vaesenclast %%XMM2, %%T_key
+                vaesenclast %%XMM3, %%T_key
+                vaesenclast %%XMM4, %%T_key
+                vaesenclast %%XMM5, %%T_key
+                vaesenclast %%XMM6, %%T_key
+                vaesenclast %%XMM7, %%T_key
+                vaesenclast %%XMM8, %%T_key
+
+                ; For each block: XOR keystream with input, store the result, and
+                ; (when decrypting) keep the input ciphertext in the register for GHASH.
+                VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+                vpxor %%XMM1, %%T1
+                VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+                %ifidn %%ENC_DEC, DEC
+                vmovdqa %%XMM1, %%T1
+                %endif
+
+                VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+                vpxor %%XMM2, %%T1
+                VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+                %ifidn %%ENC_DEC, DEC
+                vmovdqa %%XMM2, %%T1
+                %endif
+
+                VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+                vpxor %%XMM3, %%T1
+                VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+                %ifidn %%ENC_DEC, DEC
+                vmovdqa %%XMM3, %%T1
+                %endif
+
+                VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+                vpxor %%XMM4, %%T1
+                VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+                %ifidn %%ENC_DEC, DEC
+                vmovdqa %%XMM4, %%T1
+                %endif
+
+                VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+                vpxor %%XMM5, %%T1
+                VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+                %ifidn %%ENC_DEC, DEC
+                vmovdqa %%XMM5, %%T1
+                %endif
+
+                VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+                vpxor %%XMM6, %%T1
+                VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+                %ifidn %%ENC_DEC, DEC
+                vmovdqa %%XMM6, %%T1
+                %endif
+
+                VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+                vpxor %%XMM7, %%T1
+                VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+                %ifidn %%ENC_DEC, DEC
+                vmovdqa %%XMM7, %%T1
+                %endif
+
+                VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+                vpxor %%XMM8, %%T1
+                VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+                %ifidn %%ENC_DEC, DEC
+                vmovdqa %%XMM8, %%T1
+                %endif
+
+                add %%DATA_OFFSET, 128
+
+                vpshufb %%XMM1, [SHUF_MASK]     ; perform a 16Byte swap
+                vpxor %%XMM1, %%T3              ; combine GHASHed value with the corresponding ciphertext
+                vpshufb %%XMM2, [SHUF_MASK]     ; perform a 16Byte swap
+                vpshufb %%XMM3, [SHUF_MASK]     ; perform a 16Byte swap
+                vpshufb %%XMM4, [SHUF_MASK]     ; perform a 16Byte swap
+                vpshufb %%XMM5, [SHUF_MASK]     ; perform a 16Byte swap
+                vpshufb %%XMM6, [SHUF_MASK]     ; perform a 16Byte swap
+                vpshufb %%XMM7, [SHUF_MASK]     ; perform a 16Byte swap
+                vpshufb %%XMM8, [SHUF_MASK]     ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; r11 is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+        vmovdqa %%T2, %%XMM1
+        vmovdqu [rsp + TMP2], %%XMM2
+        vmovdqu [rsp + TMP3], %%XMM3
+        vmovdqu [rsp + TMP4], %%XMM4
+        vmovdqu [rsp + TMP5], %%XMM5
+        vmovdqu [rsp + TMP6], %%XMM6
+        vmovdqu [rsp + TMP7], %%XMM7
+        vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+                vpaddd %%XMM1, %%CTR, [ONE]     ; INCR CNT
+                vpaddd %%XMM2, %%XMM1, [ONE]
+                vpaddd %%XMM3, %%XMM2, [ONE]
+                vpaddd %%XMM4, %%XMM3, [ONE]
+                vpaddd %%XMM5, %%XMM4, [ONE]
+                vpaddd %%XMM6, %%XMM5, [ONE]
+                vpaddd %%XMM7, %%XMM6, [ONE]
+                vpaddd %%XMM8, %%XMM7, [ONE]
+                vmovdqa %%CTR, %%XMM8
+
+
vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap +%else + vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT + vpaddd %%XMM2, %%XMM1, [ONEf] + vpaddd %%XMM3, %%XMM2, [ONEf] + vpaddd %%XMM4, %%XMM3, [ONEf] + vpaddd %%XMM5, %%XMM4, [ONEf] + vpaddd %%XMM6, %%XMM5, [ONEf] + vpaddd %%XMM7, %%XMM6, [ONEf] + vpaddd %%XMM8, %%XMM7, [ONEf] + vmovdqa %%CTR, %%XMM8 +%endif + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*0] + vpxor %%XMM1, %%T1 + vpxor %%XMM2, %%T1 + vpxor %%XMM3, %%T1 + vpxor %%XMM4, %%T1 + vpxor %%XMM5, %%T1 + vpxor %%XMM6, %%T1 + vpxor %%XMM7, %%T1 + vpxor %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*1] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [%%GDATA + 16*2] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 + + vpshufd %%T6, %%T2, 01001110b + vpxor %%T6, %%T2 + + vmovdqu %%T5, [%%GDATA + HashKey_8_k] + vpclmulqdq %%T6, %%T6, %%T5, 0x00 ; + + + vmovdqu %%T1, [%%GDATA 
+ 16*3] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP2] + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_7_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*4] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu %%T1, [rsp + TMP3] + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_6_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*5] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [rsp + TMP4] + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_5_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*6] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + 
TMP5] + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_4_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + + vmovdqu %%T1, [%%GDATA + 16*7] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP6] + vmovdqu %%T5, [%%GDATA + HashKey_3] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_3_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*8] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP7] + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_2_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + 16*9] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T1, [rsp + TMP8] + vmovdqu %%T5, [%%GDATA + HashKey] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 
+ vmovdqu %%T5, [%%GDATA + HashKey_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vpxor %%T6, %%T4 + vpxor %%T6, %%T7 + +%ifdef GCM128_MODE + vmovdqu %%T5, [%%GDATA + 16*10] +%endif +%ifdef GCM192_MODE + vmovdqu %%T5, [%%GDATA + 16*10] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] +%endif +%ifdef GCM256_MODE + vmovdqu %%T5, [%%GDATA + 16*10] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*13] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*14] +%endif + +%assign i 0 +%assign j 1 +%rep 8 + +%ifidn %%ENC_DEC, ENC +%ifdef NT_LD + VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] + vpxor %%T2, %%T2, %%T5 +%else + vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] +%endif ; NT_LD + vaesenclast reg(j), reg(j), %%T2 +%else + VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] + vpxor %%T2, 
%%T2, %%T5 + vaesenclast %%T3, reg(j), %%T2 + vpxor reg(j), %%T2, %%T5 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3 +%endif ; %%ENC_DEC + +%assign i (i+1) +%assign j (j+1) +%endrep + + vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs + vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs + vpxor %%T7, %%T3 + vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7 + + + ;first phase of the reduction + + vpslld %%T2, %%T7, 31 ; packed right shifting << 31 + vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30 + vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25 + + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW + + vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs + vpxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + %ifidn %%ENC_DEC, ENC + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer + %endif + + ;second phase of the reduction + + vpsrld %%T2,%%T7,1 ; packed left shifting >> 1 + vpsrld %%T3,%%T7,2 ; packed left shifting >> 2 + vpsrld %%T4,%%T7,7 ; packed left shifting >> 7 + vpxor %%T2, %%T2,%%T3 ; xor the shifted versions + vpxor %%T2, %%T2,%%T4 + + vpxor %%T2, %%T2, %%T1 + vpxor %%T7, %%T7, %%T2 + vpxor %%T6, %%T6, %%T7 ; the result is in %%T6 + + + + 
vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] + vpshufb %%XMM3, [SHUF_MASK] + vpshufb %%XMM4, [SHUF_MASK] + vpshufb %%XMM5, [SHUF_MASK] + vpshufb %%XMM6, [SHUF_MASK] + vpshufb %%XMM7, [SHUF_MASK] + vpshufb %%XMM8, [SHUF_MASK] + + + vpxor %%XMM1, %%T6 + +%endmacro + + +; GHASH the last 8 ciphertext blocks (%%XMM1..%%XMM8, multiplied by HashKey_8..HashKey); the reduced result is left in %%T6. +; %%GDATA is GCM key data +%macro GHASH_LAST_8 16 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 +%define %%XMM8 %16 + ;; Karatsuba Method + + + vpshufd %%T2, %%XMM1, 01001110b + vpxor %%T2, %%XMM1 + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 + vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 + + vmovdqu %%T3, [%%GDATA + HashKey_8_k] + vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 + + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM2, 01001110b + vpxor %%T2, %%XMM2 + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_7_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM3, 01001110b + vpxor %%T2, %%XMM3 + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_6_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM4, 01001110b + vpxor %%T2, %%XMM4 + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_5_k] + vpclmulqdq %%T2, 
%%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM5, 01001110b + vpxor %%T2, %%XMM5 + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_4_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM6, 01001110b + vpxor %%T2, %%XMM6 + vmovdqu %%T5, [%%GDATA + HashKey_3] + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_3_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM7, 01001110b + vpxor %%T2, %%XMM7 + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_2_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM8, 01001110b + vpxor %%T2, %%XMM8 + vmovdqu %%T5, [%%GDATA + HashKey] + vpclmulqdq %%T4, %%XMM8, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM8, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + vpxor %%XMM1, %%XMM1, %%T6 + vpxor %%T2, %%XMM1, %%T7 + + + + + vpslldq %%T4, %%T2, 8 + vpsrldq %%T2, %%T2, 8 + + vpxor %%T7, %%T4 + vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + ;first phase of the reduction + + vpslld %%T2, %%T7, 31 ; packed left shifting << 31 + vpslld %%T3, %%T7, 30 ; packed left shifting << 30 + vpslld %%T4, %%T7, 25 ; packed left shifting << 25 + + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + 
vpsrldq %%T1, %%T2, 4 ; %%T1 = %%T2 shifted right by 1 DW + + vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs + vpxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + + vpsrld %%T2,%%T7,1 ; packed right shifting >> 1 + vpsrld %%T3,%%T7,2 ; packed right shifting >> 2 + vpsrld %%T4,%%T7,7 ; packed right shifting >> 7 + vpxor %%T2, %%T2,%%T3 ; xor the shifted versions + vpxor %%T2, %%T2,%%T4 + + vpxor %%T2, %%T2, %%T1 + vpxor %%T7, %%T7, %%T2 + vpxor %%T6, %%T6, %%T7 ; the result is in %%T6 + + +%endmacro + + +; Encryption of a single block: XOR with round key 0, NROUNDS vaesenc rounds, then vaesenclast +; %%GDATA is GCM key data (round keys are read at %%GDATA + 16*i); %%XMM0 is encrypted in place +%macro ENCRYPT_SINGLE_BLOCK 2 +%define %%GDATA %1 +%define %%XMM0 %2 + + vpxor %%XMM0, [%%GDATA+16*0] +%assign i 1 +%rep NROUNDS + vaesenc %%XMM0, [%%GDATA+16*i] +%assign i (i+1) +%endrep ; NROUNDS + vaesenclast %%XMM0, [%%GDATA+16*i] +%endmacro + + +;; Start of Stack Setup + +%macro FUNC_SAVE 0 + ;; Required for Update/GCM_ENC + ;the number of pushes must equal STACK_OFFSET + push r12 + push r13 + push r14 + push r15 + mov r14, rsp ; r14 keeps the pre-alignment rsp so FUNC_RESTORE can undo the frame + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 ; align rsp down to 64 bytes + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 + vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 + vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 + vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 + vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 + vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 + vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 + vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 + vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 + vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_xmms_avx_asm +%endif +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16] + vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16] + vmovdqu 
xmm13 , [rsp + LOCAL_STORAGE + 7*16] + vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16] + vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16] + vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16] + vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16] + vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16] + vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16] + vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16] +%endif + +;; Required for Update/GCM_ENC + mov rsp, r14 + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. +; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX), +; IV, Additional Authentication data (A_IN), Additional +; Data length (A_LEN) +; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX. +; Clobbers rax, r10-r13, and xmm0-xmm6 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_INIT 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%IV %3 +%define %%A_IN %4 +%define %%A_LEN %5 +%define %%AAD_HASH xmm0 + + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax + vpxor xmm2, xmm3 + mov r10, %%A_LEN + + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash + mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length + xor r10, r10 + mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 + mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = xmm2 ^ xmm3 from above; NOTE(review): intended to be 0 - confirm CALC_AAD_HASH leaves xmm2 == xmm3 + mov r10, %%IV + vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 + vpinsrq xmm2, [r10], 0 + vpinsrd xmm2, [r10+8], 2 + vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv + + vpshufb xmm2, [rel SHUF_MASK] + + vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; 
ctx_data.current_counter = iv +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct +; has been initialized by GCM_INIT +; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT. +; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX), +; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN), +; and whether encoding or decoding (ENC_DEC) +; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and xmm0-xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%ENC_DEC %6 +%define %%DATA_OFFSET r11 + +; Macro flow: +; calculate the number of 16byte blocks in the message +; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' +; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. 
%%_multiple_of_16_bytes' + cmp %%PLAIN_CYPH_LEN, 0 + je %%_multiple_of_16_bytes + + xor %%DATA_OFFSET, %%DATA_OFFSET +%ifidn __OUTPUT_FORMAT__, win64 + mov rax, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + InLen], rax ; Update length of data processed +%else + add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ; Update length of data processed +%endif + vmovdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey + vmovdqu xmm8, [%%GDATA_CTX + AadHash] + + + PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC + + + mov r13, %%PLAIN_CYPH_LEN + sub r13, %%DATA_OFFSET + mov r10, r13 ; save the amount of data left to process in r10 + and r13, -16 ; r13 = r13 - (r13 mod 16) + + mov r12, r13 + shr r12, 4 + and r12, 7 + + jz %%_initial_num_blocks_is_0 + + cmp r12, 7 + je %%_initial_num_blocks_is_7 + cmp r12, 6 + je %%_initial_num_blocks_is_6 + cmp r12, 5 + je %%_initial_num_blocks_is_5 + cmp r12, 4 + je %%_initial_num_blocks_is_4 + cmp r12, 3 + je %%_initial_num_blocks_is_3 + cmp r12, 2 + je %%_initial_num_blocks_is_2 + + jmp %%_initial_num_blocks_is_1 + +%%_initial_num_blocks_is_7: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*7 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_6: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*6 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_5: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*5 + jmp %%_initial_blocks_encrypted + 
+%%_initial_num_blocks_is_4: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*4 + jmp %%_initial_blocks_encrypted + + +%%_initial_num_blocks_is_3: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*3 + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_2: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*2 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_1: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_0: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + + +%%_initial_blocks_encrypted: + cmp r13, 0 + je %%_zero_cipher_left + + sub r13, 128 + je %%_eight_cipher_left + + + + + vmovd r15d, xmm9 + and r15d, 255 + vpshufb xmm9, [SHUF_MASK] + + +%%_encrypt_by_8_new: + cmp r15d, 255-8 + jg %%_encrypt_by_8 + + + + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + vpshufb xmm9, [SHUF_MASK] + 
jmp %%_eight_cipher_left + +%%_encrypt_by_8: + vpshufb xmm9, [SHUF_MASK] + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC + vpshufb xmm9, [SHUF_MASK] + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + vpshufb xmm9, [SHUF_MASK] + + + + +%%_eight_cipher_left: + GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 + + +%%_zero_cipher_left: + vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; ctx_data.aad hash = xmm14 + vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; ctx_data.current_counter = xmm9 + + mov r13, r10 + and r13, 15 ; r13 = r10 mod 16 (byte count of the trailing partial block) + + je %%_multiple_of_16_bytes + + mov [%%GDATA_CTX + PBlockLen], r13 ; ctx_data.partial_block_length = r13 + ; handle the last <16 Byte block separately + + vpaddd xmm9, [ONE] ; INCR CNT to get Yn + vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9 + vpshufb xmm9, [SHUF_MASK] + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Yn) + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; ctx_data.partial_block_enc_key = xmm9 + + cmp %%PLAIN_CYPH_LEN, 16 + jge %%_large_enough_update + + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax + lea r12, [SHIFT_MASK + 16] + sub r12, r13 + jmp %%_data_read + +%%_large_enough_update: + sub %%DATA_OFFSET, 16 + add %%DATA_OFFSET, r13 + + vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block + + sub %%DATA_OFFSET, r13 + add %%DATA_OFFSET, 16 + + + lea r12, [SHIFT_MASK + 16] + sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) + + vmovdqu xmm2, [r12] ; get the appropriate shuffle mask + vpshufb xmm1, xmm2 ; shift right 16-r13 bytes +%%_data_read: +%ifidn %%ENC_DEC, DEC + 
vmovdqa xmm2, xmm1 + vpxor xmm9, xmm1 ; Ciphertext XOR E(K, Yn) + vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + vpand xmm2, xmm1 + vpshufb xmm2, [SHUF_MASK] + vpxor xmm14, xmm2 + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + +%else + vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + vpshufb xmm9, [SHUF_MASK] + vpxor xmm14, xmm9 + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + + vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output r13 Bytes + vmovq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + vpsrldq xmm9, xmm9, 8 + vmovq rax, xmm9 + sub r13, 8 + +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_multiple_of_16_bytes: + + + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes. +; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX) and +; whether encoding or decoding (ENC_DEC). 
+; Output: Authentication Tag stored at AUTH_TAG; AUTH_TAG_LEN is an input giving the number of tag bytes to store +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm13, xmm14, xmm15 (xmm10 is also passed to GHASH_MUL, presumably as scratch - TODO confirm) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%PLAIN_CYPH_LEN rax + + mov r12, [%%GDATA_CTX + PBlockLen] + vmovdqu xmm14, [%%GDATA_CTX + AadHash] + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + vmovd xmm15, r12d ; len(A) in xmm15 (only the low 32 bits of the bit count are moved) + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8) + vmovq xmm1, %%PLAIN_CYPH_LEN + vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C) + + vpxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation + vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap + + vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) + + vpxor xmm9, xmm14 + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + + cmp r11, 8 + je %%_T_8 + + simd_store_avx r10, xmm9, r11, r12, rax + jmp %%_return_T_done +%%_T_8: + vmovq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + vmovq rax, xmm9 + mov [r10], rax + vpsrldq xmm9, xmm9, 8 + vmovd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done +%%_T_16: + vmovdqu [r10], xmm9 + +%%_return_T_done: + +%ifdef SAFE_DATA + ;; Clear sensitive data from context 
structure + vpxor xmm0, xmm0 + vmovdqu [%%GDATA_CTX + AadHash], xmm0 + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0 +%endif +%endmacro ; GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_avx_gen2 +; (struct gcm_key_data *key_data); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(precomp,_),function,) +FN_NAME(precomp,_): + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_precomp +%endif + + push r12 + push r13 + push r14 + push r15 + + mov r14, rsp + + + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 ; align rsp to 64 bytes + +%ifidn __OUTPUT_FORMAT__, win64 + ; only xmm6 needs to be maintained + vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 +%endif + + vpxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey + + vpshufb xmm6, [SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, 1 + vpsrlq xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [TWOONE] + vpand xmm2, [POLY] + vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] +%endif + mov rsp, r14 + + pop r15 + pop r14 + pop r13 + pop r12 + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_xmms_avx_asm +%endif +exit_precomp: + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(init,_),function,) +FN_NAME(init,_): + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + push r14 + push r15 + mov r14, rsp + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 1*16 + movdqu [rsp + 0*16], xmm6 +%endif + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_init + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_init + + ;; Check IV != NULL + cmp arg3, 0 + jz exit_init + + ;; Check if aad_len == 0 + cmp arg5, 0 + jz skip_aad_check_init + + ;; Check aad != NULL (aad_len != 0) + cmp arg4, 0 + jz exit_init + +skip_aad_check_init: +%endif + GCM_INIT arg1, arg2, arg3, arg4, arg5 + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_xmms_avx_asm +%endif +exit_init: + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm6 , [rsp + 0*16] + mov rsp, r14 + pop r15 + pop r14 +%endif + pop r13 + pop r12 + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_update_),function,) +FN_NAME(enc,_update_): + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_update_enc + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_update_enc + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_update_enc + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_update_enc + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_update_enc + +skip_in_out_check_update_enc: +%endif + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + +exit_update_enc: + FUNC_RESTORE + + ret + + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_update_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_update_),function,) +FN_NAME(dec,_update_): + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_update_dec + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_update_dec + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_update_dec + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_update_dec + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_update_dec + +skip_in_out_check_update_dec: +%endif + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + +exit_update_dec: + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_finalize_),function,) +FN_NAME(enc,_finalize_): + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_enc_fin + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_enc_fin + + ;; Check auth_tag != NULL + cmp arg3, 0 + jz exit_enc_fin + + ;; Check auth_tag_len == 0 or > 16 + cmp arg4, 0 + jz exit_enc_fin + + cmp arg4, 16 + ja exit_enc_fin +%endif + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows. GCM_COMPLETE loads HashKey + ; into xmm13 and passes xmm10 to GHASH_MUL, so both are saved in addition + ; to xmm6, xmm9, xmm11, xmm14 and xmm15. + sub rsp, 7*16 + vmovdqu [rsp + 0*16],xmm6 + vmovdqu [rsp + 1*16],xmm9 + vmovdqu [rsp + 2*16],xmm10 + vmovdqu [rsp + 3*16],xmm11 + vmovdqu [rsp + 4*16],xmm13 + vmovdqu [rsp + 5*16],xmm14 + vmovdqu [rsp + 6*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15 , [rsp + 6*16] + vmovdqu xmm14 , [rsp + 5*16] + vmovdqu xmm13 , [rsp + 4*16] + vmovdqu xmm11 , [rsp + 3*16] + vmovdqu xmm10 , [rsp + 2*16] + vmovdqu xmm9 , [rsp + 1*16] + vmovdqu xmm6 , [rsp + 0*16] + add rsp, 7*16 +%endif + + pop r12 + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_xmms_avx_asm +%endif +exit_enc_fin: + ret + + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_finalize_),function,) +FN_NAME(dec,_finalize_): + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_dec_fin + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_dec_fin + + ;; Check auth_tag != NULL + cmp arg3, 0 + jz exit_dec_fin + + ;; Check auth_tag_len == 0 or > 16 + cmp arg4, 0 + jz exit_dec_fin + + cmp arg4, 16 + ja exit_dec_fin +%endif + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows. GCM_COMPLETE loads HashKey + ; into xmm13 and passes xmm10 to GHASH_MUL, so both are saved in addition + ; to xmm6, xmm9, xmm11, xmm14 and xmm15. + sub rsp, 7*16 + vmovdqu [rsp + 0*16],xmm6 + vmovdqu [rsp + 1*16],xmm9 + vmovdqu [rsp + 2*16],xmm10 + vmovdqu [rsp + 3*16],xmm11 + vmovdqu [rsp + 4*16],xmm13 + vmovdqu [rsp + 5*16],xmm14 + vmovdqu [rsp + 6*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15 , [rsp + 6*16] + vmovdqu xmm14 , [rsp + 5*16] + vmovdqu xmm13 , [rsp + 4*16] + vmovdqu xmm11 , [rsp + 3*16] + vmovdqu xmm10 , [rsp + 2*16] + vmovdqu xmm9 , [rsp + 1*16] + vmovdqu xmm6 , [rsp + 0*16] + add rsp, 7*16 +%endif + + pop r12 + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_xmms_avx_asm +%endif +exit_dec_fin: + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data 
*context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_),function,) +FN_NAME(enc,_): + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_enc + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_enc + + ;; Check IV != NULL + cmp arg6, 0 + jz exit_enc + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz exit_enc + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz exit_enc + + cmp arg10, 16 + ja exit_enc + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_enc + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_enc + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_enc + +skip_in_out_check_enc: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_enc + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz exit_enc + +skip_aad_check_enc: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC + +exit_enc: + FUNC_RESTORE + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_),function,) +FN_NAME(dec,_): + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_dec + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_dec + + ;; Check IV != NULL + cmp arg6, 0 + jz exit_dec + + ;; Check auth_tag != NULL + cmp 
arg9, 0 + jz exit_dec + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz exit_dec + + cmp arg10, 16 + ja exit_dec + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_dec + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_dec + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_dec + +skip_in_out_check_dec: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_dec + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz exit_dec + +skip_aad_check_dec: +%endif + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC + +exit_dec: + FUNC_RESTORE + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/kasumi_avx.c b/src/spdk/intel-ipsec-mb/avx/kasumi_avx.c new file mode 100644 index 000000000..4739191ac --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/kasumi_avx.c @@ -0,0 +1,386 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include <limits.h> + +#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx + +#include "include/save_xmms.h" +#include "include/kasumi_internal.h" +#include "include/save_xmms.h" +#include "include/clear_regs_mem.h" + +#define SAVE_XMMS save_xmms_avx +#define RESTORE_XMMS restore_xmms_avx + +void +kasumi_f8_1_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pBufferIn, void *pBufferOut, + const uint32_t cipherLengthInBytes) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL || pBufferIn == NULL || pBufferOut == NULL) + return; + + /* Check input data is in range of supported length */ + if (cipherLengthInBytes == 0 || + cipherLengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT)) + return; +#endif + kasumi_f8_1_buffer(pCtx, IV, pBufferIn, pBufferOut, + cipherLengthInBytes); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f8_1_buffer_bit_avx(const 
kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL || pBufferIn == NULL || pBufferOut == NULL) + return; + + /* Check input data is in range of supported length */ + if (cipherLengthInBits == 0 || + cipherLengthInBits > KASUMI_MAX_LEN) + return; +#endif + kasumi_f8_1_buffer_bit(pCtx, IV, pBufferIn, pBufferOut, + cipherLengthInBits, offsetInBits); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f8_2_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV1, + const uint64_t IV2, const void *pBufferIn1, + void *pBufferOut1, const uint32_t lengthInBytes1, + const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL) + return; + + if (pBufferIn1 == NULL || pBufferOut1 == NULL) + return; + + if (pBufferIn2 == NULL || pBufferOut2 == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBytes1 == 0 || lengthInBytes1 > (KASUMI_MAX_LEN / CHAR_BIT)) + return; + + if (lengthInBytes2 == 0 || lengthInBytes2 > (KASUMI_MAX_LEN / CHAR_BIT)) + return; +#endif + kasumi_f8_2_buffer(pCtx, IV1, IV2, + pBufferIn1, pBufferOut1, lengthInBytes1, + pBufferIn2, pBufferOut2, lengthInBytes2); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f8_3_buffer_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV1, + const uint64_t IV2, 
const uint64_t IV3, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL) + return; + + if (pBufferIn1 == NULL || pBufferOut1 == NULL) + return; + + if (pBufferIn2 == NULL || pBufferOut2 == NULL) + return; + + if (pBufferIn3 == NULL || pBufferOut3 == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT)) + return; +#endif + kasumi_f8_3_buffer(pCtx, IV1, IV2, IV3, + pBufferIn1, pBufferOut1, + pBufferIn2, pBufferOut2, + pBufferIn3, pBufferOut3, lengthInBytes); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f8_4_buffer_avx(const kasumi_key_sched_t *pCtx, + const uint64_t IV1, const uint64_t IV2, + const uint64_t IV3, const uint64_t IV4, + const void *pBufferIn1, void *pBufferOut1, + const void *pBufferIn2, void *pBufferOut2, + const void *pBufferIn3, void *pBufferOut3, + const void *pBufferIn4, void *pBufferOut4, + const uint32_t lengthInBytes) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL) + return; + + if (pBufferIn1 == NULL || pBufferOut1 == NULL) + return; + + if (pBufferIn2 == NULL || pBufferOut2 == NULL) + return; + + if (pBufferIn3 == NULL || pBufferOut3 == NULL) + return; + + if (pBufferIn4 == NULL || pBufferOut4 == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT)) + return; +#endif + kasumi_f8_4_buffer(pCtx, IV1, IV2, IV3, IV4, + 
pBufferIn1, pBufferOut1, + pBufferIn2, pBufferOut2, + pBufferIn3, pBufferOut3, + pBufferIn4, pBufferOut4, + lengthInBytes); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f8_n_buffer_avx(const kasumi_key_sched_t *pKeySchedule, + const uint64_t IV[], + const void * const pDataIn[], void *pDataOut[], + const uint32_t dataLen[], const uint32_t dataCount) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif + uint32_t numLeft = dataCount; + const uint64_t *IVPtr; + const void * const *pDataInPtr; + void **pDataOutPtr; + const uint32_t *dataLenPtr; + uint32_t i = 0; + uint32_t numBuffs; + +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pKeySchedule == NULL || pDataIn == NULL || pDataOut == NULL || + dataLen == NULL || IV == NULL) + return; + + for (i = 0; i < dataCount; i++) { + /* Check for NULL pointers */ + if (pDataIn[i] == NULL || pDataOut[i] == NULL) + return; + + /* Check input data is in range of supported length */ + if (dataLen[i] == 0 || dataLen[i] > (KASUMI_MAX_LEN / CHAR_BIT)) + return; + } +#endif + + i = 0; + + /* KASUMI F8 n buffer function can handle up to 16 buffers */ + while (numLeft > 0) { + IVPtr = &IV[i]; + pDataInPtr = &pDataIn[i]; + pDataOutPtr = &pDataOut[i]; + dataLenPtr = &dataLen[i]; + numBuffs = (numLeft > 16) ? 
16 : numLeft; + + kasumi_f8_n_buffer(pKeySchedule, IVPtr, pDataInPtr, pDataOutPtr, + dataLenPtr, numBuffs); + i += numBuffs; + numLeft -= numBuffs; + } +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + + +void +kasumi_f9_1_buffer_avx(const kasumi_key_sched_t *pCtx, const void *pBufferIn, + const uint32_t lengthInBytes, void *pDigest) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL || pBufferIn == NULL || pDigest == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBytes == 0 || lengthInBytes > (KASUMI_MAX_LEN / CHAR_BIT)) + return; +#endif + kasumi_f9_1_buffer(pCtx, pBufferIn, lengthInBytes, pDigest); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void +kasumi_f9_1_buffer_user_avx(const kasumi_key_sched_t *pCtx, const uint64_t IV, + const void *pBufferIn, const uint32_t lengthInBits, + void *pDigest, const uint32_t direction) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pCtx == NULL || pBufferIn == NULL || pDigest == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBits == 0 || lengthInBits > KASUMI_MAX_LEN) + return; +#endif + kasumi_f9_1_buffer_user(pCtx, IV, pBufferIn, lengthInBits, + pDigest, direction); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +int +kasumi_init_f8_key_sched_avx(const void *const pKey, + kasumi_key_sched_t *pCtx) +{ + return kasumi_init_f8_key_sched(pKey, pCtx); 
+} + +int +kasumi_init_f9_key_sched_avx(const void *const pKey, + kasumi_key_sched_t *pCtx) +{ + return kasumi_init_f9_key_sched(pKey, pCtx); +} + +size_t +kasumi_key_sched_size_avx(void) +{ + return kasumi_key_sched_size(); +} diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm new file mode 100644 index 000000000..3e3de0492 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; This file only defines two symbols and then includes the shared flush
+;; manager source; it contains no code of its own.
+;; NOTE(review): presumably avx/mb_mgr_aes_flush_avx.asm uses AES_CBC_ENC_X8
+;; as the cipher routine and FLUSH_JOB_AES_ENC as the exported entry-point
+;; name - confirm in that file.
+%define AES_CBC_ENC_X8 aes_cbc_enc_192_x8
+%define FLUSH_JOB_AES_ENC flush_job_aes192_enc_avx
+%include "avx/mb_mgr_aes_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm
new file mode 100644
index 000000000..57fae603c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; This file only defines two symbols and then includes the shared submit
+;; manager source; it contains no code of its own.
+;; NOTE(review): presumably avx/mb_mgr_aes_submit_avx.asm uses AES_CBC_ENC_X8
+;; as the cipher routine and SUBMIT_JOB_AES_ENC as the exported entry-point
+;; name - confirm in that file.
+%define AES_CBC_ENC_X8 aes_cbc_enc_192_x8
+%define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_avx
+%include "avx/mb_mgr_aes_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm
new file mode 100644
index 000000000..04c4824d7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; This file only defines two symbols and then includes the shared flush
+;; manager source; it contains no code of its own.
+;; NOTE(review): presumably avx/mb_mgr_aes_flush_avx.asm uses AES_CBC_ENC_X8
+;; as the cipher routine and FLUSH_JOB_AES_ENC as the exported entry-point
+;; name - confirm in that file.
+%define AES_CBC_ENC_X8 aes_cbc_enc_256_x8
+%define FLUSH_JOB_AES_ENC flush_job_aes256_enc_avx
+%include "avx/mb_mgr_aes_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm
new file mode 100644
index 000000000..ee1de7165
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm
@@ -0,0 +1,30 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; This file only defines two symbols and then includes the shared submit
+;; manager source; it contains no code of its own.
+;; NOTE(review): presumably avx/mb_mgr_aes_submit_avx.asm uses AES_CBC_ENC_X8
+;; as the cipher routine and SUBMIT_JOB_AES_ENC as the exported entry-point
+;; name - confirm in that file.
+%define AES_CBC_ENC_X8 aes_cbc_enc_256_x8
+%define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_avx
+%include "avx/mb_mgr_aes_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm
new file mode 100644
index 000000000..9d132ec5f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_ccm_auth_submit_flush_avx.asm
@@ -0,0 +1,537 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+%include "include/memcpy.asm"
+
+;; The three names below are only defined here when AES128_CBC_MAC has not
+;; been defined already, so a file that includes this one can override them.
+%ifndef AES128_CBC_MAC
+
+%define AES128_CBC_MAC aes128_cbc_mac_x8
+%define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_avx
+%define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_avx
+
+%endif
+
+extern AES128_CBC_MAC
+
+section .data
+default rel
+
+align 16
+len_mask:
+        dq 0xFFFFFFFFFFFFFFF0
+align 16
+;; len_masks[I] has 0xFFFF in 16-bit word I and zero elsewhere. The FLUSH path
+;; ORs entry I into the lane-length vector for every lane that has no job, so
+;; that lane holds the maximum unsigned value when the minimum is searched.
+len_masks:
+        dq 0x000000000000FFFF, 0x0000000000000000
+        dq 0x00000000FFFF0000, 0x0000000000000000
+        dq 0x0000FFFF00000000, 0x0000000000000000
+        dq 0xFFFF000000000000, 0x0000000000000000
+        dq 0x0000000000000000, 0x000000000000FFFF
+        dq 0x0000000000000000, 0x00000000FFFF0000
+        dq 0x0000000000000000, 0x0000FFFF00000000
+        dq 0x0000000000000000, 0xFFFF000000000000
+;; vpshufb control that copies bytes 0-1 into every 16-bit word
+dupw:
+        dq 0x0100010001000100, 0x0100010001000100
+;; AND mask applied to the initial block to form counter block 0:
+;; keeps the low 3 bits of byte 0, keeps bytes 1-13, clears bytes 14-15
+counter_mask:
+        dq 0xFFFFFFFFFFFFFF07, 0x0000FFFFFFFFFFFF
+;; qword constants 1..7, used as memory operands of cmovne in the FLUSH path
+one:    dq 1
+two:    dq 2
+three:  dq 3
+four:   dq 4
+five:   dq 5
+six:    dq 6
+seven:  dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%define NROUNDS 9 ; AES-CCM-128
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+;; NOTE: several of the names below alias the same physical register
+;; (job/len2 -> arg2, job_rax/tmp4/auth_len_aad -> rax, min_idx/flags -> rbp,
+;; iv_len/auth_len -> r9, unused_lanes/r -> rbx, good_lane/min_job -> r15).
+;; Writing one alias destroys the others.
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+%define tmp4 rax
+%define auth_len_aad rax
+
+%define min_idx rbp
+%define flags rbp
+
+%define lane r8
+
+%define iv_len r9
+%define auth_len r9
+
+%define aad_len r10
+%define init_block_addr r11
+
+%define unused_lanes rbx
+%define r rbx
+
+%define tmp r12
+%define tmp2 r13
+%define tmp3 r14
+
+%define good_lane r15
+%define min_job r15
+
+%define init_block0 xmm0
+%define ccm_lens xmm1
+%define min_len_idx xmm2
+%define xtmp0 xmm3
+%define xtmp1 xmm4
+%define xtmp2 xmm5
+%define xtmp3 xmm6
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobbers all GPRs
+;; Stack frame layout: 8 qword slots for saved GPRs plus the caller's rsp
+struc STACK
+_gpr_save:      resq 8
+_rsp_save:      resq 1
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; Encrypt one 16-byte block in place.
+;;; GDATA [in]     - register pointing at the round keys, 16 bytes apart
+;;; XMM0  [in/out] - block to encrypt
+;;; XORs round key 0, applies NROUNDS vaesenc rounds with keys 1..NROUNDS and
+;;; finishes with vaesenclast using key NROUNDS+1.
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+        vpxor           %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+        vaesenc         %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+        vaesenclast     %%XMM0, [%%GDATA+16*i]
+%endmacro
+
+;;; ===========================================================================
+;;; AES CCM auth job submit & flush
+;;; ===========================================================================
+;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection
+;;;
+;;; Outline (both variants share everything from %%_ccm_round onwards):
+;;;  SUBMIT: take a lane from unused_lanes, build Block 0 (and the padded AAD
+;;;          blocks) in the lane's init-block area, record its length, and
+;;;          return NULL unless the low byte of unused_lanes is 0xf.
+;;;  FLUSH:  return NULL if bit 35 of unused_lanes is set; otherwise copy a
+;;;          valid lane's arguments into all empty lanes and force their
+;;;          lengths to 0xFFFF.
+;;;  Common: run AES128_CBC_MAC for the minimum length, then advance the
+;;;          minimum lane's state using its init_done value
+;;;          (0 -> message full blocks, 1 -> partial block, otherwise finish),
+;;;          looping until one job's tag is written and that job is returned
+;;;          in rax.
+%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX 1
+%define %%SUBMIT_FLUSH %1
+
+        ;; Carve a 16-byte aligned frame; rax keeps the original rsp
+        mov     rax, rsp
+        sub     rsp, STACK_size
+        and     rsp, -16
+
+        mov     [rsp + _gpr_save + 8*0], rbx
+        mov     [rsp + _gpr_save + 8*1], rbp
+        mov     [rsp + _gpr_save + 8*2], r12
+        mov     [rsp + _gpr_save + 8*3], r13
+        mov     [rsp + _gpr_save + 8*4], r14
+        mov     [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+        mov     [rsp + _gpr_save + 8*6], rsi
+        mov     [rsp + _gpr_save + 8*7], rdi
+%endif
+        mov     [rsp + _rsp_save], rax  ; original SP
+
+        ;; Find free lane
+        mov     unused_lanes, [state + _aes_ccm_unused_lanes]
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+
+        ;; The low nibble of unused_lanes is the lane to use; shift it out
+        mov     lane, unused_lanes
+        and     lane, 15
+        shr     unused_lanes, 4
+        mov     [state + _aes_ccm_unused_lanes], unused_lanes
+
+        ;; Copy job info into lane
+        mov     [state + _aes_ccm_job_in_lane + lane*8], job
+        ;; Copy keys into lane args
+        mov     tmp, [job + _aes_enc_key_expanded]
+        mov     [state + _aes_ccm_args_keys + lane*8], tmp
+        ;; init_done = 0
+        mov     word [state + _aes_ccm_init_done + lane*2], 0
+        lea     tmp, [lane * 8]
+
+        ;; Zero the lane's 16-byte IV slot (tmp*2 == lane*16)
+        vpxor   init_block0, init_block0
+        vmovdqa [state + _aes_ccm_args_IV + tmp*2], init_block0
+
+        ;; Prepare initial Block 0 for CBC-MAC-128
+
+        ;; Byte 0: flags with L' and M' (AAD later)
+        ;; Calculate L' = 15 - IV length - 1 = 14 - IV length
+        mov     flags, 14
+        mov     iv_len, [job + _iv_len_in_bytes]
+        sub     flags, iv_len
+        ;; Calculate M' = (Digest length - 2) / 2
+        mov     tmp, [job + _auth_tag_output_len_in_bytes]
+        sub     tmp, 2
+
+        shl     tmp, 2 ; M' << 3 (combine 1xshr, to div by 2, and 3xshl)
+        or      flags, tmp
+
+        ;; Bytes 1 - 13: Nonce (7 - 13 bytes long)
+
+        ;; Bytes 1 - 7 are always copied (first 7 bytes)
+        mov     tmp, [job + _iv]
+        vpinsrb init_block0, [tmp], 1
+        vpinsrw init_block0, [tmp + 1], 1
+        vpinsrd init_block0, [tmp + 3], 1
+
+        cmp     iv_len, 7
+        je      %%_finish_nonce_move
+
+        cmp     iv_len, 8
+        je      %%_iv_length_8
+        cmp     iv_len, 9
+        je      %%_iv_length_9
+        cmp     iv_len, 10
+        je      %%_iv_length_10
+        cmp     iv_len, 11
+        je      %%_iv_length_11
+        cmp     iv_len, 12
+        je      %%_iv_length_12
+
+        ;; Bytes 8 - 13
+        ;; The labels below fall through on purpose: lengths 13/12/11 end with
+        ;; one dword insert covering bytes 8-11, lengths 10/9/8 insert single
+        ;; bytes.
+%%_iv_length_13:
+        vpinsrb init_block0, [tmp + 12], 13
+%%_iv_length_12:
+        vpinsrb init_block0, [tmp + 11], 12
+%%_iv_length_11:
+        vpinsrd init_block0, [tmp + 7], 2
+        jmp     %%_finish_nonce_move
+%%_iv_length_10:
+        vpinsrb init_block0, [tmp + 9], 10
+%%_iv_length_9:
+        vpinsrb init_block0, [tmp + 8], 9
+%%_iv_length_8:
+        vpinsrb init_block0, [tmp + 7], 8
+
+%%_finish_nonce_move:
+
+        ;; Bytes 14 & 15 (message length), in Big Endian
+        mov     ax, [job + _msg_len_to_hash_in_bytes]
+        xchg    al, ah
+        vpinsrw init_block0, ax, 7
+
+        mov     aad_len, [job + _cbcmac_aad_len]
+        ;; Initial length to authenticate (Block 0)
+        mov     auth_len, 16
+        ;; Length to authenticate (Block 0 + len(AAD) (2B) + AAD padded,
+        ;; so length is multiple of 16B)
+        lea     auth_len_aad, [aad_len + (2 + 15) + 16]
+        and     auth_len_aad, -16
+
+        ;; Use the AAD-inclusive length only when aad_len != 0
+        or      aad_len, aad_len
+        cmovne  auth_len, auth_len_aad
+        ;; Update lengths to authenticate and find min length
+        vmovdqa ccm_lens, [state + _aes_ccm_lens]
+        XVPINSRW ccm_lens, xtmp0, tmp2, lane, auth_len, scale_x16
+        vmovdqa [state + _aes_ccm_lens], ccm_lens
+        vphminposuw min_len_idx, ccm_lens
+
+        ;; Each lane owns 64 bytes of init-block storage (lane << 6)
+        mov     tmp, lane
+        shl     tmp, 6
+        lea     init_block_addr, [state + _aes_ccm_init_blocks + tmp]
+        or      aad_len, aad_len
+        je      %%_aad_complete
+
+        or      flags, (1 << 6) ; Set Adata bit in flags
+
+        ;; Copy AAD
+        ;; Set all 0s in last block (padding)
+        lea     tmp, [init_block_addr + auth_len]
+        sub     tmp, 16
+        vpxor   xtmp0, xtmp0
+        vmovdqa [tmp], xtmp0
+
+        ;; Start copying from second block
+        ;; (2-byte big-endian AAD length first, then the AAD bytes; rax is
+        ;; reused here, auth_len_aad is no longer needed)
+        lea     tmp, [init_block_addr+16]
+        mov     rax, aad_len
+        xchg    al, ah
+        mov     [tmp], ax
+        add     tmp, 2
+        mov     tmp2, [job + _cbcmac_aad]
+        memcpy_avx_64_1 tmp, tmp2, aad_len, tmp3, tmp4, xtmp0, xtmp1, xtmp2, xtmp3
+
+%%_aad_complete:
+
+        ;; Finish Block 0 with Byte 0
+        vpinsrb init_block0, BYTE(flags), 0
+        vmovdqa [init_block_addr], init_block0
+
+        mov     [state + _aes_ccm_args_in + lane * 8], init_block_addr
+
+        ;; Only proceed to processing when the low byte of unused_lanes is
+        ;; 0xf; otherwise return NULL.
+        ;; NOTE(review): presumably 0xf is the end-of-list marker, i.e. all
+        ;; lanes are now occupied - confirm against the manager init code.
+        cmp     byte [state + _aes_ccm_unused_lanes], 0xf
+        jne     %%_return_null
+
+%else ; end SUBMIT
+
+        ;; Check at least one job
+        bt      unused_lanes, 35
+        jc      %%_return_null
+
+        ;; Find a lane with a non-null job
+        ;; (defaults to lane 0; each later lane with a job overrides the
+        ;; earlier choice, so the highest occupied lane among 1..7 wins)
+        xor     good_lane, good_lane
+        cmp     QWORD [state + _aes_ccm_job_in_lane + 1*8], 0
+        cmovne  good_lane, [rel one]
+        cmp     QWORD [state + _aes_ccm_job_in_lane + 2*8], 0
+        cmovne  good_lane, [rel two]
+        cmp     QWORD [state + _aes_ccm_job_in_lane + 3*8], 0
+        cmovne  good_lane, [rel three]
+        cmp     qword [state + _aes_ccm_job_in_lane + 4*8], 0
+        cmovne  good_lane, [rel four]
+        cmp     qword [state + _aes_ccm_job_in_lane + 5*8], 0
+        cmovne  good_lane, [rel five]
+        cmp     qword [state + _aes_ccm_job_in_lane + 6*8], 0
+        cmovne  good_lane, [rel six]
+        cmp     qword [state + _aes_ccm_job_in_lane + 7*8], 0
+        cmovne  good_lane, [rel seven]
+
+        ; Copy good_lane to empty lanes
+        movzx   tmp, word [state + _aes_ccm_init_done + good_lane*2]
+        mov     tmp2, [state + _aes_ccm_args_in + good_lane*8]
+        mov     tmp3, [state + _aes_ccm_args_keys + good_lane*8]
+        shl     good_lane, 4 ; multiply by 16
+        vmovdqa xtmp0, [state + _aes_ccm_args_IV + good_lane]
+        vmovdqa ccm_lens, [state + _aes_ccm_lens]
+
+        ;; For every lane without a job: set its length word to 0xFFFF and
+        ;; give it the good lane's init_done, input pointer, keys and IV
+%assign I 0
+%rep 8
+        cmp     qword [state + _aes_ccm_job_in_lane + I*8], 0
+        jne     APPEND(skip_,I)
+        vpor    ccm_lens, [rel len_masks + 16*I]
+        mov     [state + _aes_ccm_init_done + I*2], WORD(tmp)
+        mov     [state + _aes_ccm_args_in + I*8], tmp2
+        mov     [state + _aes_ccm_args_keys + I*8], tmp3
+        vmovdqa [state + _aes_ccm_args_IV + I*16], xtmp0
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+        vmovdqa [state + _aes_ccm_lens], ccm_lens
+        ;; Find min length
+        vphminposuw min_len_idx, ccm_lens
+
+%endif ; end FLUSH
+
+%%_ccm_round:
+        vpextrw len2, min_len_idx, 0    ; min value
+        vpextrw min_idx, min_len_idx, 1 ; min index (0...7)
+
+        mov     min_job, [state + _aes_ccm_job_in_lane + min_idx*8]
+
+        ;; Nothing to MAC when the minimum length is already zero
+        or      len2, len2
+        je      %%_len_is_0
+        ;; subtract min length from all lengths
+        vpshufb min_len_idx, min_len_idx, [rel dupw] ; broadcast min length
+        vpsubw  ccm_lens, min_len_idx
+        vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+        ; "state" and "args" are the same address, arg1
+        ; len2 is arg2
+        call    AES128_CBC_MAC
+        ; state and min_idx are intact
+
+%%_len_is_0:
+
+        ;; Dispatch on the lane's init_done value:
+        ;;   0 -> switch the lane to the message's full blocks
+        ;;   1 -> handle the trailing partial block
+        ;;   anything else -> fall through and produce the tag
+        movzx   tmp, WORD [state + _aes_ccm_init_done + min_idx*2]
+        cmp     WORD(tmp), 0
+        je      %%_prepare_full_blocks_to_auth
+        cmp     WORD(tmp), 1
+        je      %%_prepare_partial_block_to_auth
+
+%%_encrypt_digest:
+
+        ;; Set counter block 0 (reusing previous initial block 0)
+        ;; tmp = min_idx*8, so tmp*8 indexes 64-byte init blocks and tmp*2
+        ;; indexes 16-byte IV slots
+        mov     tmp, min_idx
+        shl     tmp, 3
+        vmovdqa init_block0, [state + _aes_ccm_init_blocks + tmp * 8]
+
+        vpand   init_block0, [rel counter_mask]
+
+        ;; Encrypt counter block 0 and XOR it with the value held in the
+        ;; lane's IV slot
+        mov     tmp2, [state + _aes_ccm_args_keys + tmp]
+        ENCRYPT_SINGLE_BLOCK tmp2, init_block0
+        vpxor   init_block0, [state + _aes_ccm_args_IV + tmp * 2]
+
+        ;; Copy Mlen bytes into auth_tag_output (Mlen = 4,6,8,10,12,14,16)
+        mov     min_job, [state + _aes_ccm_job_in_lane + tmp]
+        mov     tmp3, [min_job + _auth_tag_output_len_in_bytes]
+        mov     tmp2, [min_job + _auth_tag_output]
+
+        simd_store_avx tmp2, init_block0, tmp3, tmp, tmp4
+%%_update_lanes:
+        ; Update unused lanes
+        ;; (push the finished lane's index back as the low nibble)
+        mov     unused_lanes, [state + _aes_ccm_unused_lanes]
+        shl     unused_lanes, 4
+        or      unused_lanes, min_idx
+        mov     [state + _aes_ccm_unused_lanes], unused_lanes
+
+        ; Set return job
+        mov     job_rax, min_job
+
+        mov     qword [state + _aes_ccm_job_in_lane + min_idx*8], 0
+        or      dword [job_rax + _status], STS_COMPLETED_HMAC
+
+%ifdef SAFE_DATA
+        vpxor   xtmp0, xtmp0
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+        ;; min_idx becomes min_idx*8 from here on (not used as an index again)
+        shl     min_idx, 3
+        ;; Clear digest (in memory for CBC IV), counter block 0 and AAD of returned job
+        vmovdqa [state + _aes_ccm_args_IV + min_idx * 2], xtmp0
+        vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8], xtmp0
+        vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 16], xtmp0
+        vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 32], xtmp0
+        vmovdqa [state + _aes_ccm_init_blocks + min_idx * 8 + 48], xtmp0
+        mov     qword [state + _aes_ccm_args_keys + min_idx], 0
+%else
+        ;; Clear digest (in memory for CBC IV), counter block 0 and AAD
+        ;; of returned job and "NULL lanes"
+%assign I 0
+%rep 8
+        cmp     qword [state + _aes_ccm_job_in_lane + I*8], 0
+        jne     APPEND(skip_clear_,I)
+        vmovdqa [state + _aes_ccm_args_IV + I*16], xtmp0
+        vmovdqa [state + _aes_ccm_init_blocks + I*64], xtmp0
+        vmovdqa [state + _aes_ccm_init_blocks + I*64 + 16], xtmp0
+        vmovdqa [state + _aes_ccm_init_blocks + I*64 + 32], xtmp0
+        vmovdqa [state + _aes_ccm_init_blocks + I*64 + 48], xtmp0
+        mov     qword [state + _aes_ccm_args_keys + I*8], 0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SUBMIT
+%endif ;; SAFE_DATA
+
+%%_return:
+        ;; Restore saved GPRs and the caller's stack pointer; rax holds the
+        ;; returned job pointer (or 0)
+        mov     rbx, [rsp + _gpr_save + 8*0]
+        mov     rbp, [rsp + _gpr_save + 8*1]
+        mov     r12, [rsp + _gpr_save + 8*2]
+        mov     r13, [rsp + _gpr_save + 8*3]
+        mov     r14, [rsp + _gpr_save + 8*4]
+        mov     r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+        mov     rsi, [rsp + _gpr_save + 8*6]
+        mov     rdi, [rsp + _gpr_save + 8*7]
+%endif
+        mov     rsp, [rsp + _rsp_save]  ; original SP
+        ret
+
+%%_return_null:
+        xor     job_rax, job_rax
+        jmp     %%_return
+
+%%_prepare_full_blocks_to_auth:
+
+        ;; The data to authenticate is read from dst when decrypting and from
+        ;; src + hash offset otherwise
+        cmp     dword [min_job + _cipher_direction], 2 ; DECRYPT
+        je      %%_decrypt
+
+%%_encrypt:
+        mov     tmp, [min_job + _src]
+        add     tmp, [min_job + _hash_start_src_offset_in_bytes]
+        jmp     %%_set_init_done_1
+
+%%_decrypt:
+        mov     tmp, [min_job + _dst]
+
+%%_set_init_done_1:
+        mov     [state + _aes_ccm_args_in + min_idx*8], tmp
+        mov     word [state + _aes_ccm_init_done + min_idx*2], 1
+
+        ; Check if there are full blocks to hash
+        mov     tmp, [min_job + _msg_len_to_hash_in_bytes]
+        and     tmp, -16
+        je      %%_prepare_partial_block_to_auth
+
+        ;; Update lengths to authenticate and find min length
+        vmovdqa ccm_lens, [state + _aes_ccm_lens]
+        XVPINSRW ccm_lens, xtmp0, tmp2, min_idx, tmp, scale_x16
+        vphminposuw min_len_idx, ccm_lens
+        vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+        jmp     %%_ccm_round
+
+%%_prepare_partial_block_to_auth:
+        ; Check if partial block needs to be hashed
+        mov     auth_len, [min_job + _msg_len_to_hash_in_bytes]
+        and     auth_len, 15
+        je      %%_encrypt_digest
+
+        mov     word [state + _aes_ccm_init_done + min_idx * 2], 2
+        ;; Update lengths to authenticate and find min length
+        vmovdqa ccm_lens, [state + _aes_ccm_lens]
+        XVPINSRW ccm_lens, xtmp0, tmp2, min_idx, 16, scale_x16
+        vphminposuw min_len_idx, ccm_lens
+        vmovdqa [state + _aes_ccm_lens], ccm_lens
+
+        ;; Stage the partial block in the lane's second 16-byte init block and
+        ;; point the lane's input at that copy
+        mov     tmp2, min_idx
+        shl     tmp2, 6
+        add     tmp2, 16 ; pb[AES_BLOCK_SIZE]
+        lea     init_block_addr, [state + _aes_ccm_init_blocks + tmp2]
+        mov     tmp2, [state + _aes_ccm_args_in + min_idx * 8]
+
+        simd_load_avx_15_1 xtmp0, tmp2, auth_len
+
+%%_finish_partial_block_copy:
+        vmovdqa [init_block_addr], xtmp0
+        mov     [state + _aes_ccm_args_in + min_idx * 8], init_block_addr
+
+        jmp     %%_ccm_round
+%endmacro
+
+
+align 64
+; JOB_AES_HMAC * submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+; Returns (in rax) a completed job, or NULL when none is returned this call
+MKGLOBAL(SUBMIT_JOB_AES_CCM_AUTH,function,internal)
+SUBMIT_JOB_AES_CCM_AUTH:
+        GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX SUBMIT
+
+; JOB_AES_HMAC * flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state)
+; arg 1 : state
+; Returns (in rax) a completed job, or NULL when bit 35 of unused_lanes is set
+MKGLOBAL(FLUSH_JOB_AES_CCM_AUTH,function,internal)
+FLUSH_JOB_AES_CCM_AUTH:
+        GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX FLUSH
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm
new file mode 100644
index 000000000..e17023004
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_cmac_submit_flush_avx.asm
@@ -0,0 +1,518 @@
+;;
+;; Copyright (c) 2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+;%define DO_DBGPRINT
+%include "include/dbgprint.asm"
+
+%define AES128_CBC_MAC aes128_cbc_mac_x8
+%define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_avx
+%define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_avx
+
+extern AES128_CBC_MAC
+
+section .data
+default rel
+
+align 16
+len_masks: ; 16-byte mask I has word I = 0xFFFF; FLUSH ORs it into the lengths of lanes that hold no job
+        ;ddq 0x0000000000000000000000000000FFFF
+        dq 0x000000000000FFFF, 0x0000000000000000
+        ;ddq 0x000000000000000000000000FFFF0000
+        dq 0x00000000FFFF0000, 0x0000000000000000
+        ;ddq 0x00000000000000000000FFFF00000000
+        dq 0x0000FFFF00000000, 0x0000000000000000
+        ;ddq 0x0000000000000000FFFF000000000000
+        dq 0xFFFF000000000000, 0x0000000000000000
+        ;ddq 0x000000000000FFFF0000000000000000
+        dq 0x0000000000000000, 0x000000000000FFFF
+        ;ddq 0x00000000FFFF00000000000000000000
+        dq 0x0000000000000000, 0x00000000FFFF0000
+        ;ddq 0x0000FFFF000000000000000000000000
+        dq 0x0000000000000000, 0x0000FFFF00000000
+        ;ddq 0xFFFF0000000000000000000000000000
+        dq 0x0000000000000000, 0xFFFF000000000000
+dupw: ; vpshufb control that replicates word 0 into all eight words
+        ;ddq 0x01000100010001000100010001000100
+        dq 0x0100010001000100, 0x0100010001000100
+one: dq 1 ; one..seven: qword constants used as memory operands by the cmovne lane search in FLUSH
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+; idx needs to be in rbp (len, idx and tmp all alias rbp, so only one of them is live at a time)
+%define len rbp
+%define idx rbp
+%define tmp rbp
+
+%define lane r8
+
+%define iv r9
+%define m_last r10
+%define n r11
+
+%define unused_lanes rbx
+%define r rbx
+
+%define tmp3 r12
+%define tmp4 r13
+%define tmp2 r14
+
+%define good_lane r15
+%define rbits r15
+
+; STACK_SPACE needs to be an odd multiple of 8 (STACK below is 9 qwords = 72 bytes)
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; ===========================================================================
+;;; AES CMAC job submit & flush: puts a job in a lane (SUBMIT) or fills job-less lanes (FLUSH), runs AES128_CBC_MAC for the minimum lane length, returns the finished job in rax or NULL
+;;; ===========================================================================
+;;; SUBMIT_FLUSH [in] - SUBMIT, FLUSH job selection (SUBMIT reads state and job; FLUSH reads state only)
+%macro GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_AVX 1
+%define %%SUBMIT_FLUSH %1
+
+        mov rax, rsp
+        sub rsp, STACK_size
+        and rsp, -16 ; 16-byte align the save area
+
+        mov [rsp + _gpr_save + 8*0], rbx
+        mov [rsp + _gpr_save + 8*1], rbp
+        mov [rsp + _gpr_save + 8*2], r12
+        mov [rsp + _gpr_save + 8*3], r13
+        mov [rsp + _gpr_save + 8*4], r14
+        mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+        mov [rsp + _gpr_save + 8*6], rsi
+        mov [rsp + _gpr_save + 8*7], rdi
+%endif
+        mov [rsp + _rsp_save], rax ; original SP
+
+        ;; Find free lane
+        mov unused_lanes, [state + _aes_cmac_unused_lanes]
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+
+        mov lane, unused_lanes
+        and lane, 0xF ; lowest nibble of the list = lane index to use
+        shr unused_lanes, 4 ; remove that nibble from the list
+        mov [state + _aes_cmac_unused_lanes], unused_lanes
+
+        ;; Copy job info into lane
+        mov [state + _aes_cmac_job_in_lane + lane*8], job
+        ;; Copy keys into lane args
+        mov tmp, [job + _key_expanded]
+        mov [state + _aes_cmac_args_keys + lane*8], tmp
+        mov tmp, lane
+        shl tmp, 4 ; lane*16
+
+        ;; Zero IV to store digest
+        vpxor xmm0, xmm0
+        vmovdqa [state + _aes_cmac_args_IV + tmp], xmm0
+
+        lea m_last, [state + _aes_cmac_scratch + tmp]
+
+        ;; calculate len
+        ;; convert bits to bytes (message length in bits for CMAC)
+        mov len, [job + _msg_len_to_hash_in_bits]
+        mov rbits, len
+        add len, 7 ; inc len if there are remainder bits
+        shr len, 3
+        and rbits, 7
+
+        ;; Check number of blocks and for partial block
+        mov r, len ; set remainder
+        and r, 0xf
+
+        lea n, [len + 0xf] ; set num blocks
+        shr n, 4
+
+        jz %%_lt_one_block ; check one or more blocks? (n == 0 only when the byte length is 0)
+
+        ;; One or more blocks, potentially partial
+        mov word [state + _aes_cmac_init_done + lane*2], 0
+
+        mov tmp2, [job + _src]
+        add tmp2, [job + _hash_start_src_offset_in_bytes]
+        mov [state + _aes_cmac_args_in + lane*8], tmp2
+
+        ;; len = (n-1)*16
+        lea tmp2, [n - 1]
+        shl tmp2, 4
+        vmovdqa xmm0, [state + _aes_cmac_lens]
+        XVPINSRW xmm0, xmm1, tmp, lane, tmp2, scale_x16
+        vmovdqa [state + _aes_cmac_lens], xmm0
+
+        ;; check remainder bits
+        or rbits, rbits
+        jnz %%_not_complete_block_3gpp
+
+        ;; check if complete block
+        or r, r
+        jz %%_complete_block
+
+%%_not_complete_block:
+        ;; M_last = padding(M_n) XOR K2
+        lea tmp, [rel padding_0x80_tab16 + 16]
+        sub tmp, r
+        vmovdqu xmm0, [tmp]
+        vmovdqa [m_last], xmm0
+
+        mov tmp, [job + _src]
+        add tmp, [job + _hash_start_src_offset_in_bytes]
+        lea tmp3, [n - 1]
+        shl tmp3, 4
+        add tmp, tmp3
+
+        memcpy_avx_16 m_last, tmp, r, tmp4, iv
+
+        ;; src + n + r
+        mov tmp3, [job + _skey2]
+        vmovdqa xmm1, [m_last]
+        vmovdqu xmm0, [tmp3]
+        vpxor xmm0, xmm1
+        vmovdqa [m_last], xmm0
+
+%%_step_5:
+        ;; Find min length
+        vmovdqa xmm0, [state + _aes_cmac_lens]
+        vphminposuw xmm1, xmm0
+
+        cmp byte [state + _aes_cmac_unused_lanes], 0xf ; presumably 0xF is the end-of-list marker, i.e. no free lane is left - TODO confirm against the state initializer
+        jne %%_return_null
+
+%else ; end SUBMIT
+
+        ;; Check at least one job
+        bt unused_lanes, 35
+        jc %%_return_null
+
+        ;; Find a lane with a non-null job
+        xor good_lane, good_lane
+        cmp qword [state + _aes_cmac_job_in_lane + 1*8], 0
+        cmovne good_lane, [rel one]
+        cmp qword [state + _aes_cmac_job_in_lane + 2*8], 0
+        cmovne good_lane, [rel two]
+        cmp qword [state + _aes_cmac_job_in_lane + 3*8], 0
+        cmovne good_lane, [rel three]
+        cmp qword [state + _aes_cmac_job_in_lane + 4*8], 0
+        cmovne good_lane, [rel four]
+        cmp qword [state + _aes_cmac_job_in_lane + 5*8], 0
+        cmovne good_lane, [rel five]
+        cmp qword [state + _aes_cmac_job_in_lane + 6*8], 0
+        cmovne good_lane, [rel six]
+        cmp qword [state + _aes_cmac_job_in_lane + 7*8], 0
+        cmovne good_lane, [rel seven]
+
+        ; Copy good_lane to empty lanes
+        mov tmp2, [state + _aes_cmac_args_in + good_lane*8]
+        mov tmp3, [state + _aes_cmac_args_keys + good_lane*8]
+        shl good_lane, 4 ; multiply by 16
+        vmovdqa xmm2, [state + _aes_cmac_args_IV + good_lane]
+        vmovdqa xmm0, [state + _aes_cmac_lens]
+
+%assign I 0
+%rep 8
+        cmp qword [state + _aes_cmac_job_in_lane + I*8], 0
+        jne APPEND(skip_,I)
+        mov [state + _aes_cmac_args_in + I*8], tmp2
+        mov [state + _aes_cmac_args_keys + I*8], tmp3
+        vmovdqa [state + _aes_cmac_args_IV + I*16], xmm2
+        vpor xmm0, [rel len_masks + 16*I] ; length word I becomes 0xFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+        ;; Find min length
+        vphminposuw xmm1, xmm0
+
+%endif ; end FLUSH
+
+%%_cmac_round:
+        vpextrw DWORD(len2), xmm1, 0 ; min value
+        vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+        cmp len2, 0
+        je %%_len_is_0
+        vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+        vpsubw xmm0, xmm1
+        vmovdqa [state + _aes_cmac_lens], xmm0
+
+        ; "state" and "args" are the same address, arg1
+        ; len2 is arg2
+        call AES128_CBC_MAC
+        ; state and idx are intact
+
+        vmovdqa xmm0, [state + _aes_cmac_lens] ; preload lens
+%%_len_is_0:
+        ; Check if job complete (init_done != 0 means the M_last block was already queued)
+        test word [state + _aes_cmac_init_done + idx*2], 0xffff
+        jnz %%_copy_complete_digest
+
+        ; Finish step 6: queue the 16-byte M_last block kept in this lane's scratch area
+        mov word [state + _aes_cmac_init_done + idx*2], 1
+
+        XVPINSRW xmm0, xmm1, tmp3, idx, 16, scale_x16
+        vmovdqa [state + _aes_cmac_lens], xmm0
+
+        vphminposuw xmm1, xmm0 ; find min length
+
+        mov tmp3, idx
+        shl tmp3, 4 ; idx*16
+        lea m_last, [state + _aes_cmac_scratch + tmp3]
+        mov [state + _aes_cmac_args_in + idx*8], m_last
+
+        jmp %%_cmac_round
+
+%%_copy_complete_digest:
+        ; Job complete, copy digest to AT output
+        mov job_rax, [state + _aes_cmac_job_in_lane + idx*8]
+
+        mov tmp4, idx
+        shl tmp4, 4
+        lea tmp3, [state + _aes_cmac_args_IV + tmp4]
+        mov tmp4, [job_rax + _auth_tag_output_len_in_bytes]
+        mov tmp2, [job_rax + _auth_tag_output]
+
+        cmp tmp4, 16
+        jne %%_ne_16_copy
+
+        ;; 16 byte AT copy
+        vmovdqa xmm0, [tmp3]
+        vmovdqu [tmp2], xmm0
+        jmp %%_update_lanes
+
+%%_ne_16_copy:
+        memcpy_avx_16 tmp2, tmp3, tmp4, lane, iv
+
+%%_update_lanes:
+        ; Update unused lanes (push idx back as the lowest nibble)
+        mov unused_lanes, [state + _aes_cmac_unused_lanes]
+        shl unused_lanes, 4
+        or unused_lanes, idx
+        mov [state + _aes_cmac_unused_lanes], unused_lanes
+
+        ; Set return job
+        mov job_rax, [state + _aes_cmac_job_in_lane + idx*8]
+
+        mov qword [state + _aes_cmac_job_in_lane + idx*8], 0
+        or dword [job_rax + _status], STS_COMPLETED_HMAC
+
+%ifdef SAFE_DATA
+        vpxor xmm0, xmm0
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+        ;; Clear digest (in memory for IV) and scratch memory of returned job
+        vmovdqa [tmp3], xmm0 ; tmp3 still holds the address of this lane's IV slot
+
+        shl idx, 4
+        vmovdqa [state + _aes_cmac_scratch + idx], xmm0
+
+%else
+        ;; Clear digest and scratch memory of returned job and "NULL lanes"
+%assign I 0
+%rep 8
+        cmp qword [state + _aes_cmac_job_in_lane + I*8], 0
+        jne APPEND(skip_clear_,I)
+        vmovdqa [state + _aes_cmac_args_IV + I*16], xmm0
+        vmovdqa [state + _aes_cmac_scratch + I*16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif ;; SUBMIT
+
+%endif ;; SAFE_DATA
+
+%%_return:
+        mov rbx, [rsp + _gpr_save + 8*0]
+        mov rbp, [rsp + _gpr_save + 8*1]
+        mov r12, [rsp + _gpr_save + 8*2]
+        mov r13, [rsp + _gpr_save + 8*3]
+        mov r14, [rsp + _gpr_save + 8*4]
+        mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+        mov rsi, [rsp + _gpr_save + 8*6]
+        mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+        mov rsp, [rsp + _rsp_save] ; original SP
+        ret
+
+%%_return_null:
+        xor job_rax, job_rax
+        jmp %%_return
+
+%ifidn %%SUBMIT_FLUSH, SUBMIT
+%%_complete_block:
+
+        ;; Block size aligned
+        mov tmp2, [job + _src]
+        add tmp2, [job + _hash_start_src_offset_in_bytes]
+        lea tmp3, [n - 1]
+        shl tmp3, 4
+        add tmp2, tmp3
+
+        ;; M_last = M_n XOR K1
+        mov tmp3, [job + _skey1]
+        vmovdqu xmm0, [tmp3]
+        vmovdqu xmm1, [tmp2]
+        vpxor xmm0, xmm1
+        vmovdqa [m_last], xmm0
+
+        jmp %%_step_5
+
+%%_lt_one_block:
+        ;; Single partial block (reached only when n == 0, i.e. the message length is 0)
+        mov word [state + _aes_cmac_init_done + lane*2], 1
+        mov [state + _aes_cmac_args_in + lane*8], m_last
+
+        vmovdqa xmm0, [state + _aes_cmac_lens]
+        XVPINSRW xmm0, xmm1, tmp2, lane, 16, scale_x16
+        vmovdqa [state + _aes_cmac_lens], xmm0
+
+        mov n, 1
+        jmp %%_not_complete_block
+
+%%_not_complete_block_3gpp:
+        ;; bit pad last block
+        ;; xor with skey2
+        ;; copy to m_last
+
+        ;; load pointer to src
+        mov tmp, [job + _src]
+        add tmp, [job + _hash_start_src_offset_in_bytes]
+        lea tmp3, [n - 1]
+        shl tmp3, 4
+        add tmp, tmp3
+
+        ;; check if partial block
+        or r, r
+        jz %%_load_full_block_3gpp
+
+        simd_load_avx_15_1 xmm0, tmp, r
+        dec r ; r = index of the last message byte within the block
+
+%%_update_mlast_3gpp:
+        ;; set last byte padding mask
+        ;; shift into correct xmm idx
+
+        ;; save and restore rcx on windows
+%ifndef LINUX
+        mov tmp, rcx
+%endif
+        mov rcx, rbits
+        mov tmp3, 0xff
+        shr tmp3, cl
+        movq xmm2, tmp3
+        XVPSLLB xmm2, r, xmm1, tmp2
+
+        ;; pad final byte
+        vpandn xmm2, xmm0
+%ifndef LINUX
+        mov rcx, tmp
+%endif
+        ;; set OR mask to pad final bit
+        mov tmp2, tmp3
+        shr tmp2, 1
+        xor tmp2, tmp3 ; XOR to get OR mask
+        movq xmm3, tmp2
+        ;; xmm1 contains shift table from previous shift
+        vpshufb xmm3, xmm1
+
+        ;; load skey2 address
+        mov tmp3, [job + _skey2]
+        vmovdqu xmm1, [tmp3]
+
+        ;; set final padding bit
+        vpor xmm2, xmm3
+
+        ;; XOR last partial block with skey2
+        ;; update mlast
+        vpxor xmm2, xmm1
+        vmovdqa [m_last], xmm2
+
+        jmp %%_step_5
+
+%%_load_full_block_3gpp:
+        vmovdqu xmm0, [tmp]
+        mov r, 0xf ; r = 15, index of the last byte of a full block
+        jmp %%_update_mlast_3gpp
+%endif
+%endmacro
+
+
+align 64
+; JOB_AES_HMAC * submit_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_CMAC_AUTH,function,internal)
+SUBMIT_JOB_AES_CMAC_AUTH:
+        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_AVX SUBMIT
+
+; JOB_AES_HMAC * flush_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state)
+; arg 1 : state
+MKGLOBAL(FLUSH_JOB_AES_CMAC_AUTH,function,internal)
+FLUSH_JOB_AES_CMAC_AUTH:
+        GENERIC_SUBMIT_FLUSH_JOB_AES_CMAC_AVX FLUSH
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm
new file mode 100644
index 000000000..dbd2a4547
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm
@@ -0,0 +1,239 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%ifndef AES_CBC_ENC_X8
+%define AES_CBC_ENC_X8 aes_cbc_enc_128_x8
+%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_avx
+%endif
+
+; void AES_CBC_ENC_X8(AES_ARGS *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X8
+
+section .data
+default rel
+align 16
+len_masks: ; 16-byte mask I has word I = 0xFFFF; ORed into the lengths of lanes that hold no job
+        ;ddq 0x0000000000000000000000000000FFFF
+        dq 0x000000000000FFFF, 0x0000000000000000
+        ;ddq 0x000000000000000000000000FFFF0000
+        dq 0x00000000FFFF0000, 0x0000000000000000
+        ;ddq 0x00000000000000000000FFFF00000000
+        dq 0x0000FFFF00000000, 0x0000000000000000
+        ;ddq 0x0000000000000000FFFF000000000000
+        dq 0xFFFF000000000000, 0x0000000000000000
+        ;ddq 0x000000000000FFFF0000000000000000
+        dq 0x0000000000000000, 0x000000000000FFFF
+        ;ddq 0x00000000FFFF00000000000000000000
+        dq 0x0000000000000000, 0x00000000FFFF0000
+        ;ddq 0x0000FFFF000000000000000000000000
+        dq 0x0000000000000000, 0x0000FFFF00000000
+        ;ddq 0xFFFF0000000000000000000000000000
+        dq 0x0000000000000000, 0xFFFF000000000000
+dupw: ; vpshufb control that replicates word 0 into all eight words
+        ;ddq 0x01000100010001000100010001000100
+        dq 0x0100010001000100, 0x0100010001000100
+one: dq 1 ; one..seven: qword constants used as memory operands by the cmovne lane search below
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+%define unused_lanes rbx
+%define tmp1 rbx
+
+%define good_lane rdx
+%define iv rdx
+
+%define tmp2 rax
+
+; idx needs to be in rbp
+%define tmp rbp
+%define idx rbp
+
+%define tmp3 r8
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8 (STACK below is 9 qwords = 72 bytes)
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) - returns the flushed job in rax, or NULL via return_null
+; arg 1 : state
+; arg 2 : job (never read here: the arg2 register is only written, as len2 for AES_CBC_ENC_X8)
+MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal)
+FLUSH_JOB_AES_ENC:
+
+        mov rax, rsp
+        sub rsp, STACK_size
+        and rsp, -16 ; 16-byte align the save area
+
+        mov [rsp + _gpr_save + 8*0], rbx
+        mov [rsp + _gpr_save + 8*1], rbp
+        mov [rsp + _gpr_save + 8*2], r12
+        mov [rsp + _gpr_save + 8*3], r13
+        mov [rsp + _gpr_save + 8*4], r14
+        mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+        mov [rsp + _gpr_save + 8*6], rsi
+        mov [rsp + _gpr_save + 8*7], rdi
+%endif
+        mov [rsp + _rsp_save], rax ; original SP
+
+        ; check for empty
+        mov unused_lanes, [state + _aes_unused_lanes]
+        bt unused_lanes, 32+3
+        jc return_null
+
+        ; find a lane with a non-null job
+        xor good_lane, good_lane
+        cmp qword [state + _aes_job_in_lane + 1*8], 0
+        cmovne good_lane, [rel one]
+        cmp qword [state + _aes_job_in_lane + 2*8], 0
+        cmovne good_lane, [rel two]
+        cmp qword [state + _aes_job_in_lane + 3*8], 0
+        cmovne good_lane, [rel three]
+        cmp qword [state + _aes_job_in_lane + 4*8], 0
+        cmovne good_lane, [rel four]
+        cmp qword [state + _aes_job_in_lane + 5*8], 0
+        cmovne good_lane, [rel five]
+        cmp qword [state + _aes_job_in_lane + 6*8], 0
+        cmovne good_lane, [rel six]
+        cmp qword [state + _aes_job_in_lane + 7*8], 0
+        cmovne good_lane, [rel seven]
+
+        ; copy good_lane to empty lanes
+        mov tmp1, [state + _aes_args_in + good_lane*8]
+        mov tmp2, [state + _aes_args_out + good_lane*8]
+        mov tmp3, [state + _aes_args_keys + good_lane*8]
+        shl good_lane, 4 ; multiply by 16
+        vmovdqa xmm2, [state + _aes_args_IV + good_lane]
+        vmovdqa xmm0, [state + _aes_lens]
+
+%assign I 0
+%rep 8
+        cmp qword [state + _aes_job_in_lane + I*8], 0
+        jne APPEND(skip_,I)
+        mov [state + _aes_args_in + I*8], tmp1
+        mov [state + _aes_args_out + I*8], tmp2
+        mov [state + _aes_args_keys + I*8], tmp3
+        vmovdqa [state + _aes_args_IV + I*16], xmm2
+        vpor xmm0, xmm0, [rel len_masks + 16*I] ; length word I becomes 0xFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+        ; Find min length
+        vphminposuw xmm1, xmm0
+        vpextrw DWORD(len2), xmm1, 0 ; min value
+        vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+        cmp len2, 0
+        je len_is_0
+
+        vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+        vpsubw xmm0, xmm0, xmm1
+        vmovdqa [state + _aes_lens], xmm0
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call AES_CBC_ENC_X8
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        mov job_rax, [state + _aes_job_in_lane + idx*8]
+        mov unused_lanes, [state + _aes_unused_lanes]
+        mov qword [state + _aes_job_in_lane + idx*8], 0
+        or dword [job_rax + _status], STS_COMPLETED_AES
+        shl unused_lanes, 4 ; push idx back as the lowest nibble of the list
+        or unused_lanes, idx
+        mov [state + _aes_unused_lanes], unused_lanes
+%ifdef SAFE_DATA
+        ;; Clear IVs of returned job and "NULL lanes"
+        vpxor xmm0, xmm0
+%assign I 0
+%rep 8
+        cmp qword [state + _aes_job_in_lane + I*8], 0
+        jne APPEND(skip_clear_,I)
+        vmovdqa [state + _aes_args_IV + I*16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif
+
+return:
+
+        mov rbx, [rsp + _gpr_save + 8*0]
+        mov rbp, [rsp + _gpr_save + 8*1]
+        mov r12, [rsp + _gpr_save + 8*2]
+        mov r13, [rsp + _gpr_save + 8*3]
+        mov r14, [rsp + _gpr_save + 8*4]
+        mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+        mov rsi, [rsp + _gpr_save + 8*6]
+        mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+        mov rsp, [rsp + _rsp_save] ; original SP
+
+        ret
+
+return_null:
+        xor job_rax, job_rax
+        jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm
new file mode 100644
index 000000000..c95fa1f6c
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm
@@ -0,0 +1,194 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+
+%ifndef AES_CBC_ENC_X8
+%define AES_CBC_ENC_X8 aes_cbc_enc_128_x8
+%define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_avx
+%endif
+
+; void AES_CBC_ENC_X8(AES_ARGS *args, UINT64 len_in_bytes);
+extern AES_CBC_ENC_X8
+
+section .data
+default rel
+
+align 16
+dupw: ; vpshufb control that replicates word 0 into all eight words
+        ;ddq 0x01000100010001000100010001000100
+        dq 0x0100010001000100, 0x0100010001000100
+
+section .text
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+; idx needs to be in rbp (len, idx and tmp all alias rbp, so only one of them is live at a time)
+%define len rbp
+%define idx rbp
+%define tmp rbp
+
+%define lane r8
+
+%define iv r9
+
+%define unused_lanes rbx
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8 (STACK below is 9 qwords = 72 bytes)
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) - queues job in a free lane; returns a completed job in rax, or NULL via return_null
+; arg 1 : state
+; arg 2 : job
+MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal)
+SUBMIT_JOB_AES_ENC:
+
+        mov rax, rsp
+        sub rsp, STACK_size
+        and rsp, -16 ; 16-byte align the save area
+
+        mov [rsp + _gpr_save + 8*0], rbx
+        mov [rsp + _gpr_save + 8*1], rbp
+        mov [rsp + _gpr_save + 8*2], r12
+        mov [rsp + _gpr_save + 8*3], r13
+        mov [rsp + _gpr_save + 8*4], r14
+        mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+        mov [rsp + _gpr_save + 8*6], rsi
+        mov [rsp + _gpr_save + 8*7], rdi
+%endif
+        mov [rsp + _rsp_save], rax ; original SP
+
+        mov unused_lanes, [state + _aes_unused_lanes]
+        mov lane, unused_lanes
+        and lane, 0xF ; lowest nibble of the list = lane index to use
+        shr unused_lanes, 4 ; remove that nibble from the list
+        mov len, [job + _msg_len_to_cipher_in_bytes]
+        and len, -16 ; DOCSIS may pass size unaligned to block size
+        mov iv, [job + _iv]
+        mov [state + _aes_unused_lanes], unused_lanes
+
+        mov [state + _aes_job_in_lane + lane*8], job
+
+        vmovdqa xmm0, [state + _aes_lens]
+        XVPINSRW xmm0, xmm1, tmp, lane, len, scale_x16
+        vmovdqa [state + _aes_lens], xmm0
+
+        mov tmp, [job + _src]
+        add tmp, [job + _cipher_start_src_offset_in_bytes]
+        vmovdqu xmm0, [iv]
+        mov [state + _aes_args_in + lane*8], tmp
+        mov tmp, [job + _aes_enc_key_expanded]
+        mov [state + _aes_args_keys + lane*8], tmp
+        mov tmp, [job + _dst]
+        mov [state + _aes_args_out + lane*8], tmp
+        shl lane, 4 ; multiply by 16
+        vmovdqa [state + _aes_args_IV + lane], xmm0
+
+        cmp unused_lanes, 0xf ; presumably 0xF is the end-of-list marker, i.e. no free lane is left - TODO confirm against the state initializer
+        jne return_null
+
+        ; Find min length
+        vmovdqa xmm0, [state + _aes_lens]
+        vphminposuw xmm1, xmm0
+        vpextrw DWORD(len2), xmm1, 0 ; min value
+        vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+        cmp len2, 0
+        je len_is_0
+
+        vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+        vpsubw xmm0, xmm0, xmm1
+        vmovdqa [state + _aes_lens], xmm0
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call AES_CBC_ENC_X8
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        mov job_rax, [state + _aes_job_in_lane + idx*8]
+        mov unused_lanes, [state + _aes_unused_lanes]
+        mov qword [state + _aes_job_in_lane + idx*8], 0
+        or dword [job_rax + _status], STS_COMPLETED_AES
+        shl unused_lanes, 4 ; push idx back as the lowest nibble of the list
+        or unused_lanes, idx
+        mov [state + _aes_unused_lanes], unused_lanes
+%ifdef SAFE_DATA
+        ;; Clear IV and key pointer of the returned job's lane
+        vpxor xmm0, xmm0
+        shl idx, 3 ; multiply by 8
+        vmovdqa [state + _aes_args_IV + idx*2], xmm0
+        mov qword [state + _aes_args_keys + idx], 0
+%endif
+
+return:
+
+        mov rbx, [rsp + _gpr_save + 8*0]
+        mov rbp, [rsp + _gpr_save + 8*1]
+        mov r12, [rsp + _gpr_save + 8*2]
+        mov r13, [rsp + _gpr_save + 8*3]
+        mov r14, [rsp + _gpr_save + 8*4]
+        mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+        mov rsi, [rsp + _gpr_save + 8*6]
+        mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+        mov rsp, [rsp + _rsp_save] ; original SP
+
+        ret
+
+return_null:
+        xor job_rax, job_rax
+        jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git
a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm
new file mode 100644
index 000000000..a810842a9
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm
@@ -0,0 +1,264 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%ifndef AES_XCBC_X8
+%define AES_XCBC_X8 aes_xcbc_mac_128_x8
+%define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx
+%endif
+
+; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
+extern AES_XCBC_X8
+
+section .data
+default rel
+
+align 16
+len_masks: ; 16-byte mask I has word I = 0xFFFF; ORed into the lengths of lanes that hold no job
+        ;ddq 0x0000000000000000000000000000FFFF
+        dq 0x000000000000FFFF, 0x0000000000000000
+        ;ddq 0x000000000000000000000000FFFF0000
+        dq 0x00000000FFFF0000, 0x0000000000000000
+        ;ddq 0x00000000000000000000FFFF00000000
+        dq 0x0000FFFF00000000, 0x0000000000000000
+        ;ddq 0x0000000000000000FFFF000000000000
+        dq 0xFFFF000000000000, 0x0000000000000000
+        ;ddq 0x000000000000FFFF0000000000000000
+        dq 0x0000000000000000, 0x000000000000FFFF
+        ;ddq 0x00000000FFFF00000000000000000000
+        dq 0x0000000000000000, 0x00000000FFFF0000
+        ;ddq 0x0000FFFF000000000000000000000000
+        dq 0x0000000000000000, 0x0000FFFF00000000
+        ;ddq 0xFFFF0000000000000000000000000000
+        dq 0x0000000000000000, 0xFFFF000000000000
+dupw: ; vpshufb control that replicates word 0 into all eight words
+        ;ddq 0x01000100010001000100010001000100
+        dq 0x0100010001000100, 0x0100010001000100
+one: dq 1 ; one..seven: qword constants used as memory operands by the cmovne lane search below
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
+section .text
+
+%define APPEND(a,b) a %+ b
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define job_rax rax
+
+%if 1
+%define unused_lanes rbx
+%define tmp1 rbx
+
+%define icv rdx
+
+%define tmp2 rax
+
+; idx needs to be in rbp (in this file tmp is r10, so only idx lives in rbp)
+%define tmp r10
+%define idx rbp
+
+%define tmp3 r8
+%define lane_data r9
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8 (STACK below is 9 qwords = 72 bytes)
+; This routine and its callee clobber all GPRs
+struc STACK
+_gpr_save: resq 8
+_rsp_save: resq 1
+endstruc
+
+; JOB* FLUSH_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) - returns the flushed job in rax, or NULL via return_null
+; arg 1 : state
+; arg 2 : job (never read here: the arg2 register is only written, as len2 for AES_XCBC_X8)
+MKGLOBAL(FLUSH_JOB_AES_XCBC,function,internal)
+FLUSH_JOB_AES_XCBC:
+
+        mov rax, rsp
+        sub rsp, STACK_size
+        and rsp, -16 ; 16-byte align the save area
+
+        mov [rsp + _gpr_save + 8*0], rbx
+        mov [rsp + _gpr_save + 8*1], rbp
+        mov [rsp + _gpr_save + 8*2], r12
+        mov [rsp + _gpr_save + 8*3], r13
+        mov [rsp + _gpr_save + 8*4], r14
+        mov [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+        mov [rsp + _gpr_save + 8*6], rsi
+        mov [rsp + _gpr_save + 8*7], rdi
+%endif
+        mov [rsp + _rsp_save], rax ; original SP
+
+        ; check for empty
+        mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+        bt unused_lanes, 32+3
+        jc return_null
+
+        ; find a lane with a non-null job
+        xor idx, idx
+        cmp qword [state + _aes_xcbc_ldata + 1 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+        cmovne idx, [rel one]
+        cmp qword [state + _aes_xcbc_ldata + 2 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+        cmovne idx, [rel two]
+        cmp qword [state + _aes_xcbc_ldata + 3 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+        cmovne idx, [rel three]
+        cmp qword [state + _aes_xcbc_ldata + 4 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+        cmovne idx, [rel four]
+        cmp qword [state + _aes_xcbc_ldata + 5 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+        cmovne idx, [rel five]
+        cmp qword [state + _aes_xcbc_ldata + 6 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+        cmovne idx, [rel six]
+        cmp qword [state + _aes_xcbc_ldata + 7 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+        cmovne idx, [rel seven]
+
+copy_lane_data:
+        ; copy idx to empty lanes (also re-entered from len_is_0 after the final block is queued)
+        mov tmp1, [state + _aes_xcbc_args_in + idx*8]
+        mov tmp3, [state + _aes_xcbc_args_keys + idx*8]
+        shl idx, 4 ; multiply by 16
+        vmovdqa xmm2, [state + _aes_xcbc_args_ICV + idx]
+        vmovdqa xmm0, [state + _aes_xcbc_lens]
+
+%assign I 0
+%rep 8
+        cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+        jne APPEND(skip_,I)
+        mov [state + _aes_xcbc_args_in + I*8], tmp1
+        mov [state + _aes_xcbc_args_keys + I*8], tmp3
+        vmovdqa [state + _aes_xcbc_args_ICV + I*16], xmm2
+        vpor xmm0, xmm0, [rel len_masks + 16*I] ; length word I becomes 0xFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+        vmovdqa [state + _aes_xcbc_lens], xmm0
+
+        ; Find min length
+        vphminposuw xmm1, xmm0
+        vpextrw DWORD(len2), xmm1, 0 ; min value
+        vpextrw DWORD(idx), xmm1, 1 ; min index (0...7)
+        cmp len2, 0
+        je len_is_0
+
+        vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes
+        vpsubw xmm0, xmm0, xmm1
+        vmovdqa [state + _aes_xcbc_lens], xmm0
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call AES_XCBC_X8
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        imul lane_data, idx, _XCBC_LANE_DATA_size
+        lea lane_data, [state + _aes_xcbc_ldata + lane_data]
+        cmp dword [lane_data + _xcbc_final_done], 0 ; non-zero: the final block was already queued, so the job is done
+        jne end_loop
+
+        mov dword [lane_data + _xcbc_final_done], 1
+        mov word [state + _aes_xcbc_lens + 2*idx], 16 ; one more 16-byte block: the lane's final_block
+        lea tmp, [lane_data + _xcbc_final_block]
+        mov [state + _aes_xcbc_args_in + 8*idx], tmp
+        jmp copy_lane_data
+
+end_loop:
+        mov job_rax, [lane_data + _xcbc_job_in_lane]
+        mov icv, [job_rax + _auth_tag_output]
+        mov unused_lanes, [state + _aes_xcbc_unused_lanes]
+        mov qword [lane_data + _xcbc_job_in_lane], 0
+        or dword [job_rax + _status], STS_COMPLETED_HMAC
+        shl unused_lanes, 4 ; push idx back as the lowest nibble of the list
+        or unused_lanes, idx
+        shl idx, 4 ; multiply by 16
+        mov [state + _aes_xcbc_unused_lanes], unused_lanes
+
+        ; copy 12 bytes (low qword, then dword 2 of the lane's ICV) to the job's auth_tag_output
+        vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx]
+        vmovq [icv], xmm0
+        vpextrd [icv + 8], xmm0, 2
+
+%ifdef SAFE_DATA
+        vpxor xmm0, xmm0
+        ;; Clear ICV's and final blocks in returned job and NULL lanes
+%assign I 0
+%rep 8
+        cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0
+        jne APPEND(skip_clear_,I)
+        vmovdqa [state + _aes_xcbc_args_ICV + I*16], xmm0
+        lea lane_data, [state + _aes_xcbc_ldata + (I * _XCBC_LANE_DATA_size)]
+        vmovdqa [lane_data + _xcbc_final_block], xmm0
+        vmovdqa [lane_data + _xcbc_final_block + 16], xmm0
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+%endif
+
+return:
+
+        mov rbx, [rsp + _gpr_save + 8*0]
+        mov rbp, [rsp + _gpr_save + 8*1]
+        mov r12, [rsp + _gpr_save + 8*2]
+        mov r13, [rsp + _gpr_save + 8*3]
+        mov r14, [rsp + _gpr_save + 8*4]
+        mov r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+        mov rsi, [rsp + _gpr_save + 8*6]
+        mov rdi, [rsp + _gpr_save + 8*7]
+%endif
+        mov rsp, [rsp + _rsp_save] ; original SP
+
+        ret
+
+return_null:
+        xor job_rax, job_rax
+        jmp return
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm
new file mode 100644
index 000000000..38f6a6470
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm
@@ -0,0 +1,272 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+
+%include "include/reg_sizes.asm"
+
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+; Default symbol names; a wrapper file may pre-define AES_XCBC_X8 (and then
+; must also define SUBMIT_JOB_AES_XCBC) before including this one.
+%ifndef AES_XCBC_X8
+%define AES_XCBC_X8 aes_xcbc_mac_128_x8
+%define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx
+%endif
+
+; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
+extern AES_XCBC_X8
+
+
+section .data
+default rel
+
+align 16
+; vpshufb mask: byte pair (0,1) repeated, i.e. broadcast word 0 to all 8 words
+dupw:   ;ddq 0x01000100010001000100010001000100
+        dq 0x0100010001000100, 0x0100010001000100
+; 16-byte padding block: a single 0x80 byte followed by zeros
+x80:    ;ddq 0x00000000000000000000000000000080
+        dq 0x0000000000000080, 0x0000000000000000
+
+section .text
+
+%ifdef LINUX
+%define arg1    rdi
+%define arg2    rsi
+%else
+%define arg1    rcx
+%define arg2    rdx
+%endif
+
+; NOTE: job and len2 share arg2; the job pointer is no longer available once
+; len2 is loaded in start_loop.
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+
+%define job_rax          rax
+
+%if 1
+; idx needs to be in rbp
+; NOTE: tmp2 aliases idx (both rbp) and p2 aliases icv (both r9).
+%define len              r11
+%define idx              rbp
+%define tmp2             rbp
+%define tmp              r14
+
+%define lane             r8
+%define icv              r9
+%define p2               r9
+
+%define last_len         r10
+
+%define lane_data        r12
+%define p                r13
+
+%define unused_lanes     rbx
+%endif
+
+; STACK_SPACE needs to be an odd multiple of 8
+; This routine and its callee clobbers all GPRs
+struc STACK
+_gpr_save:      resq    8
+_rsp_save:      resq    1
+endstruc
+
+; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state
+; arg 2 : job
+; Places the job in a free lane and prepares its final block. While a lane
+; is still free afterwards it returns NULL; once all eight lanes are occupied
+; it runs the 8-lane kernel until one job completes and returns that job.
+MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal)
+SUBMIT_JOB_AES_XCBC:
+
+        ; keep the caller's rsp in rax, then carve out a 16-byte aligned frame
+        mov     rax, rsp
+        sub     rsp, STACK_size
+        and     rsp, -16
+
+        mov     [rsp + _gpr_save + 8*0], rbx
+        mov     [rsp + _gpr_save + 8*1], rbp
+        mov     [rsp + _gpr_save + 8*2], r12
+        mov     [rsp + _gpr_save + 8*3], r13
+        mov     [rsp + _gpr_save + 8*4], r14
+        mov     [rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+        mov     [rsp + _gpr_save + 8*6], rsi
+        mov     [rsp + _gpr_save + 8*7], rdi
+%endif
+        mov     [rsp + _rsp_save], rax  ; original SP
+
+        ; pop a free lane id (low nibble) off the unused_lanes stack and
+        ; record the job, its K1 key schedule and its source pointer there
+        mov     unused_lanes, [state + _aes_xcbc_unused_lanes]
+        mov     lane, unused_lanes
+        and     lane, 0xF
+        shr     unused_lanes, 4
+        imul    lane_data, lane, _XCBC_LANE_DATA_size
+        lea     lane_data, [state + _aes_xcbc_ldata + lane_data]
+        mov     len, [job + _msg_len_to_hash_in_bytes]
+        mov     [state + _aes_xcbc_unused_lanes], unused_lanes
+        mov     [lane_data + _xcbc_job_in_lane], job
+        mov     dword [lane_data + _xcbc_final_done], 0
+        mov     tmp, [job + _k1_expanded]
+        mov     [state + _aes_xcbc_args_keys + lane*8], tmp
+        mov     p, [job + _src]
+        add     p, [job + _hash_start_src_offset_in_bytes]
+
+        mov     last_len, len
+
+        ; messages of at most 16 bytes consist of the final block only
+        cmp     len, 16
+        jle     small_buffer
+
+        mov     [state + _aes_xcbc_args_in + lane*8], p
+        add     p, len          ; set point to end of data
+
+        and     last_len, 15    ; Check lsbs of msg len
+        jnz     slow_copy       ; if not 16B mult, do slow copy
+
+fast_copy:
+        ; last block is a complete 16 bytes: final block = M[n] XOR K2
+        vmovdqu xmm0, [p - 16]  ; load last block M[n]
+        mov     tmp, [job + _k2] ; load K2 address
+        vmovdqu xmm1, [tmp]     ; load K2
+        vpxor   xmm0, xmm0, xmm1      ; M[n] XOR K2
+        vmovdqa [lane_data + _xcbc_final_block], xmm0
+        sub     len, 16         ; take last block off length
+end_fast_copy:
+        ; zero the lane's ICV and store the lane's length (final block
+        ; excluded) into the lens vector
+        vpxor   xmm0, xmm0, xmm0
+        shl     lane, 4 ; multiply by 16
+        vmovdqa [state + _aes_xcbc_args_ICV + lane], xmm0
+
+        vmovdqa xmm0, [state + _aes_xcbc_lens]
+        XVPINSRW xmm0, xmm1, tmp, lane, len, no_scale
+        vmovdqa [state + _aes_xcbc_lens], xmm0
+
+        ; unused_lanes == 0xF means the lane stack is empty (all 8 lanes are
+        ; busy); otherwise just queue the job and return NULL
+        cmp     unused_lanes, 0xf
+        jne     return_null
+
+start_loop:
+        ; Find min length
+        vphminposuw     xmm1, xmm0
+        vpextrw DWORD(len2), xmm1, 0    ; min value
+        vpextrw DWORD(idx), xmm1, 1     ; min index (0...7)
+        cmp     len2, 0
+        je      len_is_0
+
+        ; subtract the minimum from every lane's remaining length
+        vpshufb xmm1, xmm1, [rel dupw]  ; duplicate words across all lanes
+        vpsubw  xmm0, xmm0, xmm1
+        vmovdqa [state + _aes_xcbc_lens], xmm0
+
+        ; "state" and "args" are the same address, arg1
+        ; len is arg2
+        call    AES_XCBC_X8
+        ; state and idx are intact
+
+len_is_0:
+        ; process completed job "idx"
+        ; If the lane's final block has not been run yet, queue it as a
+        ; 16-byte input and loop; otherwise the job is finished.
+        imul    lane_data, idx, _XCBC_LANE_DATA_size
+        lea     lane_data, [state + _aes_xcbc_ldata + lane_data]
+        cmp     dword [lane_data + _xcbc_final_done], 0
+        jne     end_loop
+
+        mov     dword [lane_data + _xcbc_final_done], 1
+
+        vmovdqa xmm0, [state + _aes_xcbc_lens]
+        XVPINSRW xmm0, xmm1, tmp, idx, 16, scale_x16
+        vmovdqa [state + _aes_xcbc_lens], xmm0
+
+        lea     tmp, [lane_data + _xcbc_final_block]
+        mov     [state + _aes_xcbc_args_in + 8*idx], tmp
+        jmp     start_loop
+
+end_loop:
+        ; process completed job "idx"
+        ; detach the job, flag it STS_COMPLETED_HMAC and push the lane id
+        ; back onto the unused_lanes nibble stack
+        mov     job_rax, [lane_data + _xcbc_job_in_lane]
+        mov     icv, [job_rax + _auth_tag_output]
+        mov     unused_lanes, [state + _aes_xcbc_unused_lanes]
+        mov     qword [lane_data + _xcbc_job_in_lane], 0
+        or      dword [job_rax + _status], STS_COMPLETED_HMAC
+        shl     unused_lanes, 4
+        or      unused_lanes, idx
+        shl     idx, 4 ; multiply by 16
+        mov     [state + _aes_xcbc_unused_lanes], unused_lanes
+
+        ; copy 12 bytes
+        vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx]
+        vmovq   [icv], xmm0
+        vpextrd [icv + 8], xmm0, 2
+
+%ifdef SAFE_DATA
+        ;; Clear ICV
+        vpxor   xmm0, xmm0
+        vmovdqa [state + _aes_xcbc_args_ICV + idx], xmm0
+
+        ;; Clear final block (32 bytes)
+        vmovdqa [lane_data + _xcbc_final_block], xmm0
+        vmovdqa [lane_data + _xcbc_final_block + 16], xmm0
+%endif
+
+return:
+        ; restore saved registers and the caller's stack pointer
+
+        mov     rbx, [rsp + _gpr_save + 8*0]
+        mov     rbp, [rsp + _gpr_save + 8*1]
+        mov     r12, [rsp + _gpr_save + 8*2]
+        mov     r13, [rsp + _gpr_save + 8*3]
+        mov     r14, [rsp + _gpr_save + 8*4]
+        mov     r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+        mov     rsi, [rsp + _gpr_save + 8*6]
+        mov     rdi, [rsp + _gpr_save + 8*7]
+%endif
+        mov     rsp, [rsp + _rsp_save]  ; original SP
+
+        ret
+
+small_buffer:
+        ; For buffers <= 16 Bytes
+        ; The input data is set to final block
+        lea     tmp, [lane_data + _xcbc_final_block] ; final block
+        mov     [state + _aes_xcbc_args_in + lane*8], tmp
+        add     p, len          ; set point to end of data
+        cmp     len, 16
+        je      fast_copy
+
+slow_copy:
+        ; partial last block: right-align the last_len tail bytes in the lower
+        ; half of the 32-byte final_block buffer, put the 0x80 padding block
+        ; directly after them, then read 16 bytes starting at the tail so the
+        ; result is tail || 0x80 || zeros, and XOR that with K3
+        and     len, ~15        ; take final block off len
+        sub     p, last_len     ; adjust data pointer
+        lea     p2, [lane_data + _xcbc_final_block + 16] ; upper part of final
+        sub     p2, last_len    ; adjust data pointer backwards
+        memcpy_avx_16_1 p2, p, last_len, tmp, tmp2
+        vmovdqa xmm0, [rel x80] ; fill reg with padding
+        vmovdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding
+        vmovdqu xmm0, [p2]      ; load final block to process
+        mov     tmp, [job + _k3] ; load K3 address
+        vmovdqu xmm1, [tmp]     ; load K3
+        vpxor   xmm0, xmm0, xmm1      ; M[n] XOR K3
+        vmovdqu [lane_data + _xcbc_final_block], xmm0   ; write final block
+        jmp     end_fast_copy
+
+return_null:
+        ; job queued but nothing completed yet: return NULL
+        xor     job_rax, job_rax
+        jmp     return
+
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c b/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c
new file mode 100644
index 000000000..29cf2a308
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c
@@ -0,0 +1,733 @@
+/*******************************************************************************
+  Copyright (c) 2012-2018, Intel Corporation
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+      * Redistributions of source code must retain the above copyright notice,
+        this list of conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above copyright
+        notice, this list of conditions and the following disclaimer in the
+        documentation and/or other materials provided with the distribution.
+      * Neither the name of Intel Corporation nor the names of its contributors
+        may be used to endorse or promote products derived from this software
+        without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Defined ahead of the project headers; presumably one of them consumes it. */
+#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx
+
+#include "intel-ipsec-mb.h"
+#include "include/kasumi_internal.h"
+#include "include/zuc_internal.h"
+#include "include/snow3g.h"
+
+#include "save_xmms.h"
+#include "asm.h"
+#include "des.h"
+#include "cpu_feature.h"
+#include "noaesni.h"
+
+/*
+ * Prototypes of the AVX submit/flush entry points for the AES schedulers.
+ * The AES-XCBC pair is implemented in assembly; the two CNTR helpers are
+ * defined later in this file.
+ */
+JOB_AES_HMAC *submit_job_aes128_enc_avx(MB_MGR_AES_OOO *state,
+                                        JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes128_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes192_enc_avx(MB_MGR_AES_OOO *state,
+                                        JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes192_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes256_enc_avx(MB_MGR_AES_OOO *state,
+                                        JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes256_enc_avx(MB_MGR_AES_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state,
+                                      JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cntr_avx(JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *submit_job_aes_cntr_bit_avx(JOB_AES_HMAC *job);
+
+/*
+ * Architecture-neutral names mapped onto their AVX implementations.
+ * NOTE(review): presumably consumed by shared manager code included
+ * elsewhere in this file - not visible in this excerpt.
+ */
+#define SAVE_XMMS               save_xmms_avx
+#define RESTORE_XMMS            restore_xmms_avx
+
+#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_avx
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx
+#define FLUSH_JOB_AES128_ENC  flush_job_aes128_enc_avx
+#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_avx
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx
+#define FLUSH_JOB_AES192_ENC  flush_job_aes192_enc_avx
+#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_avx
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx
+#define FLUSH_JOB_AES256_ENC  flush_job_aes256_enc_avx
+#define SUBMIT_JOB_AES_ECB_128_ENC submit_job_aes_ecb_128_enc_avx
+#define SUBMIT_JOB_AES_ECB_128_DEC submit_job_aes_ecb_128_dec_avx
+#define SUBMIT_JOB_AES_ECB_192_ENC submit_job_aes_ecb_192_enc_avx
+#define SUBMIT_JOB_AES_ECB_192_DEC submit_job_aes_ecb_192_dec_avx
+#define SUBMIT_JOB_AES_ECB_256_ENC submit_job_aes_ecb_256_enc_avx
+#define SUBMIT_JOB_AES_ECB_256_DEC submit_job_aes_ecb_256_dec_avx
+
+#define SUBMIT_JOB_AES_CNTR   submit_job_aes_cntr_avx
+#define SUBMIT_JOB_AES_CNTR_BIT   submit_job_aes_cntr_bit_avx
+
+#define AES_CBC_DEC_128       aes_cbc_dec_128_avx
+#define AES_CBC_DEC_192       aes_cbc_dec_192_avx
+#define AES_CBC_DEC_256       aes_cbc_dec_256_avx
+
+#define AES_CNTR_128       aes_cntr_128_avx
+#define AES_CNTR_192       aes_cntr_192_avx
+#define AES_CNTR_256       aes_cntr_256_avx
+
+#define AES_CNTR_CCM_128   aes_cntr_ccm_128_avx
+
+#define AES_ECB_ENC_128       aes_ecb_enc_128_avx
+#define AES_ECB_ENC_192       aes_ecb_enc_192_avx
+#define AES_ECB_ENC_256       aes_ecb_enc_256_avx
+#define AES_ECB_DEC_128       aes_ecb_dec_128_avx
+#define AES_ECB_DEC_192       aes_ecb_dec_192_avx
+#define AES_ECB_DEC_256       aes_ecb_dec_256_avx
+
+#define SUBMIT_JOB_PON_ENC        submit_job_pon_enc_avx
+#define SUBMIT_JOB_PON_DEC        submit_job_pon_dec_avx
+#define SUBMIT_JOB_PON_ENC_NO_CTR submit_job_pon_enc_no_ctr_avx
+#define SUBMIT_JOB_PON_DEC_NO_CTR submit_job_pon_dec_no_ctr_avx
+
+#ifndef NO_GCM
+/* GCM primitives use the "avx_gen2" implementations. */
+#define AES_GCM_DEC_128   aes_gcm_dec_128_avx_gen2
+#define AES_GCM_ENC_128   aes_gcm_enc_128_avx_gen2
+#define AES_GCM_DEC_192   aes_gcm_dec_192_avx_gen2
+#define AES_GCM_ENC_192   aes_gcm_enc_192_avx_gen2
+#define AES_GCM_DEC_256   aes_gcm_dec_256_avx_gen2
+#define AES_GCM_ENC_256   aes_gcm_enc_256_avx_gen2
+
+#define SUBMIT_JOB_AES_GCM_DEC submit_job_aes_gcm_dec_avx
+#define FLUSH_JOB_AES_GCM_DEC  flush_job_aes_gcm_dec_avx
+#define SUBMIT_JOB_AES_GCM_ENC submit_job_aes_gcm_enc_avx
+#define FLUSH_JOB_AES_GCM_ENC  flush_job_aes_gcm_enc_avx
+#endif
+
+#define SUBMIT_JOB_AES_XCBC   submit_job_aes_xcbc_avx
+#define FLUSH_JOB_AES_XCBC    flush_job_aes_xcbc_avx
+
+/*
+ * NOTE: the three *_DEC macros below repeat, token for token, definitions
+ * made earlier in this list; the redefinition is harmless but redundant.
+ */
+#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx
+#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx
+#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx
+#define QUEUE_SIZE queue_size_avx
+
+#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_AVX
+#define FLUSH_JOB_AES_ENC  FLUSH_JOB_AES_ENC_AVX
+#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_AVX
+#define FLUSH_JOB_AES_DEC  FLUSH_JOB_AES_DEC_AVX
+
+
+
+/* Prototypes of the AVX submit/flush entry points for the hash schedulers. */
+JOB_AES_HMAC *submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state,
+                                  JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_224_avx(MB_MGR_HMAC_SHA_256_OOO *state,
+                                          JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_224_avx(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_256_avx(MB_MGR_HMAC_SHA_256_OOO *state,
+                                          JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_256_avx(MB_MGR_HMAC_SHA_256_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_384_avx(MB_MGR_HMAC_SHA_512_OOO *state,
+                                          JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_384_avx(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_sha_512_avx(MB_MGR_HMAC_SHA_512_OOO *state,
+                                          JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_sha_512_avx(MB_MGR_HMAC_SHA_512_OOO *state);
+
+JOB_AES_HMAC *submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state,
+                                      JOB_AES_HMAC *job);
+JOB_AES_HMAC *flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state,
+                                           JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_cmac_auth_avx(MB_MGR_CMAC_OOO *state);
+
+JOB_AES_HMAC *submit_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state,
+                                          JOB_AES_HMAC *job);
+
+JOB_AES_HMAC *flush_job_aes_ccm_auth_avx(MB_MGR_CCM_OOO *state);
+
+#define SUBMIT_JOB_HMAC         submit_job_hmac_avx
+#define FLUSH_JOB_HMAC          flush_job_hmac_avx
+#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_avx
+#define FLUSH_JOB_HMAC_SHA_224  flush_job_hmac_sha_224_avx
+#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_avx
+#define FLUSH_JOB_HMAC_SHA_256  flush_job_hmac_sha_256_avx
+#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_avx
+#define FLUSH_JOB_HMAC_SHA_384  flush_job_hmac_sha_384_avx
+#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_avx
+#define FLUSH_JOB_HMAC_SHA_512  flush_job_hmac_sha_512_avx
+#define SUBMIT_JOB_HMAC_MD5     submit_job_hmac_md5_avx
+#define FLUSH_JOB_HMAC_MD5      flush_job_hmac_md5_avx
+
+/* ====================================================================== */
+
+/* Names of the top-level job API entry points for this architecture. */
+#define SUBMIT_JOB         submit_job_avx
+#define FLUSH_JOB          flush_job_avx
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx
+#define GET_NEXT_JOB       get_next_job_avx
+#define GET_COMPLETED_JOB  get_completed_job_avx
+
+/* ====================================================================== */
+
+
+#define SUBMIT_JOB_HASH    SUBMIT_JOB_HASH_AVX
+#define FLUSH_JOB_HASH     FLUSH_JOB_HASH_AVX
+
+/* ====================================================================== */
+
+#define AES_CFB_128_ONE    aes_cfb_128_one_avx
+
+void aes128_cbc_mac_x8(AES_ARGS *args, uint64_t len);
+
+#define AES128_CBC_MAC     aes128_cbc_mac_x8
+
+#define FLUSH_JOB_AES_CCM_AUTH     flush_job_aes_ccm_auth_avx
+#define SUBMIT_JOB_AES_CCM_AUTH    submit_job_aes_ccm_auth_avx
+
+#define FLUSH_JOB_AES_CMAC_AUTH    flush_job_aes_cmac_auth_avx
+#define SUBMIT_JOB_AES_CMAC_AUTH   submit_job_aes_cmac_auth_avx
+
+/* ====================================================================== */
+
+/*
+ * GCM submit / flush API for AVX arch
+ */
+#ifndef NO_GCM
+/*
+ * Run AES-GCM decryption for one job synchronously.
+ *
+ * Selects AES_GCM_DEC_128/192/256 from job->aes_key_len_in_bytes (16, 24,
+ * anything else is treated as 32) and passes a stack-local, 16-byte aligned
+ * gcm_context_data as scratch context. The job's status is then set (not
+ * OR-ed) to STS_COMPLETED and the same job pointer is returned.
+ * "state" is unused.
+ */
+static JOB_AES_HMAC *
+submit_job_aes_gcm_dec_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+        (void) state;
+
+        if (16 == job->aes_key_len_in_bytes)
+                AES_GCM_DEC_128(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+        else if (24 == job->aes_key_len_in_bytes)
+                AES_GCM_DEC_192(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+        else /* assume 32 bytes */
+                AES_GCM_DEC_256(job->aes_dec_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes,
+                                job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * GCM decrypt jobs finish inside submit, so there is never anything to
+ * flush: both arguments are ignored and NULL is always returned.
+ */
+static JOB_AES_HMAC *
+flush_job_aes_gcm_dec_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        (void) state;
+        (void) job;
+        return NULL;
+}
+
+/*
+ * Run AES-GCM encryption for one job synchronously.
+ *
+ * Same dispatch as the decrypt variant, but uses job->aes_enc_key_expanded
+ * and AES_GCM_ENC_128/192/256. Sets job->status to STS_COMPLETED and
+ * returns the job. "state" is unused.
+ */
+static JOB_AES_HMAC *
+submit_job_aes_gcm_enc_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        DECLARE_ALIGNED(struct gcm_context_data ctx, 16);
+        (void) state;
+
+        if (16 == job->aes_key_len_in_bytes)
+                AES_GCM_ENC_128(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+        else if (24 == job->aes_key_len_in_bytes)
+                AES_GCM_ENC_192(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+        else /* assume 32 bytes */
+                AES_GCM_ENC_256(job->aes_enc_key_expanded, &ctx, job->dst,
+                                job->src +
+                                job->cipher_start_src_offset_in_bytes,
+                                job->msg_len_to_cipher_in_bytes, job->iv,
+                                job->u.GCM.aad, job->u.GCM.aad_len_in_bytes,
+                                job->auth_tag_output,
+                                job->auth_tag_output_len_in_bytes);
+
+        job->status = STS_COMPLETED;
+        return job;
+}
+
+/*
+ * GCM encrypt jobs finish inside submit, so there is never anything to
+ * flush: both arguments are ignored and NULL is always returned.
+ */
+static JOB_AES_HMAC *
+flush_job_aes_gcm_enc_avx(MB_MGR *state, JOB_AES_HMAC *job)
+{
+        (void) state;
+        (void) job;
+        return NULL;
+}
+#endif /* NO_GCM */
+
+/* ====================================================================== */
+
+/*
+ * Run AES counter mode over the job's cipher range synchronously.
+ *
+ * Selects AES_CNTR_128/192/256 from job->aes_key_len_in_bytes (16, 24,
+ * anything else is treated as 32); the length passed is in bytes. The
+ * encryption key schedule is used in every case. STS_COMPLETED_AES is
+ * OR-ed into job->status, leaving other status bits untouched, and the
+ * job is returned.
+ */
+IMB_DLL_LOCAL JOB_AES_HMAC *
+submit_job_aes_cntr_avx(JOB_AES_HMAC *job)
+{
+        if (16 == job->aes_key_len_in_bytes)
+                AES_CNTR_128(job->src + job->cipher_start_src_offset_in_bytes,
+                             job->iv,
+                             job->aes_enc_key_expanded,
+                             job->dst,
+                             job->msg_len_to_cipher_in_bytes,
+                             job->iv_len_in_bytes);
+        else if (24 == job->aes_key_len_in_bytes)
+                AES_CNTR_192(job->src + job->cipher_start_src_offset_in_bytes,
+                             job->iv,
+                             job->aes_enc_key_expanded,
+                             job->dst,
+                             job->msg_len_to_cipher_in_bytes,
+                             job->iv_len_in_bytes);
+        else /* assume 32 bytes */
+                AES_CNTR_256(job->src + job->cipher_start_src_offset_in_bytes,
+                             job->iv,
+                             job->aes_enc_key_expanded,
+                             job->dst,
+                             job->msg_len_to_cipher_in_bytes,
+                             job->iv_len_in_bytes);
+
+        job->status |= STS_COMPLETED_AES;
+        return job;
+}
+
+/*
+ * Bit-length variant of submit_job_aes_cntr_avx().
+ *
+ * Identical dispatch, but calls aes_cntr_bit_{128,192,256}_avx and passes
+ * job->msg_len_to_cipher_in_bits as the message length. The source offset
+ * is still taken from cipher_start_src_offset_in_bytes.
+ */
+IMB_DLL_LOCAL JOB_AES_HMAC *
+submit_job_aes_cntr_bit_avx(JOB_AES_HMAC *job)
+{
+        if (16 == job->aes_key_len_in_bytes)
+                aes_cntr_bit_128_avx(job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->iv,
+                                     job->aes_enc_key_expanded,
+                                     job->dst,
+                                     job->msg_len_to_cipher_in_bits,
+                                     job->iv_len_in_bytes);
+        else if (24 == job->aes_key_len_in_bytes)
+                aes_cntr_bit_192_avx(job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->iv,
+                                     job->aes_enc_key_expanded,
+                                     job->dst,
+                                     job->msg_len_to_cipher_in_bits,
+                                     job->iv_len_in_bytes);
+        else /* assume 32 bytes */
+                aes_cntr_bit_256_avx(job->src +
+                                     job->cipher_start_src_offset_in_bytes,
+                                     job->iv,
+                                     job->aes_enc_key_expanded,
+                                     job->dst,
+                                     job->msg_len_to_cipher_in_bits,
+                                     job->iv_len_in_bytes);
+
+        job->status |= STS_COMPLETED_AES;
+        return job;
+}
+
+void
+init_mb_mgr_avx(MB_MGR *state)
+{
+        unsigned int j;
+        uint8_t *p;
+        size_t size;
+
+        state->features = cpu_feature_adjust(state->flags,
+                                             cpu_feature_detect());
+
+        if (!(state->features & IMB_FEATURE_AESNI)) {
+                init_mb_mgr_sse_no_aesni(state);
+                return;
+        }
+
+        /* Init AES out-of-order fields */
+        memset(state->aes128_ooo.lens, 0xFF,
+               sizeof(state->aes128_ooo.lens));
+        memset(&state->aes128_ooo.lens[0], 0,
+               sizeof(state->aes128_ooo.lens[0]) * 8);
+        memset(state->aes128_ooo.job_in_lane, 0,
+               sizeof(state->aes128_ooo.job_in_lane));
+        state->aes128_ooo.unused_lanes = 0xF76543210;
+        state->aes128_ooo.num_lanes_inuse = 0;
+
+        memset(state->aes192_ooo.lens, 0xFF,
+               sizeof(state->aes192_ooo.lens));
+        memset(&state->aes192_ooo.lens[0], 0,
+               sizeof(state->aes192_ooo.lens[0]) * 8);
+        memset(state->aes192_ooo.job_in_lane, 0,
+               sizeof(state->aes192_ooo.job_in_lane));
+        state->aes192_ooo.unused_lanes = 0xF76543210;
+        state->aes192_ooo.num_lanes_inuse = 0;
+
+        memset(&state->aes256_ooo.lens, 0xFF,
+               sizeof(state->aes256_ooo.lens));
+        memset(&state->aes256_ooo.lens[0], 0,
+               sizeof(state->aes256_ooo.lens[0]) * 8);
+        memset(state->aes256_ooo.job_in_lane, 0,
+               sizeof(state->aes256_ooo.job_in_lane));
+        state->aes256_ooo.unused_lanes = 0xF76543210;
+        state->aes256_ooo.num_lanes_inuse = 0;
+
+        /* DOCSIS SEC BPI (AES CBC + AES CFB for partial block)
+         * uses same settings as AES128 CBC.
+ */ + memset(state->docsis_sec_ooo.lens, 0xFF, + sizeof(state->docsis_sec_ooo.lens)); + memset(&state->docsis_sec_ooo.lens[0], 0, + sizeof(state->docsis_sec_ooo.lens[0]) * 8); + memset(state->docsis_sec_ooo.job_in_lane, 0, + sizeof(state->docsis_sec_ooo.job_in_lane)); + state->docsis_sec_ooo.unused_lanes = 0xF76543210; + state->docsis_sec_ooo.num_lanes_inuse = 0; + + + /* Init HMAC/SHA1 out-of-order fields */ + state->hmac_sha_1_ooo.lens[0] = 0; + state->hmac_sha_1_ooo.lens[1] = 0; + state->hmac_sha_1_ooo.lens[2] = 0; + state->hmac_sha_1_ooo.lens[3] = 0; + state->hmac_sha_1_ooo.lens[4] = 0xFFFF; + state->hmac_sha_1_ooo.lens[5] = 0xFFFF; + state->hmac_sha_1_ooo.lens[6] = 0xFFFF; + state->hmac_sha_1_ooo.lens[7] = 0xFFFF; + state->hmac_sha_1_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < AVX_NUM_SHA1_LANES; j++) { + state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL; + state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65, + 0x00, + 64+7); + p = state->hmac_sha_1_ooo.ldata[j].outer_block; + memset(p + 5*4 + 1, + 0x00, + 64 - 5*4 - 1 - 2); + p[5*4] = 0x80; + p[64-2] = 0x02; + p[64-1] = 0xA0; + } + /* Init HMAC/SHA224 out-of-order fields */ + state->hmac_sha_224_ooo.lens[0] = 0; + state->hmac_sha_224_ooo.lens[1] = 0; + state->hmac_sha_224_ooo.lens[2] = 0; + state->hmac_sha_224_ooo.lens[3] = 0; + state->hmac_sha_224_ooo.lens[4] = 0xFFFF; + state->hmac_sha_224_ooo.lens[5] = 0xFFFF; + state->hmac_sha_224_ooo.lens[6] = 0xFFFF; + state->hmac_sha_224_ooo.lens[7] = 0xFFFF; + state->hmac_sha_224_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < AVX_NUM_SHA256_LANES; j++) { + state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL; + + p = state->hmac_sha_224_ooo.ldata[j].extra_block; + size = sizeof(state->hmac_sha_224_ooo.ldata[j].extra_block); + memset (p, 0x00, size); + p[64] = 0x80; + + p = state->hmac_sha_224_ooo.ldata[j].outer_block; + size = sizeof(state->hmac_sha_224_ooo.ldata[j].outer_block); + memset(p, 0x00, 
size); + p[7 * 4] = 0x80; /* digest 7 words long */ + p[64 - 2] = 0x02; /* length in little endian = 0x02E0 */ + p[64 - 1] = 0xE0; + } + + /* Init HMAC/SHA256 out-of-order fields */ + state->hmac_sha_256_ooo.lens[0] = 0; + state->hmac_sha_256_ooo.lens[1] = 0; + state->hmac_sha_256_ooo.lens[2] = 0; + state->hmac_sha_256_ooo.lens[3] = 0; + state->hmac_sha_256_ooo.lens[4] = 0xFFFF; + state->hmac_sha_256_ooo.lens[5] = 0xFFFF; + state->hmac_sha_256_ooo.lens[6] = 0xFFFF; + state->hmac_sha_256_ooo.lens[7] = 0xFFFF; + state->hmac_sha_256_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < AVX_NUM_SHA256_LANES; j++) { + state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL; + state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65, + 0x00, + 64+7); + p = state->hmac_sha_256_ooo.ldata[j].outer_block; + memset(p + 8*4 + 1, + 0x00, + 64 - 8*4 - 1 - 2); + p[8 * 4] = 0x80; /* 8 digest words */ + p[64 - 2] = 0x03; /* length */ + p[64 - 1] = 0x00; + } + + + /* Init HMAC/SHA384 out-of-order fields */ + state->hmac_sha_384_ooo.lens[0] = 0; + state->hmac_sha_384_ooo.lens[1] = 0; + state->hmac_sha_384_ooo.lens[2] = 0xFFFF; + state->hmac_sha_384_ooo.lens[3] = 0xFFFF; + state->hmac_sha_384_ooo.lens[4] = 0xFFFF; + state->hmac_sha_384_ooo.lens[5] = 0xFFFF; + state->hmac_sha_384_ooo.lens[6] = 0xFFFF; + state->hmac_sha_384_ooo.lens[7] = 0xFFFF; + state->hmac_sha_384_ooo.unused_lanes = 0xFF0100; + for (j = 0; j < AVX_NUM_SHA512_LANES; j++) { + MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo; + + ctx->ldata[j].job_in_lane = NULL; + ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80; + memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1), + 0x00, SHA_384_BLOCK_SIZE + 7); + + p = ctx->ldata[j].outer_block; + memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00, + /* special end point because this length is constant */ + SHA_384_BLOCK_SIZE - + SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2); + /* mark the end */ + 
p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80; + /* hmac outer block length always of fixed size, + * it is OKey length, a whole message block length, 1024 bits, + * with padding plus the length of the inner digest, + * which is 384 bits, 1408 bits == 0x0580. + * The input message block needs to be converted to big endian + * within the sha implementation before use. + */ + p[SHA_384_BLOCK_SIZE - 2] = 0x05; + p[SHA_384_BLOCK_SIZE - 1] = 0x80; + } + + /* Init HMAC/SHA512 out-of-order fields */ + state->hmac_sha_512_ooo.lens[0] = 0; + state->hmac_sha_512_ooo.lens[1] = 0; + state->hmac_sha_512_ooo.lens[2] = 0xFFFF; + state->hmac_sha_512_ooo.lens[3] = 0xFFFF; + state->hmac_sha_512_ooo.lens[4] = 0xFFFF; + state->hmac_sha_512_ooo.lens[5] = 0xFFFF; + state->hmac_sha_512_ooo.lens[6] = 0xFFFF; + state->hmac_sha_512_ooo.lens[7] = 0xFFFF; + state->hmac_sha_512_ooo.unused_lanes = 0xFF0100; + for (j = 0; j < AVX_NUM_SHA512_LANES; j++) { + MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo; + + ctx->ldata[j].job_in_lane = NULL; + ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80; + memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1), + 0x00, SHA_512_BLOCK_SIZE + 7); + p = ctx->ldata[j].outer_block; + memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00, + /* special end point because this length is constant */ + SHA_512_BLOCK_SIZE - + SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2); + /* mark the end */ + p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80; + /* + * hmac outer block length always of fixed size, + * it is OKey length, a whole message block length, 1024 bits, + * with padding plus the length of the inner digest, + * which is 512 bits, 1536 bits == 0x600. + * The input message block needs to be converted to big endian + * within the sha implementation before use. 
+ */ + p[SHA_512_BLOCK_SIZE - 2] = 0x06; + p[SHA_512_BLOCK_SIZE - 1] = 0x00; + } + + + /* Init HMAC/MD5 out-of-order fields */ + state->hmac_md5_ooo.lens[0] = 0; + state->hmac_md5_ooo.lens[1] = 0; + state->hmac_md5_ooo.lens[2] = 0; + state->hmac_md5_ooo.lens[3] = 0; + state->hmac_md5_ooo.lens[4] = 0; + state->hmac_md5_ooo.lens[5] = 0; + state->hmac_md5_ooo.lens[6] = 0; + state->hmac_md5_ooo.lens[7] = 0; + state->hmac_md5_ooo.lens[8] = 0xFFFF; + state->hmac_md5_ooo.lens[9] = 0xFFFF; + state->hmac_md5_ooo.lens[10] = 0xFFFF; + state->hmac_md5_ooo.lens[11] = 0xFFFF; + state->hmac_md5_ooo.lens[12] = 0xFFFF; + state->hmac_md5_ooo.lens[13] = 0xFFFF; + state->hmac_md5_ooo.lens[14] = 0xFFFF; + state->hmac_md5_ooo.lens[15] = 0xFFFF; + state->hmac_md5_ooo.unused_lanes = 0xF76543210; + for (j = 0; j < AVX_NUM_MD5_LANES; j++) { + state->hmac_md5_ooo.ldata[j].job_in_lane = NULL; + + p = state->hmac_md5_ooo.ldata[j].extra_block; + size = sizeof(state->hmac_md5_ooo.ldata[j].extra_block); + memset (p, 0x00, size); + p[64] = 0x80; + + p = state->hmac_md5_ooo.ldata[j].outer_block; + size = sizeof(state->hmac_md5_ooo.ldata[j].outer_block); + memset(p, 0x00, size); + p[4 * 4] = 0x80; + p[64 - 7] = 0x02; + p[64 - 8] = 0x80; + } + + /* Init AES/XCBC OOO fields */ + state->aes_xcbc_ooo.lens[0] = 0; + state->aes_xcbc_ooo.lens[1] = 0; + state->aes_xcbc_ooo.lens[2] = 0; + state->aes_xcbc_ooo.lens[3] = 0; + state->aes_xcbc_ooo.lens[4] = 0; + state->aes_xcbc_ooo.lens[5] = 0; + state->aes_xcbc_ooo.lens[6] = 0; + state->aes_xcbc_ooo.lens[7] = 0; + state->aes_xcbc_ooo.unused_lanes = 0xF76543210; + for (j = 0; j < 8; j++) { + state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL; + state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80; + memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15); + } + + /* Init AES-CCM auth out-of-order fields */ + for (j = 0; j < 8; j++) { + state->aes_ccm_ooo.init_done[j] = 0; + state->aes_ccm_ooo.lens[j] = 0; + state->aes_ccm_ooo.job_in_lane[j] = NULL; + } + 
state->aes_ccm_ooo.unused_lanes = 0xF76543210; + + /* Init AES-CMAC auth out-of-order fields */ + for (j = 0; j < 8; j++) { + state->aes_cmac_ooo.init_done[j] = 0; + state->aes_cmac_ooo.lens[j] = 0; + state->aes_cmac_ooo.job_in_lane[j] = NULL; + } + state->aes_cmac_ooo.unused_lanes = 0xF76543210; + + /* Init "in order" components */ + state->next_job = 0; + state->earliest_job = -1; + + /* set AVX handlers */ + state->get_next_job = get_next_job_avx; + state->submit_job = submit_job_avx; + state->submit_job_nocheck = submit_job_nocheck_avx; + state->get_completed_job = get_completed_job_avx; + state->flush_job = flush_job_avx; + state->queue_size = queue_size_avx; + state->keyexp_128 = aes_keyexp_128_avx; + state->keyexp_192 = aes_keyexp_192_avx; + state->keyexp_256 = aes_keyexp_256_avx; + state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_avx; + state->xcbc_keyexp = aes_xcbc_expand_key_avx; + state->des_key_sched = des_key_schedule; + state->sha1_one_block = sha1_one_block_avx; + state->sha1 = sha1_avx; + state->sha224_one_block = sha224_one_block_avx; + state->sha224 = sha224_avx; + state->sha256_one_block = sha256_one_block_avx; + state->sha256 = sha256_avx; + state->sha384_one_block = sha384_one_block_avx; + state->sha384 = sha384_avx; + state->sha512_one_block = sha512_one_block_avx; + state->sha512 = sha512_avx; + state->md5_one_block = md5_one_block_avx; + state->aes128_cfb_one = aes_cfb_128_one_avx; + + state->eea3_1_buffer = zuc_eea3_1_buffer_avx; + state->eea3_4_buffer = zuc_eea3_4_buffer_avx; + state->eea3_n_buffer = zuc_eea3_n_buffer_avx; + state->eia3_1_buffer = zuc_eia3_1_buffer_avx; + + state->f8_1_buffer = kasumi_f8_1_buffer_avx; + state->f8_1_buffer_bit = kasumi_f8_1_buffer_bit_avx; + state->f8_2_buffer = kasumi_f8_2_buffer_avx; + state->f8_3_buffer = kasumi_f8_3_buffer_avx; + state->f8_4_buffer = kasumi_f8_4_buffer_avx; + state->f8_n_buffer = kasumi_f8_n_buffer_avx; + state->f9_1_buffer = kasumi_f9_1_buffer_avx; + state->f9_1_buffer_user = 
kasumi_f9_1_buffer_user_avx; + state->kasumi_init_f8_key_sched = kasumi_init_f8_key_sched_avx; + state->kasumi_init_f9_key_sched = kasumi_init_f9_key_sched_avx; + state->kasumi_key_sched_size = kasumi_key_sched_size_avx; + + state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_avx; + state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_avx; + state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_avx; + state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_avx; + state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_avx; + state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_avx; + state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_avx; + state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_avx; + state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_avx; + state->snow3g_init_key_sched = snow3g_init_key_sched_avx; + state->snow3g_key_sched_size = snow3g_key_sched_size_avx; + +#ifndef NO_GCM + state->gcm128_enc = aes_gcm_enc_128_avx_gen2; + state->gcm192_enc = aes_gcm_enc_192_avx_gen2; + state->gcm256_enc = aes_gcm_enc_256_avx_gen2; + state->gcm128_dec = aes_gcm_dec_128_avx_gen2; + state->gcm192_dec = aes_gcm_dec_192_avx_gen2; + state->gcm256_dec = aes_gcm_dec_256_avx_gen2; + state->gcm128_init = aes_gcm_init_128_avx_gen2; + state->gcm192_init = aes_gcm_init_192_avx_gen2; + state->gcm256_init = aes_gcm_init_256_avx_gen2; + state->gcm128_enc_update = aes_gcm_enc_128_update_avx_gen2; + state->gcm192_enc_update = aes_gcm_enc_192_update_avx_gen2; + state->gcm256_enc_update = aes_gcm_enc_256_update_avx_gen2; + state->gcm128_dec_update = aes_gcm_dec_128_update_avx_gen2; + state->gcm192_dec_update = aes_gcm_dec_192_update_avx_gen2; + state->gcm256_dec_update = aes_gcm_dec_256_update_avx_gen2; + state->gcm128_enc_finalize = aes_gcm_enc_128_finalize_avx_gen2; + state->gcm192_enc_finalize = aes_gcm_enc_192_finalize_avx_gen2; + state->gcm256_enc_finalize = aes_gcm_enc_256_finalize_avx_gen2; + state->gcm128_dec_finalize = aes_gcm_dec_128_finalize_avx_gen2; + state->gcm192_dec_finalize = 
aes_gcm_dec_192_finalize_avx_gen2; + state->gcm256_dec_finalize = aes_gcm_dec_256_finalize_avx_gen2; + state->gcm128_precomp = aes_gcm_precomp_128_avx_gen2; + state->gcm192_precomp = aes_gcm_precomp_192_avx_gen2; + state->gcm256_precomp = aes_gcm_precomp_256_avx_gen2; + state->gcm128_pre = aes_gcm_pre_128_avx_gen2; + state->gcm192_pre = aes_gcm_pre_192_avx_gen2; + state->gcm256_pre = aes_gcm_pre_256_avx_gen2; +#endif +} + +#include "mb_mgr_code.h" diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm new file mode 100644 index 000000000..750a630aa --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm @@ -0,0 +1,298 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha1_mult_avx
+
+section .data
+default rel
+
+align 16
+; vpshufb control that reverses the byte order inside each 32-bit word
+byteswap:	;ddq 0x0c0d0e0f08090a0b0405060700010203
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+x80:	;ddq 0x00000000000000000000000000000080
+	dq 0x0000000000000080, 0x0000000000000000
+x00:	;ddq 0x00000000000000000000000000000000
+	dq 0x0000000000000000, 0x0000000000000000
+
+; One 16-byte mask per lane: OR-ing mask I into the lens vector forces the
+; 16-bit length of lane I to 0xFFFF.
+len_masks:
+	;ddq 0x0000000000000000000000000000FFFF
+	dq 0x000000000000FFFF, 0x0000000000000000
+	;ddq 0x000000000000000000000000FFFF0000
+	dq 0x00000000FFFF0000, 0x0000000000000000
+	;ddq 0x00000000000000000000FFFF00000000
+	dq 0x0000FFFF00000000, 0x0000000000000000
+	;ddq 0x0000000000000000FFFF000000000000
+	dq 0xFFFF000000000000, 0x0000000000000000
+; lane indices kept in memory because they are used as cmovne source operands
+one:	dq  1
+two:	dq  2
+three:	dq  3
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+
+; NOTE: several names below alias the same register (job/len2/extra_blocks/p
+; share arg2; unused_lanes/lane_data/tmp2 share rbx; job_rax/tmp/size_offset/
+; start_offset share rax), so only one name of each group is live at a time.
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbx, rbp, r12-r15
+%define idx		rbp
+
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2		rbx
+
+%define job_rax         rax
+%define	tmp1		rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3		arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+%define tmp4		r8
+
+%endif
+
+; This routine clobbers rbx, rbp
+struc STACK
+_gpr_save:	resq	2
+_rsp_save:	resq	1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state)
+; arg 1 : state (rdi when LINUX is defined, rcx otherwise)
+;
+; Flush entry point of the HMAC-SHA1 out-of-order manager: drives one queued
+; job to completion without a new submission and returns it in rax.
+;  - Returns NULL (rax = 0) when bit 39 of unused_lanes is set.
+;    NOTE(review): presumably that bit means every lane is free - confirm
+;    against the initial unused_lanes value of this manager.
+;  - Lanes whose job_in_lane is NULL receive the data pointer of a lane that
+;    holds a job and a length of 0xFFFF, which keeps them from being picked
+;    as the minimum ahead of a real job.
+;  - After hashing, the selected lane either continues with its extra
+;    blocks, switches to the outer hash, or - once outer_done is set - is
+;    released and its digest is byte-swapped into auth_tag_output
+;    (12 bytes, or 20 bytes when the requested tag length is not 12).
+; rbx and rbp are saved on entry and restored before returning.
+MKGLOBAL(flush_job_hmac_avx,function,internal)
+flush_job_hmac_avx:
+
+	; keep the caller's rsp in rax, then carve out a 16-byte aligned frame
+	mov	rax, rsp
+	sub	rsp, STACK_size
+	and	rsp, -16
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	mov	unused_lanes, [state + _unused_lanes]
+	bt	unused_lanes, 32+7
+	jc	return_null
+
+	; find a lane with a non-null job
+	; (idx defaults to 0; the highest occupied lane among 1..3 wins)
+	xor	idx, idx
+	cmp	qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel one]
+	cmp	qword [state + _ldata + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel two]
+	cmp	qword [state + _ldata + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne	idx, [rel three]
+
+copy_lane_data:
+	; copy valid lane (idx) to empty lanes
+	vmovdqa	xmm0, [state + _lens]
+	mov	tmp, [state + _args_data_ptr + PTR_SZ*idx]
+
+	; unrolled over the 4 lanes: an empty lane gets the valid data pointer
+	; and its length forced to 0xFFFF
+%assign I 0
+%rep 4
+	cmp	qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	jne	APPEND(skip_,I)
+	mov	[state + _args_data_ptr + PTR_SZ*I], tmp
+	vpor	xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	vmovdqa	[state + _lens], xmm0
+
+	vphminposuw	xmm1, xmm0
+	vpextrw	DWORD(len2), xmm1, 0	; min value
+	vpextrw	DWORD(idx), xmm1, 1	; min index (0...3)
+	cmp	len2, 0
+	je	len_is_0
+
+	; replicate the min length into the four low words and subtract it
+	; from every lane length before hashing that many blocks
+	vpshuflw	xmm1, xmm1, 0
+	vpsubw	xmm0, xmm0, xmm1
+	vmovdqa	[state + _lens], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	sha1_mult_avx
+	; state is intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks
+	cmp	dword [lane_data + _outer_done], 0
+	jne	end_loop
+
+proc_outer:
+	; switch the lane to the outer hash: one block taken from outer_block
+	mov	dword [lane_data + _outer_done], 1
+	mov	DWORD(size_offset), [lane_data + _size_offset]
+	; zero the 8 bytes at extra_block + size_offset
+	mov	qword [lane_data + _extra_block + size_offset], 0
+	mov	word [state + _lens + 2*idx], 1
+	lea	tmp, [lane_data + _outer_block]
+	mov	job, [lane_data + _job_in_lane]
+	mov	[state + _args_data_ptr + PTR_SZ*idx], tmp
+
+	;; idx determines which column
+	;; read off from consecutive rows
+	vmovd	xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+	vpinsrd	xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1
+	vpinsrd	xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2
+	vpinsrd	xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3
+	vpshufb	xmm0, xmm0, [rel byteswap]
+	mov	DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+	bswap	DWORD(tmp)
+	; the byte-swapped inner digest (5 dwords) becomes the outer message
+	vmovdqa	[lane_data + _outer_block], xmm0
+	mov	[lane_data + _outer_block + 4*4], DWORD(tmp)
+
+	; reload the lane's digest words from the job's auth_key_xor_opad buffer
+	mov	tmp, [job + _auth_key_xor_opad]
+	vmovdqu	xmm0, [tmp]
+	mov	DWORD(tmp),  [tmp + 4*4]
+	vmovd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0
+	vpextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1
+	vpextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2
+	vpextrd	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3
+	mov	[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp)
+	jmp	copy_lane_data
+
+	align	16
+proc_extra_blocks:
+	; point the lane at extra_block + start_offset and hash extra_blocks
+	; more blocks; the counter is cleared so this runs once per job
+	mov	DWORD(start_offset), [lane_data + _start_offset]
+	mov	[state + _lens + 2*idx], WORD(extra_blocks)
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr + PTR_SZ*idx], tmp
+	mov	dword [lane_data + _extra_blocks], 0
+	jmp	copy_lane_data
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+	align	16
+end_loop:
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	qword [lane_data + _job_in_lane], 0
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	; lane_data is dead from here on: rbx is reused as unused_lanes/tmp2.
+	; Push idx back as the low byte of the unused-lane list.
+	mov	unused_lanes, [state + _unused_lanes]
+	shl	unused_lanes, 8
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]
+
+	; copy 12 bytes
+	mov	DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE]
+	bswap	DWORD(tmp2)
+	bswap	DWORD(tmp4)
+	mov	[p + 0*4], DWORD(tmp2)
+	mov	[p + 1*4], DWORD(tmp4)
+	mov	DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE]
+	bswap	DWORD(tmp2)
+	mov	[p + 2*4], DWORD(tmp2)
+
+	cmp	qword [job_rax + _auth_tag_output_len_in_bytes], 12
+	je	clear_ret
+
+	;; copy remaining 8 bytes to return 20 byte digest
+	mov	DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE]
+	bswap	DWORD(tmp2)
+	bswap	DWORD(tmp4)
+	mov	[p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2)
+	mov	[p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp4)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+	vpxor	xmm0, xmm0
+
+	;; Clear digest (20B), outer_block (20B) and extra_block (64B)
+	;; of returned job and NULL jobs
+	;; (the returned lane qualifies because its job_in_lane was zeroed above)
+%assign I 0
+%rep 4
+	cmp	qword [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+	jne	APPEND(skip_clear_,I)
+
+	;; Clear digest
+	mov	dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 0*SHA1_DIGEST_ROW_SIZE], 0
+	mov	dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 1*SHA1_DIGEST_ROW_SIZE], 0
+	mov	dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 2*SHA1_DIGEST_ROW_SIZE], 0
+	mov	dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 3*SHA1_DIGEST_ROW_SIZE], 0
+	mov	dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*I + 4*SHA1_DIGEST_ROW_SIZE], 0
+
+	lea	lane_data, [state + _ldata + (I*_HMAC_SHA1_LANE_DATA_size)]
+	;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+	vmovdqa	[lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+	;; Clear first 20 bytes of outer_block
+	vmovdqa	[lane_data + _outer_block], xmm0
+	mov	dword [lane_data + _outer_block + 16], 0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm
new file mode 100644
index 000000000..a53ad0843
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm
@@ -0,0 +1,321 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern md5_x4x2_avx
+
+section .data
+default rel
+align 16
+; vpshufb control that copies word 0 into all eight word positions
+dupw:	;ddq 0x01000100010001000100010001000100
+	dq 0x0100010001000100, 0x0100010001000100
+x80:	;ddq 0x00000000000000000000000000000080
+	dq 0x0000000000000080, 0x0000000000000000
+x00:	;ddq 0x00000000000000000000000000000000
+	dq 0x0000000000000000, 0x0000000000000000
+; One 16-byte mask per lane (8 lanes): OR-ing mask I into the lens vector
+; forces the 16-bit length of lane I to 0xFFFF.
+len_masks:
+	;ddq 0x0000000000000000000000000000FFFF
+	dq 0x000000000000FFFF, 0x0000000000000000
+	;ddq 0x000000000000000000000000FFFF0000
+	dq 0x00000000FFFF0000, 0x0000000000000000
+	;ddq 0x00000000000000000000FFFF00000000
+	dq 0x0000FFFF00000000, 0x0000000000000000
+	;ddq 0x0000000000000000FFFF000000000000
+	dq 0xFFFF000000000000, 0x0000000000000000
+	;ddq 0x000000000000FFFF0000000000000000
+	dq 0x0000000000000000, 0x000000000000FFFF
+	;ddq 0x00000000FFFF00000000000000000000
+	dq 0x0000000000000000, 0x00000000FFFF0000
+	;ddq 0x0000FFFF000000000000000000000000
+	dq 0x0000000000000000, 0x0000FFFF00000000
+	;ddq 0xFFFF0000000000000000000000000000
+	dq 0x0000000000000000, 0xFFFF000000000000
+; lane indices kept in memory because they are used as cmovne source operands
+one:	dq  1
+two:	dq  2
+three:	dq  3
+four:	dq  4
+five:	dq  5
+six:	dq  6
+seven:	dq  7
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%else
+%define arg1	rcx
+%define arg2	rdx
+%endif
+
+; NOTE: several names below alias the same register (job/len2/extra_blocks/p
+; share arg2; unused_lanes/lane_data/tmp2 share rbx; job_rax/tmp/size_offset/
+; start_offset share rax), so only one name of each group is live at a time.
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbp
+%define idx             rbp
+
+; unused_lanes must be in rax-rdx
+%define unused_lanes    rbx
+%define lane_data       rbx
+%define tmp2		rbx
+
+%define job_rax         rax
+%define	tmp1		rax
+%define size_offset     rax
+%define tmp             rax
+%define start_offset    rax
+
+%define tmp3		arg1
+
+%define extra_blocks    arg2
+%define p               arg2
+
+%define tmp4		r8
+%define tmp5		r9
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs
+struc STACK
+_gpr_save:	resq	8
+_rsp_save:	resq	1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state)
+; arg 1 : state (rdi when LINUX is defined, rcx otherwise)
+;
+; Flush entry point of the HMAC-MD5 out-of-order manager (8 lanes): drives
+; one queued job to completion without a new submission and returns it in rax.
+;  - Returns NULL (rax = 0) when bit 35 of unused_lanes is set. The C init
+;    code sets unused_lanes to 0xF76543210, so that bit is set only while
+;    all eight lane indices are still in the free list.
+;  - Lanes whose job_in_lane is NULL receive the data pointer of a lane that
+;    holds a job and a length of 0xFFFF, which keeps them from being picked
+;    as the minimum ahead of a real job.
+;  - After hashing, the selected lane either continues with its extra
+;    blocks, switches to the outer hash, or - once outer_done is set - is
+;    released and its digest is copied to auth_tag_output (12 bytes, or 16
+;    bytes when the requested tag length is not 12).
+; The lane stride is _HMAC_SHA1_LANE_DATA_size.
+; NOTE(review): presumably the MD5 manager shares the HMAC-SHA1 lane layout -
+; confirm in mb_mgr_datastruct.asm.
+; rbx, rbp, r12-r15 (plus rsi/rdi when LINUX is not defined) are saved on
+; entry and restored before returning.
+MKGLOBAL(flush_job_hmac_md5_avx,function,internal)
+flush_job_hmac_md5_avx:
+
+	; keep the caller's rsp in rax, then carve out a 16-byte aligned frame
+	mov	rax, rsp
+	sub	rsp, STACK_size
+	and	rsp, -16
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _gpr_save + 8*2], r12
+	mov	[rsp + _gpr_save + 8*3], r13
+	mov	[rsp + _gpr_save + 8*4], r14
+	mov	[rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*6], rsi
+	mov	[rsp + _gpr_save + 8*7], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	mov	unused_lanes, [state + _unused_lanes_md5]
+	bt	unused_lanes, 32+3
+	jc	return_null
+
+	; find a lane with a non-null job
+	; (idx defaults to 0; the highest occupied lane among 1..7 wins)
+	xor	idx, idx
+	cmp	qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel one]
+	cmp	qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel two]
+	cmp	qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel three]
+	cmp	qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel four]
+	cmp	qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel five]
+	cmp	qword [state + _ldata_md5 + 6 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel six]
+	cmp	qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0
+	cmovne	idx, [rel seven]
+
+copy_lane_data:
+	; copy good lane (idx) to empty lanes
+	vmovdqa	xmm0, [state + _lens_md5]
+	mov	tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx]
+
+	; unrolled over the 8 lanes: an empty lane gets the valid data pointer
+	; and its length forced to 0xFFFF
+%assign I 0
+%rep 8
+	cmp	qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	jne	APPEND(skip_,I)
+	mov	[state + _args_data_ptr_md5 + PTR_SZ*I], tmp
+	vpor	xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	vmovdqa	[state + _lens_md5], xmm0
+
+	vphminposuw	xmm1, xmm0
+	vpextrw	DWORD(len2), xmm1, 0	; min value
+	vpextrw	DWORD(idx), xmm1, 1	; min index (0...7)
+	cmp	len2, 0
+	je	len_is_0
+
+	vpshufb	xmm1, [rel dupw]	; duplicate words across all lanes
+	vpsubw	xmm0, xmm0, xmm1
+	vmovdqa	[state + _lens_md5], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_x4x2_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata_md5 + lane_data]
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks
+	cmp	dword [lane_data + _outer_done], 0
+	jne	end_loop
+
+proc_outer:
+	; switch the lane to the outer hash: one block taken from outer_block
+	mov	dword [lane_data + _outer_done], 1
+	mov	DWORD(size_offset), [lane_data + _size_offset]
+	; zero the 8 bytes at extra_block + size_offset
+	mov	qword [lane_data + _extra_block + size_offset], 0
+	mov	word [state + _lens_md5 + 2*idx], 1
+	lea	tmp, [lane_data + _outer_block]
+	mov	job, [lane_data + _job_in_lane]
+	mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+
+	; gather the lane's 4 digest words (one per row) into outer_block
+	vmovd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+	vpinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+	vpinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+	vpinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+	vmovdqa	[lane_data + _outer_block], xmm0
+
+	; reload the lane's digest words from the job's auth_key_xor_opad buffer
+	mov	tmp, [job + _auth_key_xor_opad]
+	vmovdqu	xmm0, [tmp]
+	vmovd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+	jmp	copy_lane_data
+
+	align	16
+proc_extra_blocks:
+	; point the lane at extra_block + start_offset and hash extra_blocks
+	; more blocks; the counter is cleared so this runs once per job
+	mov	DWORD(start_offset), [lane_data + _start_offset]
+	mov	[state + _lens_md5 + 2*idx], WORD(extra_blocks)
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+	mov	dword [lane_data + _extra_blocks], 0
+	jmp	copy_lane_data
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+	align	16
+end_loop:
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	qword [lane_data + _job_in_lane], 0
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	; lane_data is dead from here on: rbx is reused as unused_lanes/tmp2.
+	; Push idx back as the low nibble of the unused-lane list.
+	mov	unused_lanes, [state + _unused_lanes_md5]
+	shl	unused_lanes, 4
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes_md5], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]
+
+	; copy 12 bytes
+	mov	DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+	; digest words are written to the tag as stored (no bswap)
+	mov	[p + 0*4], DWORD(tmp2)
+	mov	[p + 1*4], DWORD(tmp4)
+	mov	[p + 2*4], DWORD(tmp5)
+
+	cmp	DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
+	je	clear_ret
+
+	; copy the remaining 4 bytes to return the full 16 byte digest
+	mov	DWORD(tmp5), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
+	mov	[p + 3*4], DWORD(tmp5)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+	vpxor	xmm0, xmm0
+
+	;; Clear digest (16B), outer_block (16B) and extra_block (64B)
+	;; of returned job and NULL jobs
+	;; (the returned lane qualifies because its job_in_lane was zeroed above)
+%assign I 0
+%rep 8
+	cmp	qword [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+	jne	APPEND(skip_clear_,I)
+
+	;; Clear digest (16 bytes)
+%assign J 0
+%rep 4
+	mov	dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*I + J*MD5_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+
+	lea	lane_data, [state + _ldata_md5 + (I*_HMAC_SHA1_LANE_DATA_size)]
+	;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+	vmovdqa	[lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+	;; Clear first 16 bytes of outer_block
+	vmovdqa	[lane_data + _outer_block], xmm0
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	r13, [rsp + _gpr_save + 8*3]
+	mov	r14, [rsp + _gpr_save + 8*4]
+	mov	r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*6]
+	mov	rdi, [rsp + _gpr_save + 8*7]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm
new file mode 100644
index 000000000..5e4627dca
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm
@@ -0,0 +1,355 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/memcpy.asm"
+%include "include/reg_sizes.asm"
+%include "include/const.inc"
+
+extern md5_x4x2_avx
+
+section .data
+default rel
+align 16
+; vpshufb control that copies word 0 into all eight word positions
+dupw:	;ddq 0x01000100010001000100010001000100
+	dq 0x0100010001000100, 0x0100010001000100
+
+section .text
+
+%if 1
+%ifdef LINUX
+%define arg1	rdi
+%define arg2	rsi
+%define reg3	rcx
+%define reg4	rdx
+%else
+%define arg1	rcx
+%define arg2	rdx
+%define reg3	rdi
+%define reg4	rsi
+%endif
+
+; NOTE: several names below alias the same register (job/len2 share arg2;
+; last_len/idx share rbp; p/start_offset share r11; unused_lanes/tmp4 share
+; rbx; job_rax/len share rax; size_offset/tmp2 share reg3; lane/tmp3 share
+; reg4; tmp/p2 share r9), so only one name of each group is live at a time.
+%define state	arg1
+%define job	arg2
+%define len2	arg2
+
+
+; idx needs to be in rbp
+%define last_len        rbp
+%define idx             rbp
+
+%define p               r11
+%define start_offset    r11
+
+%define unused_lanes    rbx
+%define tmp4            rbx
+
+%define job_rax         rax
+%define len             rax
+
+%define size_offset     reg3
+%define tmp2		reg3
+
+%define lane            reg4
+%define tmp3		reg4
+
+%define extra_blocks    r8
+
+%define tmp             r9
+%define p2              r9
+
+%define lane_data       r10
+
+%endif
+
+; This routine and/or the called routine clobbers all GPRs
+struc STACK
+_gpr_save:	resq	8
+_rsp_save:	resq	1
+endstruc
+
+; JOB* submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state (rdi when LINUX is defined, rcx otherwise)
+; arg 2 : job   (rsi when LINUX is defined, rdx otherwise)
+;
+; Submit entry point of the HMAC-MD5 out-of-order manager (8 lanes).
+;  - Pops a lane index from the low nibble of unused_lanes and records the
+;    job, its length in 64-byte blocks and its source pointer in that lane.
+;  - Copies the last (up to 64) message bytes into the lane's extra_block,
+;    stores the total bit length at extra_block + size_offset and loads the
+;    lane digest from the job's auth_key_xor_ipad buffer.
+;  - Returns NULL (rax = 0) unless unused_lanes is down to 0xF afterwards;
+;    the C init code sets it to 0xF76543210, so 0xF means no free lane is
+;    left. In that case md5_x4x2_avx is run until one job has finished its
+;    outer hash, and that job is returned with its digest copied to
+;    auth_tag_output (12 bytes, or 16 when the requested length is not 12).
+; The lane stride is _HMAC_SHA1_LANE_DATA_size.
+; NOTE(review): presumably the MD5 manager shares the HMAC-SHA1 lane layout -
+; confirm in mb_mgr_datastruct.asm.
+; rbx, rbp, r12-r15 (plus rsi/rdi when LINUX is not defined) are saved on
+; entry and restored before returning.
+MKGLOBAL(submit_job_hmac_md5_avx,function,internal)
+submit_job_hmac_md5_avx:
+
+	; keep the caller's rsp in rax, then carve out a 16-byte aligned frame
+	mov	rax, rsp
+	sub	rsp, STACK_size
+	and	rsp, -16
+
+	mov	[rsp + _gpr_save + 8*0], rbx
+	mov	[rsp + _gpr_save + 8*1], rbp
+	mov	[rsp + _gpr_save + 8*2], r12
+	mov	[rsp + _gpr_save + 8*3], r13
+	mov	[rsp + _gpr_save + 8*4], r14
+	mov	[rsp + _gpr_save + 8*5], r15
+%ifndef LINUX
+	mov	[rsp + _gpr_save + 8*6], rsi
+	mov	[rsp + _gpr_save + 8*7], rdi
+%endif
+	mov	[rsp + _rsp_save], rax	; original SP
+
+	; pop the next free lane index (low nibble) off the unused-lane list
+	mov	unused_lanes, [state + _unused_lanes_md5]
+	mov	lane, unused_lanes
+	and	lane, 0xF
+	shr	unused_lanes, 4
+	imul	lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata_md5 + lane_data]
+	mov	[state + _unused_lanes_md5], unused_lanes
+	mov	len, [job + _msg_len_to_hash_in_bytes]
+	mov	tmp, len
+	shr	tmp, 6	; divide by 64, len in terms of blocks
+
+	mov	[lane_data + _job_in_lane], job
+	mov	dword [lane_data + _outer_done], 0
+
+	;; insert len into proper lane
+	vmovdqa	xmm0, [state + _lens_md5]
+	XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+	vmovdqa	[state + _lens_md5], xmm0
+
+	; extra_blocks = ceil((last_len + 9) / 64), last_len = len mod 64
+	mov	last_len, len
+	and	last_len, 63
+	lea	extra_blocks, [last_len + 9 + 63]
+	shr	extra_blocks, 6
+	mov	[lane_data + _extra_blocks], DWORD(extra_blocks)
+
+	mov	p, [job + _src]
+	add	p, [job + _hash_start_src_offset_in_bytes]
+	mov	[state + _args_data_ptr_md5 + PTR_SZ*lane], p
+
+	cmp	len, 64
+	jb	copy_lt64
+
+fast_copy:
+	; copy the last 64 bytes of the message into extra_block[0..63]
+	add	p, len
+	vmovdqu	xmm0, [p - 64 + 0*16]
+	vmovdqu	xmm1, [p - 64 + 1*16]
+	vmovdqu	xmm2, [p - 64 + 2*16]
+	vmovdqu	xmm3, [p - 64 + 3*16]
+	vmovdqa	[lane_data + _extra_block + 0*16], xmm0
+	vmovdqa	[lane_data + _extra_block + 1*16], xmm1
+	vmovdqa	[lane_data + _extra_block + 2*16], xmm2
+	vmovdqa	[lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+	; size_offset  = 64*extra_blocks - last_len + 56, i.e. the last 8 bytes
+	;                of the final extra block counted from start_offset
+	; start_offset = 64 - last_len, where the trailing message bytes begin
+	mov	size_offset, extra_blocks
+	shl	size_offset, 6
+	sub	size_offset, last_len
+	add	size_offset, 64-8
+	mov	[lane_data + _size_offset], DWORD(size_offset)
+	mov	start_offset, 64
+	sub	start_offset, last_len
+	mov	[lane_data + _start_offset], DWORD(start_offset)
+
+	; total length in bits, (64 + len) * 8, stored without a byte swap
+	; NOTE(review): the extra 64 bytes presumably account for the ipad
+	; block that precedes the message - confirm
+	lea	tmp, [8*64 + 8*len]
+	mov	[lane_data + _extra_block + size_offset], tmp
+
+	; load the lane's digest words from the job's auth_key_xor_ipad buffer
+	mov	tmp, [job + _auth_key_xor_ipad]
+	vmovdqu	xmm0, [tmp]
+	vmovd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0
+	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+
+	test	len, ~63
+	jnz	ge64_bytes
+
+lt64_bytes:
+	; no full block in the source: hash straight from extra_block and use
+	; extra_blocks as the lane length
+	vmovdqa	xmm0, [state + _lens_md5]
+	XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+	vmovdqa	[state + _lens_md5], xmm0
+
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr_md5 + PTR_SZ*lane], tmp
+	mov	dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+	; only start hashing once the free list is down to 0xF
+	cmp	unused_lanes, 0xf
+	jne	return_null
+	jmp	start_loop
+
+	align	16
+start_loop:
+	; Find min length
+	vmovdqa	xmm0, [state + _lens_md5]
+	vphminposuw	xmm1, xmm0
+	vpextrw	DWORD(len2), xmm1, 0	; min value
+	vpextrw	DWORD(idx), xmm1, 1	; min index (0...7)
+	cmp	len2, 0
+	je	len_is_0
+
+	vpshufb	xmm1, xmm1, [rel dupw]	; duplicate words across all lanes
+	vpsubw	xmm0, xmm0, xmm1
+	vmovdqa	[state + _lens_md5], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_x4x2_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata_md5 + lane_data]
+	mov	DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp	extra_blocks, 0
+	jne	proc_extra_blocks
+	cmp	dword [lane_data + _outer_done], 0
+	jne	end_loop
+
+proc_outer:
+	; switch the lane to the outer hash: one block taken from outer_block
+	mov	dword [lane_data + _outer_done], 1
+	mov	DWORD(size_offset), [lane_data + _size_offset]
+	; zero the 8-byte length stored at extra_block + size_offset
+	mov	qword [lane_data + _extra_block + size_offset], 0
+
+	vmovdqa	xmm0, [state + _lens_md5]
+	XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+	vmovdqa	[state + _lens_md5], xmm0
+
+	lea	tmp, [lane_data + _outer_block]
+	mov	job, [lane_data + _job_in_lane]
+	mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+
+	; gather the lane's 4 digest words (one per row) into outer_block
+	vmovd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+	vpinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1
+	vpinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2
+	vpinsrd	xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3
+	; the inner digest is stored as gathered (no byte shuffle)
+	vmovdqa	[lane_data + _outer_block], xmm0
+
+	; reload the lane's digest words from the job's auth_key_xor_opad buffer
+	mov	tmp, [job + _auth_key_xor_opad]
+	vmovdqu	xmm0, [tmp]
+	vmovd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0
+	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1
+	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2
+	vpextrd	[state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3
+	jmp	start_loop
+
+	align	16
+proc_extra_blocks:
+	; point the lane at extra_block + start_offset and hash extra_blocks
+	; more blocks; the counter is cleared so this runs once per job
+	mov	DWORD(start_offset), [lane_data + _start_offset]
+
+	vmovdqa	xmm0, [state + _lens_md5]
+	XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+	vmovdqa	[state + _lens_md5], xmm0
+
+	lea	tmp, [lane_data + _extra_block + start_offset]
+	mov	[state + _args_data_ptr_md5 + PTR_SZ*idx], tmp
+	mov	dword [lane_data + _extra_blocks], 0
+	jmp	start_loop
+
+	align	16
+
+copy_lt64:
+	;; less than one message block of data
+	;; beginning of source block
+	;; destination extrablock but backwards by len from where 0x80 pre-populated
+	;; tmp4 (rbx) is handed to the copy macro and aliases unused_lanes,
+	;; so unused_lanes is reloaded from state before exiting
+	lea	p2, [lane_data + _extra_block  + 64]
+	sub     p2, len
+	memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+	mov	unused_lanes, [state + _unused_lanes_md5]
+	jmp	end_fast_copy
+
+return_null:
+	xor	job_rax, job_rax
+	jmp	return
+
+	align	16
+end_loop:
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	unused_lanes, [state + _unused_lanes_md5]
+	mov	qword [lane_data + _job_in_lane], 0
+	or	dword [job_rax + _status], STS_COMPLETED_HMAC
+	; push idx back as the low nibble of the unused-lane list
+	shl	unused_lanes, 4
+	or	unused_lanes, idx
+	mov	[state + _unused_lanes_md5], unused_lanes
+
+	mov	p, [job_rax + _auth_tag_output]
+
+	; copy 12 bytes
+	mov	DWORD(tmp),  [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE]
+	mov	DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE]
+	mov	[p + 0*4], DWORD(tmp)
+	mov	[p + 1*4], DWORD(tmp2)
+	mov	[p + 2*4], DWORD(tmp3)
+
+	cmp	DWORD [job_rax + _auth_tag_output_len_in_bytes], 12
+	je	clear_ret
+
+	; copy the remaining 4 bytes to return the full 16 byte digest
+	mov	DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE]
+	mov	[p + 3*4], DWORD(tmp3)
+
+clear_ret:
+
+%ifdef SAFE_DATA
+	;; Clear digest (16B), outer_block (16B) and extra_block (64B) of returned job
+	mov	dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], 0
+	mov	dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 0
+	mov	dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 0
+	mov	dword [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 0
+
+	vpxor	xmm0, xmm0
+	imul	lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea	lane_data, [state + _ldata_md5 + lane_data]
+	;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+	vmovdqa	[lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+	;; Clear first 16 bytes of outer_block
+	vmovdqa	[lane_data + _outer_block], xmm0
+%endif
+
+return:
+
+	mov	rbx, [rsp + _gpr_save + 8*0]
+	mov	rbp, [rsp + _gpr_save + 8*1]
+	mov	r12, [rsp + _gpr_save + 8*2]
+	mov	r13, [rsp + _gpr_save + 8*3]
+	mov	r14, [rsp + _gpr_save + 8*4]
+	mov	r15, [rsp + _gpr_save + 8*5]
+%ifndef LINUX
+	mov	rsi, [rsp + _gpr_save + 8*6]
+	mov	rdi, [rsp + _gpr_save + 8*7]
+%endif
+	mov	rsp, [rsp + _rsp_save]	; original SP
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm
new file mode 100644
index 000000000..416dfb869
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; HMAC-SHA-224 flush entry point.
+;; No code of its own: it names the function via FUNC, defines SHA224 and then
+;; includes the SHA-256 flush source, which is assembled under that name with
+;; its %ifdef SHA224 / %ifndef SHA224 sections selected accordingly.
+%define FUNC flush_job_hmac_sha_224_avx
+%define SHA224
+
+%include "avx/mb_mgr_hmac_sha_256_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm
new file mode 100644
index 000000000..ad0721cd7
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; HMAC-SHA-224 submit entry point.
+;; No code of its own: it names the function via FUNC, defines SHA224 and then
+;; includes the SHA-256 submit source, which is assembled under that name with
+;; its %ifdef SHA224 / %ifndef SHA224 sections selected accordingly.
+%define FUNC submit_job_hmac_sha_224_avx
+%define SHA224
+
+%include "avx/mb_mgr_hmac_sha_256_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm
new file mode 100644
index 000000000..0d8b8e50e
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm
@@ -0,0 +1,356 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha_256_mult_avx
+
+section .data
+default rel
+align 16
+;; vpshufb mask: reverses the byte order inside each 32-bit word
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+;; len_masks + 16*I: OR-mask that forces the 16-bit length word of lane I
+;; to 0xFFFF (applied in copy_lane_data to lanes that hold no job)
+len_masks:
+	;ddq 0x0000000000000000000000000000FFFF
+	dq 0x000000000000FFFF, 0x0000000000000000
+	;ddq 0x000000000000000000000000FFFF0000
+	dq 0x00000000FFFF0000, 0x0000000000000000
+	;ddq 0x00000000000000000000FFFF00000000
+	dq 0x0000FFFF00000000, 0x0000000000000000
+	;ddq 0x0000000000000000FFFF000000000000
+	dq 0xFFFF000000000000, 0x0000000000000000
+;; memory operands for the cmovne instructions that choose the lane index
+one: dq 1
+two: dq 2
+three: dq 3
+
+section .text
+
+;; FUNC may be pre-defined by a file that includes this one; otherwise the
+;; routine is assembled under its SHA-256 name.
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_256_avx
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+;; Register aliases. Several names map to the same register: job, len2,
+;; extra_blocks and p are all arg2; unused_lanes, lane_data and tmp2 are all
+;; rbx; job_rax, tmp1, size_offset, tmp and start_offset are all rax; tmp3 is
+;; arg1, the same register as state.
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%define tmp5 r9
+
+%define tmp6 r10
+
+%endif
+
+; This routine clobbers rbx, rbp; called routine also clobbers r12
+struc STACK
+_gpr_save: resq 3
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state)
+; arg 1 : rdi (LINUX) / rcx (otherwise) : state
+;
+; Takes no new job. Keeps hashing the lanes that already hold jobs until one
+; of them has finished, then returns that job in rax with STS_COMPLETED_HMAC
+; or-ed into its status and its tag written to _auth_tag_output.
+; Returns NULL when bit 39 of unused_lanes is set
+; (presumably: all four lanes are free - TODO confirm against the state init).
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+	mov rax, rsp		; keep the caller's rsp; it is stored in _rsp_save below
+	sub rsp, STACK_size
+	and rsp, -16		; 16-byte align the local frame
+
+	mov [rsp + _gpr_save + 8*0], rbx
+	mov [rsp + _gpr_save + 8*1], rbp
+	mov [rsp + _gpr_save + 8*2], r12
+	mov [rsp + _rsp_save], rax ; original SP
+
+	mov unused_lanes, [state + _unused_lanes_sha256]
+	bt unused_lanes, 32+7
+	jc return_null
+
+	; find a lane with a non-null job
+	; (idx stays 0 unless lane 1, 2 or 3 holds a job; the highest such lane wins)
+	xor idx, idx
+	cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [rel one]
+	cmp qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [rel two]
+	cmp qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	cmovne idx, [rel three]
+
+copy_lane_data:
+	; copy idx to empty lanes
+	; every lane without a job gets lane idx's data pointer and a length of
+	; 0xFFFF (presumably so it is never picked as the minimum below)
+	vmovdqa xmm0, [state + _lens_sha256]
+	mov tmp, [state + _args_data_ptr_sha256 + 8*idx]
+
+%assign I 0
+%rep 4
+	cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0
+	jne APPEND(skip_,I)
+	mov [state + _args_data_ptr_sha256 + 8*I], tmp
+	vpor xmm0, xmm0, [rel len_masks + 16*I]
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	vmovdqa [state + _lens_sha256], xmm0
+
+	vphminposuw xmm1, xmm0
+	vpextrw DWORD(len2), xmm1, 0 ; min value
+	vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+	cmp len2, 0
+	je len_is_0
+
+	vpshuflw xmm1, xmm1, 0		; replicate the min length into the four low words
+	vpsubw xmm0, xmm0, xmm1		; and subtract it from every lane's length
+	vmovdqa [state + _lens_sha256], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha_256_mult_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	; order of checks: pending extra blocks first, then the outer hash,
+	; and only when both are done is the job finished (end_loop)
+	imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea lane_data, [state + _ldata_sha256 + lane_data]
+	mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp extra_blocks, 0
+	jne proc_extra_blocks
+	cmp dword [lane_data + _outer_done], 0
+	jne end_loop
+
+proc_outer:
+	; set the lane up for one more single-block hash over _outer_block
+	mov dword [lane_data + _outer_done], 1
+	mov DWORD(size_offset), [lane_data + _size_offset]
+	mov qword [lane_data + _extra_block + size_offset], 0	; zero the 8 bytes at the saved size_offset
+	mov word [state + _lens_sha256 + 2*idx], 1		; lane length = 1 block
+	lea tmp, [lane_data + _outer_block]
+	mov job, [lane_data + _job_in_lane]
+	mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+
+	; gather this lane's digest words from the row-organised digest array,
+	; byte-swap each 32-bit word and store them as the data of _outer_block
+	vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+	vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+	vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+	vpshufb xmm0, xmm0, [rel byteswap]
+	vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+	vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+	vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+	vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+	vpshufb xmm1, xmm1, [rel byteswap]
+
+	vmovdqa [lane_data + _outer_block], xmm0
+	vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+	; SHA-224: the eighth word is not digest data; 0x80 is written in its place
+	; (presumably the start-of-padding byte - TODO confirm)
+	mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+	; load the 32 bytes at the job's _auth_key_xor_opad pointer as the lane's
+	; starting digest for the outer hash
+	mov tmp, [job + _auth_key_xor_opad]
+	vmovdqu xmm0, [tmp]
+	vmovdqu xmm1, [tmp + 4*4]
+	vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+	vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+	vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+	vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+	vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+	vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+	vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+	vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+	jmp copy_lane_data
+
+	align 16
+proc_extra_blocks:
+	; hash the buffered tail next: length = extra_blocks, data pointer =
+	; _extra_block + start_offset, and clear the pending count
+	mov DWORD(start_offset), [lane_data + _start_offset]
+	mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
+	lea tmp, [lane_data + _extra_block + start_offset]
+	mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+	mov dword [lane_data + _extra_blocks], 0
+	jmp copy_lane_data
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+	align 16
+end_loop:
+	; job finished: detach it from the lane, flag it as completed and push
+	; idx back into unused_lanes (one byte per lane index)
+	mov job_rax, [lane_data + _job_in_lane]
+	mov qword [lane_data + _job_in_lane], 0
+	or dword [job_rax + _status], STS_COMPLETED_HMAC
+	mov unused_lanes, [state + _unused_lanes_sha256]
+	shl unused_lanes, 8
+	or unused_lanes, idx
+	mov [state + _unused_lanes_sha256], unused_lanes
+
+	mov p, [job_rax + _auth_tag_output]
+	; any requested tag length other than 14 (SHA-224) / 16 (SHA-256) bytes
+	; takes the full-digest path
+%ifdef SHA224
+	cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+	jne copy_full_digest
+%else
+	cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+	jne copy_full_digest
+%endif
+
+	;; copy 14 bytes for SHA224 / 16 bytes for SHA256
+	mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+	bswap DWORD(tmp2)
+	bswap DWORD(tmp4)
+	bswap DWORD(tmp6)
+	bswap DWORD(tmp5)
+	mov [p + 0*4], DWORD(tmp2)
+	mov [p + 1*4], DWORD(tmp4)
+	mov [p + 2*4], DWORD(tmp6)
+%ifdef SHA224
+	mov [p + 3*4], WORD(tmp5)	; 16-bit store: 12 + 2 = 14 bytes in total
+%else
+	mov [p + 3*4], DWORD(tmp5)
+%endif
+	jmp clear_ret
+
+copy_full_digest:
+	;; copy 28 bytes for SHA224 / 32 bytes for SHA256
+	mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+	bswap DWORD(tmp2)
+	bswap DWORD(tmp4)
+	bswap DWORD(tmp6)
+	bswap DWORD(tmp5)
+	mov [p + 0*4], DWORD(tmp2)
+	mov [p + 1*4], DWORD(tmp4)
+	mov [p + 2*4], DWORD(tmp6)
+	mov [p + 3*4], DWORD(tmp5)
+
+	mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+	mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+	bswap DWORD(tmp2)
+	bswap DWORD(tmp4)
+	bswap DWORD(tmp6)
+%ifndef SHA224
+	bswap DWORD(tmp5)
+%endif
+	mov [p + 4*4], DWORD(tmp2)
+	mov [p + 5*4], DWORD(tmp4)
+	mov [p + 6*4], DWORD(tmp6)
+%ifndef SHA224
+	mov [p + 7*4], DWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+	vpxor xmm0, xmm0
+
+	;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B)
+	;; of returned job and NULL jobs
+	;; (every lane whose _job_in_lane is zero is wiped; the lane just
+	;; returned qualifies because end_loop zeroed its _job_in_lane)
+%assign I 0
+%rep 4
+	cmp qword [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size) + _job_in_lane], 0
+	jne APPEND(skip_clear_,I)
+
+	;; Clear digest (28 bytes for SHA-224, 32 bytes for SHA-256)
+%assign J 0
+%rep 7
+	mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+	mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*I + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+	lea lane_data, [state + _ldata_sha256 + (I*_HMAC_SHA1_LANE_DATA_size)]
+	;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+	vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+	;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+	vmovdqa [lane_data + _outer_block], xmm0
+%ifdef SHA224
+	mov qword [lane_data + _outer_block + 16], 0
+	mov dword [lane_data + _outer_block + 24], 0
+%else
+	vmovdqa [lane_data + _outer_block + 16], xmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+	; restore the registers saved in the prologue; rax holds the result
+	mov rbx, [rsp + _gpr_save + 8*0]
+	mov rbp, [rsp + _gpr_save + 8*1]
+	mov r12, [rsp + _gpr_save + 8*2]
+	mov rsp, [rsp + _rsp_save] ; original SP
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm
new file mode 100644
index 000000000..738d88b94
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm
@@ -0,0 +1,428 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha_256_mult_avx
+
+section .data
+default rel
+align 16
+;; vpshufb mask: reverses the byte order inside each 32-bit word
+byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203
+	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+section .text
+
+;; FUNC may be pre-defined by a file that includes this one; otherwise the
+;; routine is assembled under its SHA-256 name.
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_256_avx
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+;; Register aliases. Several names map to the same register: job and len2 are
+;; arg2; last_len and idx are rbp; p and start_offset are r11; unused_lanes
+;; and tmp4 are rbx; job_rax and len are rax; size_offset and tmp2 are reg3;
+;; lane and tmp3 are reg4; tmp and p2 are r9.
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+
+; idx needs to be in rbx, rbp, r13-r15
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rbx, rbp, rsi, rdi; called routine also clobbers r12
+struc STACK
+_gpr_save: resq 5
+_rsp_save: resq 1
+endstruc
+
+; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : rdi (LINUX) / rcx (otherwise) : state
+; arg 2 : rsi (LINUX) / rdx (otherwise) : job
+;
+; Places the job in the lane taken from the low byte of unused_lanes.
+; Returns NULL unless unused_lanes equals 0xff after that lane was taken
+; (presumably: every lane is now occupied - TODO confirm). In that case the
+; lanes are hashed until one job has finished, and that job is returned in
+; rax with STS_COMPLETED_HMAC or-ed into its status and its tag written to
+; _auth_tag_output.
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+
+	mov rax, rsp		; keep the caller's rsp; it is stored in _rsp_save below
+	sub rsp, STACK_size
+	and rsp, -16		; 16-byte align the local frame
+
+	mov [rsp + _gpr_save + 8*0], rbx
+	mov [rsp + _gpr_save + 8*1], rbp
+	mov [rsp + _gpr_save + 8*2], r12
+%ifndef LINUX
+	mov [rsp + _gpr_save + 8*3], rsi
+	mov [rsp + _gpr_save + 8*4], rdi
+%endif
+	mov [rsp + _rsp_save], rax ; original SP
+
+	; take the lane index from the low byte of unused_lanes and shift it out
+	mov unused_lanes, [state + _unused_lanes_sha256]
+	movzx lane, BYTE(unused_lanes)
+	shr unused_lanes, 8
+	imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size
+	lea lane_data, [state + _ldata_sha256 + lane_data]
+	mov [state + _unused_lanes_sha256], unused_lanes
+	mov len, [job + _msg_len_to_hash_in_bytes]
+	mov tmp, len
+	shr tmp, 6 ; divide by 64, len in terms of blocks
+
+	mov [lane_data + _job_in_lane], job
+	mov dword [lane_data + _outer_done], 0
+
+	; store the whole-block count as this lane's length
+	vmovdqa xmm0, [state + _lens_sha256]
+	XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+	vmovdqa [state + _lens_sha256], xmm0
+
+	mov last_len, len
+	and last_len, 63			; bytes beyond the last whole block
+	lea extra_blocks, [last_len + 9 + 63]	; tail + 9 more bytes, rounded up to whole blocks
+	shr extra_blocks, 6			; (9 is presumably 1 marker byte + 8 length bytes)
+	mov [lane_data + _extra_blocks], DWORD(extra_blocks)
+
+	mov p, [job + _src]
+	add p, [job + _hash_start_src_offset_in_bytes]
+	mov [state + _args_data_ptr_sha256 + 8*lane], p
+
+	cmp len, 64
+	jb copy_lt64
+
+fast_copy:
+	; copy the last 64 bytes of the message into _extra_block
+	add p, len
+	vmovdqu xmm0, [p - 64 + 0*16]
+	vmovdqu xmm1, [p - 64 + 1*16]
+	vmovdqu xmm2, [p - 64 + 2*16]
+	vmovdqu xmm3, [p - 64 + 3*16]
+	vmovdqa [lane_data + _extra_block + 0*16], xmm0
+	vmovdqa [lane_data + _extra_block + 1*16], xmm1
+	vmovdqa [lane_data + _extra_block + 2*16], xmm2
+	vmovdqa [lane_data + _extra_block + 3*16], xmm3
+end_fast_copy:
+
+	; size_offset = 64*extra_blocks - last_len + 56: where the 8-byte length
+	; field is written inside _extra_block
+	mov size_offset, extra_blocks
+	shl size_offset, 6
+	sub size_offset, last_len
+	add size_offset, 64-8
+	mov [lane_data + _size_offset], DWORD(size_offset)
+	; start_offset = 64 - last_len: where the message tail begins in _extra_block
+	mov start_offset, 64
+	sub start_offset, last_len
+	mov [lane_data + _start_offset], DWORD(start_offset)
+
+	; length in bits: 8*len plus 512 for one additional 64-byte block
+	; (presumably the key-xor-ipad block hashed ahead of the message),
+	; stored byte-swapped
+	lea tmp, [8*64 + 8*len]
+	bswap tmp
+	mov [lane_data + _extra_block + size_offset], tmp
+
+	; load the 32 bytes at the job's _auth_key_xor_ipad pointer as the lane's
+	; starting digest (scattered one word per row of the digest array)
+	mov tmp, [job + _auth_key_xor_ipad]
+	vmovdqu xmm0, [tmp]
+	vmovdqu xmm1, [tmp + 4*4]
+	vmovd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+	vpextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+	vpextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+	vpextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+	vmovd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+	vpextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+	vpextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+	vpextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+	test len, ~63		; non-zero when len >= 64
+	jnz ge64_bytes
+
+lt64_bytes:
+	; no whole block in the source: hash straight from _extra_block, with
+	; extra_blocks as the lane length, and clear the pending count
+	vmovdqa xmm0, [state + _lens_sha256]
+	XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+	vmovdqa [state + _lens_sha256], xmm0
+
+	lea tmp, [lane_data + _extra_block + start_offset]
+	mov [state + _args_data_ptr_sha256 + 8*lane], tmp
+	mov dword [lane_data + _extra_blocks], 0
+
+ge64_bytes:
+	cmp unused_lanes, 0xff
+	jne return_null
+	jmp start_loop
+
+	align 16
+start_loop:
+	; Find min length
+	vmovdqa xmm0, [state + _lens_sha256]
+	vphminposuw xmm1, xmm0
+	vpextrw DWORD(len2), xmm1, 0 ; min value
+	vpextrw DWORD(idx), xmm1, 1 ; min index (0...3)
+	cmp len2, 0
+	je len_is_0
+
+	vpshuflw xmm1, xmm1, 0		; replicate the min length into the four low words
+	vpsubw xmm0, xmm0, xmm1		; and subtract it from every lane's length
+	vmovdqa [state + _lens_sha256], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha_256_mult_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	; order of checks: pending extra blocks first, then the outer hash,
+	; and only when both are done is the job finished (end_loop)
+	imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea lane_data, [state + _ldata_sha256 + lane_data]
+	mov DWORD(extra_blocks), [lane_data + _extra_blocks]
+	cmp extra_blocks, 0
+	jne proc_extra_blocks
+	cmp dword [lane_data + _outer_done], 0
+	jne end_loop
+
+proc_outer:
+	; set the lane up for one more single-block hash over _outer_block
+	mov dword [lane_data + _outer_done], 1
+	mov DWORD(size_offset), [lane_data + _size_offset]
+	mov qword [lane_data + _extra_block + size_offset], 0	; zero the length field written above
+
+	vmovdqa xmm0, [state + _lens_sha256]
+	XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16		; lane length = 1 block
+	vmovdqa [state + _lens_sha256], xmm0
+
+	lea tmp, [lane_data + _outer_block]
+	mov job, [lane_data + _job_in_lane]
+	mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+
+	; gather this lane's digest words from the row-organised digest array,
+	; byte-swap each 32-bit word and store them as the data of _outer_block
+	vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1
+	vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2
+	vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3
+	vpshufb xmm0, xmm0, [rel byteswap]
+	vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+	vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1
+	vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2
+%ifndef SHA224
+	vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3
+%endif
+	vpshufb xmm1, xmm1, [rel byteswap]
+	vmovdqa [lane_data + _outer_block], xmm0
+	vmovdqa [lane_data + _outer_block + 4*4], xmm1
+%ifdef SHA224
+	; SHA-224: the eighth word is not digest data; 0x80 is written in its place
+	; (presumably the start-of-padding byte - TODO confirm)
+	mov dword [lane_data + _outer_block + 7*4], 0x80
+%endif
+
+	; load the 32 bytes at the job's _auth_key_xor_opad pointer as the lane's
+	; starting digest for the outer hash
+	mov tmp, [job + _auth_key_xor_opad]
+	vmovdqu xmm0, [tmp]
+	vmovdqu xmm1, [tmp + 4*4]
+	vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0
+	vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1
+	vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2
+	vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3
+	vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1
+	vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1
+	vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2
+	vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3
+
+	jmp start_loop
+
+	align 16
+proc_extra_blocks:
+	; hash the buffered tail next: length = extra_blocks, data pointer =
+	; _extra_block + start_offset, and clear the pending count
+	mov DWORD(start_offset), [lane_data + _start_offset]
+
+	vmovdqa xmm0, [state + _lens_sha256]
+	XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+	vmovdqa [state + _lens_sha256], xmm0
+
+	lea tmp, [lane_data + _extra_block + start_offset]
+	mov [state + _args_data_ptr_sha256 + 8*idx], tmp
+	mov dword [lane_data + _extra_blocks], 0
+	jmp start_loop
+
+	align 16
+
+copy_lt64:
+	;; less than one message block of data
+	;; beginning of source block
+	;; destination extrablock but backwards by len from where 0x80 pre-populated
+	;; tmp4 is handed to memcpy_avx_64_1 and shares rbx with unused_lanes,
+	;; so unused_lanes is reloaded from state before leaving
+	lea p2, [lane_data + _extra_block + 64]
+	sub p2, len
+	memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+	mov unused_lanes, [state + _unused_lanes_sha256]
+	jmp end_fast_copy
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+	align 16
+end_loop:
+	; job finished: detach it from the lane, flag it as completed and push
+	; idx back into unused_lanes (one byte per lane index)
+	mov job_rax, [lane_data + _job_in_lane]
+	mov unused_lanes, [state + _unused_lanes_sha256]
+	mov qword [lane_data + _job_in_lane], 0
+	or dword [job_rax + _status], STS_COMPLETED_HMAC
+	shl unused_lanes, 8
+	or unused_lanes, idx
+	mov [state + _unused_lanes_sha256], unused_lanes
+
+	mov p, [job_rax + _auth_tag_output]
+
+	; any requested tag length other than 14 (SHA-224) / 16 (SHA-256) bytes
+	; takes the full-digest path
+%ifdef SHA224
+	cmp qword [job_rax + _auth_tag_output_len_in_bytes], 14
+	jne copy_full_digest
+%else
+	cmp qword [job_rax + _auth_tag_output_len_in_bytes], 16
+	jne copy_full_digest
+%endif
+	; copy 14 bytes for SHA224 / 16 bytes for SHA256
+	mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+	bswap DWORD(tmp)
+	bswap DWORD(tmp2)
+	bswap DWORD(tmp3)
+	bswap DWORD(tmp4)
+	mov [p + 0*4], DWORD(tmp)
+	mov [p + 1*4], DWORD(tmp2)
+	mov [p + 2*4], DWORD(tmp3)
+%ifdef SHA224
+	mov [p + 3*4], WORD(tmp4)	; 16-bit store: 12 + 2 = 14 bytes in total
+%else
+	mov [p + 3*4], DWORD(tmp4)
+%endif
+	jmp clear_ret
+
+copy_full_digest:
+	;; copy 28 bytes for SHA224 / 32 bytes for SHA256
+	mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE]
+	bswap DWORD(tmp)
+	bswap DWORD(tmp2)
+	bswap DWORD(tmp3)
+	bswap DWORD(tmp4)
+	mov [p + 0*4], DWORD(tmp)
+	mov [p + 1*4], DWORD(tmp2)
+	mov [p + 2*4], DWORD(tmp3)
+	mov [p + 3*4], DWORD(tmp4)
+
+	mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE]
+	mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE]
+%ifndef SHA224
+	mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE]
+%endif
+	bswap DWORD(tmp)
+	bswap DWORD(tmp2)
+	bswap DWORD(tmp3)
+%ifndef SHA224
+	bswap DWORD(tmp4)
+%endif
+	mov [p + 4*4], DWORD(tmp)
+	mov [p + 5*4], DWORD(tmp2)
+	mov [p + 6*4], DWORD(tmp3)
+%ifndef SHA224
+	mov [p + 7*4], DWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+	;; Clear digest (28B/32B), outer_block (28B/32B) and extra_block (64B) of returned job
+%assign J 0
+%rep 7
+	mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + J*SHA256_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%ifndef SHA224
+	mov dword [state + _args_digest_sha256 + SHA256_DIGEST_WORD_SIZE*idx + 7*SHA256_DIGEST_ROW_SIZE], 0
+%endif
+
+	vpxor xmm0, xmm0
+	; lane_data is recomputed from idx before it is used for the clears below
+	imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size
+	lea lane_data, [state + _ldata_sha256 + lane_data]
+	;; Clear first 64 bytes of extra_block
+%assign offset 0
+%rep 4
+	vmovdqa [lane_data + _extra_block + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+	;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
+	vmovdqa [lane_data + _outer_block], xmm0
+%ifdef SHA224
+	mov qword [lane_data + _outer_block + 16], 0
+	mov dword [lane_data + _outer_block + 24], 0
+%else
+	vmovdqa [lane_data + _outer_block + 16], xmm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+
+	; restore the registers saved in the prologue; rax holds the result
+	mov rbx, [rsp + _gpr_save + 8*0]
+	mov rbp, [rsp + _gpr_save + 8*1]
+	mov r12, [rsp + _gpr_save + 8*2]
+%ifndef LINUX
+	mov rsi, [rsp + _gpr_save + 8*3]
+	mov rdi, [rsp + _gpr_save + 8*4]
+%endif
+	mov rsp, [rsp + _rsp_save] ; original SP
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm
new file mode 100644
index 000000000..f3491ab27
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+;; HMAC-SHA-384 flush: preset the entry-point name and digest size, then reuse the SHA-512 flush source.
+%define FUNC flush_job_hmac_sha_384_avx
+%define SHA_X_DIGEST_SIZE 384
+;; The included file only applies its SHA-512 defaults under "%ifndef FUNC", so the two defines above take effect.
+%include "avx/mb_mgr_hmac_sha_512_flush_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm
new file mode 100644
index 000000000..a2fb0f1c6
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm
@@ -0,0 +1,31 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+;; HMAC-SHA-384 submit: preset the entry-point name and digest size, then reuse the SHA-512 submit source.
+%define FUNC submit_job_hmac_sha_384_avx
+%define SHA_X_DIGEST_SIZE 384
+;; The included file only applies its SHA-512 defaults under "%ifndef FUNC", so the two defines above take effect.
+%include "avx/mb_mgr_hmac_sha_512_submit_avx.asm"
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm
new file mode 100644
index 000000000..2de170948
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm
@@ -0,0 +1,339 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+;; HMAC-SHA-512 flush for the two-lane AVX manager; assembled as HMAC-SHA-384 when FUNC and SHA_X_DIGEST_SIZE are predefined by the including file.
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+
+extern sha512_x2_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 (vpshufb mask: reverses the byte order inside each qword)
+	dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+len_masks: ; entry I is OR-ed into the lane-length vector to force the 16-bit length of idle lane I to 0xFFFF
+	;ddq 0x0000000000000000000000000000FFFF
+	dq 0x000000000000FFFF, 0x0000000000000000
+	;ddq 0x000000000000000000000000FFFF0000
+	dq 0x00000000FFFF0000, 0x0000000000000000
+one: dq 1
+
+section .text
+
+%ifndef FUNC
+%define FUNC flush_job_hmac_sha_512_avx
+%define SHA_X_DIGEST_SIZE 512
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%else
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; NOTE: several names below alias one register (e.g. unused_lanes/lane_data/tmp2 = rbx), so they are never live together
+; idx needs to be in rbx, rbp, r12-r15 (it must survive the call to sha512_x2_avx)
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+
+%define tmp5 r9
+
+%define tmp6 r10
+
+%endif
+
+; This routine clobbers rbx, rbp
+struc STACK
+_gpr_save: resq 2
+_rsp_save: resq 1
+endstruc
+
+%define APPEND(a,b) a %+ b
+; Flush: keep hashing the queued lanes until one job has finished both its inner and outer hash, then return it.
+; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state)
+; arg 1 : state (rcx on Windows, rdi on Linux - see arg1 above)
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+; Returns in rax the completed job with STS_COMPLETED_HMAC or-ed into its status, or NULL via return_null.
+	mov rax, rsp
+	sub rsp, STACK_size
+	and rsp, -16 ; 16-byte align the save area
+
+	mov [rsp + _gpr_save + 8*0], rbx
+	mov [rsp + _gpr_save + 8*1], rbp
+	mov [rsp + _rsp_save], rax ; original SP
+
+	mov unused_lanes, [state + _unused_lanes_sha512]
+	bt unused_lanes, 16+7 ; bit 23 set -> nothing to flush (presumably: every lane is on the unused list - confirm against the manager init code)
+	jc return_null
+
+	; find a lane with a non-null job
+	xor idx, idx
+	cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+	cmovne idx, [rel one] ; idx = 1 when lane 1 holds a job, otherwise 0
+copy_lane_data:
+	; copy good lane (idx) to empty lanes
+	vmovdqa xmm0, [state + _lens_sha512]
+	mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx]
+
+%assign I 0
+%rep 2
+	cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0
+	jne APPEND(skip_,I)
+	mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp ; idle lane reads the good lane's data
+	vpor xmm0, xmm0, [rel len_masks + 16*I] ; idle lane length := 0xFFFF so it is never selected as the minimum
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	vmovdqa [state + _lens_sha512], xmm0
+
+	vphminposuw xmm1, xmm0
+	vpextrw DWORD(len2), xmm1, 0 ; min value
+	vpextrw DWORD(idx), xmm1, 1 ; min index (0...1)
+	cmp len2, 0
+	je len_is_0
+
+	vpshuflw xmm1, xmm1, 0xA0 ; words {0,0,2,2}: put the min value in word 0 and word 1
+	vpsubw xmm0, xmm0, xmm1 ; subtract the min value from both lane lengths
+	vmovdqa [state + _lens_sha512], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha512_x2_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _SHA512_LANE_DATA_size
+	lea lane_data, [state + _ldata_sha512 + lane_data]
+	mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+	cmp extra_blocks, 0
+	jne proc_extra_blocks ; padded tail blocks still have to be hashed
+	cmp dword [lane_data + _outer_done_sha512], 0
+	jne end_loop ; outer hash already done -> the job is complete
+
+proc_outer:
+	mov dword [lane_data + _outer_done_sha512], 1
+	mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+	mov qword [lane_data + _extra_block_sha512 + size_offset], 0 ; wipe the length field written into extra_block
+	mov word [state + _lens_sha512 + 2*idx], 1 ; the outer hash is a single block
+	lea tmp, [lane_data + _outer_block_sha512]
+	mov job, [lane_data + _job_in_lane_sha512]
+	mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+
+	; move digest into data location (4 x 16 bytes for SHA-512, 3 x 16 bytes for SHA-384, byte-swapped per qword)
+	%assign I 0
+	%rep (SHA_X_DIGEST_SIZE / (8*16))
+	vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE]
+	vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+	vpshufb xmm0, [rel byteswap]
+	vmovdqa [lane_data + _outer_block_sha512 + I * 16], xmm0
+	%assign I (I+1)
+	%endrep
+
+	; move the opad key into digest (all eight digest rows of lane idx are reloaded)
+	mov tmp, [job + _auth_key_xor_opad]
+
+	%assign I 0
+	%rep 4
+	vmovdqu xmm0, [tmp + I * 16]
+	vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0
+	vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+	%assign I (I+1)
+	%endrep
+
+	jmp copy_lane_data
+
+	align 16
+proc_extra_blocks:
+	mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+	mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) ; lane length := number of extra blocks
+	lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+	mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ; hash from the lane's private extra_block buffer
+	mov dword [lane_data + _extra_blocks_sha512], 0 ; mark the extra blocks as scheduled
+	jmp copy_lane_data
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+	align 16
+end_loop:
+	mov job_rax, [lane_data + _job_in_lane_sha512]
+	mov qword [lane_data + _job_in_lane_sha512], 0 ; lane no longer owns a job
+	or dword [job_rax + _status], STS_COMPLETED_HMAC
+	mov unused_lanes, [state + _unused_lanes_sha512] ; rbx: lane_data is dead from here on
+	shl unused_lanes, 8
+	or unused_lanes, idx ; push idx as the low byte of the unused-lane list
+	mov [state + _unused_lanes_sha512], unused_lanes
+
+	mov p, [job_rax + _auth_tag_output]
+
+%if (SHA_X_DIGEST_SIZE != 384)
+	cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+	jne copy_full_digest ; any tag length other than 32 gets the full 64-byte digest
+%else
+	cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+	jne copy_full_digest ; any tag length other than 24 gets the full 48-byte digest
+%endif
+
+	;; copy 32 bytes for SHA512 / 24 bytes for SHA384
+	mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+	bswap QWORD(tmp2)
+	bswap QWORD(tmp4)
+	bswap QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+	bswap QWORD(tmp5)
+%endif
+	mov [p + 0*8], QWORD(tmp2)
+	mov [p + 1*8], QWORD(tmp4)
+	mov [p + 2*8], QWORD(tmp6)
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov [p + 3*8], QWORD(tmp5)
+%endif
+	jmp clear_ret
+
+copy_full_digest:
+	;; copy 64 bytes for SHA512 / 48 bytes for SHA384
+	mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+	bswap QWORD(tmp2)
+	bswap QWORD(tmp4)
+	bswap QWORD(tmp6)
+	bswap QWORD(tmp5)
+	mov [p + 0*8], QWORD(tmp2)
+	mov [p + 1*8], QWORD(tmp4)
+	mov [p + 2*8], QWORD(tmp6)
+	mov [p + 3*8], QWORD(tmp5)
+
+	mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+	bswap QWORD(tmp2)
+	bswap QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+	bswap QWORD(tmp6)
+	bswap QWORD(tmp5)
+%endif
+	mov [p + 4*8], QWORD(tmp2)
+	mov [p + 5*8], QWORD(tmp4)
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov [p + 6*8], QWORD(tmp6)
+	mov [p + 7*8], QWORD(tmp5)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+	vpxor xmm0, xmm0
+
+	;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of every lane that holds no job (this includes the lane just returned)
+%assign I 0
+%rep 2
+	cmp qword [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size) + _job_in_lane_sha512], 0
+	jne APPEND(skip_clear_,I)
+
+	;; Clear digest (48 bytes for SHA-384, 64 bytes for SHA-512)
+%assign J 0
+%rep 6
+	mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 6*SHA512_DIGEST_ROW_SIZE], 0
+	mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*I + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+	lea lane_data, [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size)]
+	;; Clear first 128 bytes of extra_block (SHA-512 lane field names, matching every other access in this routine)
+%assign offset 0
+%rep 8
+	vmovdqa [lane_data + _extra_block_sha512 + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+	;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block, i.e. the buffer filled in proc_outer
+	vmovdqa [lane_data + _outer_block_sha512], xmm0
+	vmovdqa [lane_data + _outer_block_sha512 + 16], xmm0
+	vmovdqa [lane_data + _outer_block_sha512 + 32], xmm0
+%if (SHA_X_DIGEST_SIZE != 384)
+	vmovdqa [lane_data + _outer_block_sha512 + 48], xmm0
+%endif
+
+APPEND(skip_clear_,I):
+%assign I (I+1)
+%endrep
+
+%endif ;; SAFE_DATA
+
+return:
+
+	mov rbx, [rsp + _gpr_save + 8*0]
+	mov rbp, [rsp + _gpr_save + 8*1]
+	mov rsp, [rsp + _rsp_save] ; original SP
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm
new file mode 100644
index 000000000..b37884d0f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm
@@ -0,0 +1,416 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;
+;; HMAC-SHA-512 submit for the two-lane AVX manager; assembled as HMAC-SHA-384 when FUNC and SHA_X_DIGEST_SIZE are predefined by the including file.
+%include "include/os.asm"
+%include "job_aes_hmac.asm"
+%include "mb_mgr_datastruct.asm"
+%include "include/reg_sizes.asm"
+%include "include/memcpy.asm"
+%include "include/const.inc"
+
+extern sha512_x2_avx
+
+section .data
+default rel
+align 16
+byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 (vpshufb mask: reverses the byte order inside each qword)
+	dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+section .text
+
+%ifndef FUNC
+%define FUNC submit_job_hmac_sha_512_avx
+%define SHA_X_DIGEST_SIZE 512
+%endif
+
+%if 1
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define reg3 rcx
+%define reg4 rdx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define reg3 rdi
+%define reg4 rsi
+%endif
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; NOTE: several names below alias one register (e.g. unused_lanes/tmp4 = rbx, p/start_offset = r11), so they are never live together
+; idx needs to be in rbx, rbp, r12-r15 (it must survive the call to sha512_x2_avx)
+%define last_len rbp
+%define idx rbp
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+%define tmp4 rbx
+
+%define job_rax rax
+%define len rax
+
+%define size_offset reg3
+%define tmp2 reg3
+
+%define lane reg4
+%define tmp3 reg4
+
+%define extra_blocks r8
+
+%define tmp r9
+%define p2 r9
+
+%define lane_data r10
+
+%endif
+
+; This routine clobbers rbx, rbp, rsi, rdi
+struc STACK
+_gpr_save: resq 4
+_rsp_save: resq 1
+endstruc
+; Submit: place the job in a free lane; returns NULL unless the unused-lane list is exhausted, in which case lanes are hashed until one job completes.
+; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job)
+; arg 1 : state (rcx on Windows, rdi on Linux - see arg1 above)
+; arg 2 : job (rdx on Windows, rsi on Linux - see arg2 above)
+MKGLOBAL(FUNC,function,internal)
+FUNC:
+; Returns in rax the completed job with STS_COMPLETED_HMAC or-ed into its status, or NULL via return_null.
+	mov rax, rsp
+	sub rsp, STACK_size
+	and rsp, -16 ; 16-byte align the save area
+
+	mov [rsp + _gpr_save + 8*0], rbx
+	mov [rsp + _gpr_save + 8*1], rbp
+%ifndef LINUX
+	mov [rsp + _gpr_save + 8*2], rsi
+	mov [rsp + _gpr_save + 8*3], rdi
+%endif
+	mov [rsp + _rsp_save], rax ; original SP
+
+	mov unused_lanes, [state + _unused_lanes_sha512]
+	movzx lane, BYTE(unused_lanes) ; pop the next free lane index from the low byte
+	shr unused_lanes, 8
+	imul lane_data, lane, _SHA512_LANE_DATA_size
+	lea lane_data, [state + _ldata_sha512 + lane_data]
+	mov [state + _unused_lanes_sha512], unused_lanes
+	mov len, [job + _msg_len_to_hash_in_bytes]
+	mov tmp, len
+	shr tmp, 7 ; divide by 128, len in terms of blocks
+
+	mov [lane_data + _job_in_lane_sha512], job
+	mov dword [lane_data + _outer_done_sha512], 0
+
+	vmovdqa xmm0, [state + _lens_sha512] ; set this lane's 16-bit length to the whole-block count (XVPINSRW comes from an include; p is presumably scratch here)
+	XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16
+	vmovdqa [state + _lens_sha512], xmm0
+
+	mov last_len, len
+	and last_len, 127 ; bytes left over after the whole 128-byte blocks
+	lea extra_blocks, [last_len + 17 + 127] ; 17 = minimum SHA-512 padding (1 marker byte + 16-byte length field), rounded up to blocks below
+	shr extra_blocks, 7
+	mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks)
+
+	mov p, [job + _src]
+	add p, [job + _hash_start_src_offset_in_bytes]
+	mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p
+
+	cmp len, 128
+	jb copy_lt128
+
+fast_copy:
+	add p, len ; p = end of message; copy its last 128 bytes into extra_block
+%assign I 0
+%rep 2
+	vmovdqu xmm0, [p - 128 + I*4*16 + 0*16]
+	vmovdqu xmm1, [p - 128 + I*4*16 + 1*16]
+	vmovdqu xmm2, [p - 128 + I*4*16 + 2*16]
+	vmovdqu xmm3, [p - 128 + I*4*16 + 3*16]
+	vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 0*16], xmm0
+	vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 1*16], xmm1
+	vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 2*16], xmm2
+	vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 3*16], xmm3
+%assign I (I+1)
+%endrep
+
+end_fast_copy:
+
+	mov size_offset, extra_blocks ; size_offset = start_offset + 128*extra_blocks - 8: last 8 bytes of the final padded block
+	shl size_offset, 7
+	sub size_offset, last_len
+	add size_offset, 128-8
+	mov [lane_data + _size_offset_sha512], DWORD(size_offset)
+	mov start_offset, 128 ; the leftover bytes end at offset 128 of extra_block, so they start at 128 - last_len
+	sub start_offset, last_len
+	mov [lane_data + _start_offset_sha512], DWORD(start_offset)
+
+	lea tmp, [8*128 + 8*len] ; hashed length in bits, including the 128-byte ipad block
+	bswap tmp ; stored big-endian
+	mov [lane_data + _extra_block_sha512 + size_offset], tmp
+
+	mov tmp, [job + _auth_key_xor_ipad] ; seed the lane's eight digest rows from the ipad key state
+
+%assign I 0
+%rep 4
+	vmovdqu xmm0, [tmp + I * 2 * 8]
+	vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I)*SHA512_DIGEST_ROW_SIZE], xmm0
+	vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+	test len, ~127
+	jnz ge128_bytes ; at least one whole block: hash from the source buffer first
+
+lt128_bytes:
+	vmovdqa xmm0, [state + _lens_sha512] ; short message: the lane length becomes the extra-block count
+	XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16
+	vmovdqa [state + _lens_sha512], xmm0
+
+	lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+	mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8
+	mov dword [lane_data + _extra_blocks_sha512], 0
+
+ge128_bytes:
+	cmp unused_lanes, 0xff ; only hash when the unused-lane list is down to 0xff (presumably the empty-list sentinel - confirm against the manager init code)
+	jne return_null
+	jmp start_loop
+
+	align 16
+start_loop:
+	; Find min length
+	vmovdqa xmm0, [state + _lens_sha512]
+	vphminposuw xmm1, xmm0
+	vpextrw DWORD(len2), xmm1, 0 ; min value
+	vpextrw DWORD(idx), xmm1, 1 ; min index (0...1)
+	cmp len2, 0
+	je len_is_0
+
+	vpshuflw xmm1, xmm1, 0xA0 ; words {0,0,2,2}: put the min value in word 0 and word 1
+	vpsubw xmm0, xmm0, xmm1 ; subtract the min value from both lane lengths
+	vmovdqa [state + _lens_sha512], xmm0
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call sha512_x2_avx
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul lane_data, idx, _SHA512_LANE_DATA_size
+	lea lane_data, [state + _ldata_sha512 + lane_data]
+	mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512]
+	cmp extra_blocks, 0
+	jne proc_extra_blocks ; padded tail blocks still have to be hashed
+	cmp dword [lane_data + _outer_done_sha512], 0
+	jne end_loop ; outer hash already done -> the job is complete
+
+proc_outer:
+	mov dword [lane_data + _outer_done_sha512], 1
+	mov DWORD(size_offset), [lane_data + _size_offset_sha512]
+	mov qword [lane_data + _extra_block_sha512 + size_offset], 0 ; wipe the length field written into extra_block
+
+	vmovdqa xmm0, [state + _lens_sha512] ; the outer hash is a single block: lane length := 1
+	XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16
+	vmovdqa [state + _lens_sha512], xmm0
+
+	lea tmp, [lane_data + _outer_block_sha512]
+	mov job, [lane_data + _job_in_lane_sha512]
+	mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp
+
+%assign I 0
+%rep (SHA_X_DIGEST_SIZE / (8 * 16))
+	vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE]
+	vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1
+	vpshufb xmm0, [rel byteswap] ; inner digest goes into outer_block byte-swapped per qword
+	vmovdqa [lane_data + _outer_block_sha512 + I * 16], xmm0
+%assign I (I+1)
+%endrep
+
+	mov tmp, [job + _auth_key_xor_opad] ; reload the lane's eight digest rows from the opad key state
+%assign I 0
+%rep 4
+	vmovdqu xmm0, [tmp + I * 16]
+	vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0
+	vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1
+%assign I (I+1)
+%endrep
+
+	jmp start_loop
+
+	align 16
+proc_extra_blocks:
+	mov DWORD(start_offset), [lane_data + _start_offset_sha512]
+
+	vmovdqa xmm0, [state + _lens_sha512] ; lane length := number of extra blocks
+	XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16
+	vmovdqa [state + _lens_sha512], xmm0
+
+	lea tmp, [lane_data + _extra_block_sha512 + start_offset]
+	mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message
+	mov dword [lane_data + _extra_blocks_sha512], 0 ; mark the extra blocks as scheduled
+	jmp start_loop
+
+	align 16
+copy_lt128:
+	;; less than one message block of data
+	;; destination extra block but backwards by len from where 0x80 pre-populated
+	lea p2, [lane_data + _extra_block_sha512 + 128]
+	sub p2, len
+	memcpy_avx_128_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
+	mov unused_lanes, [state + _unused_lanes_sha512] ; reload: rbx was handed to the copy macro as tmp4
+	jmp end_fast_copy
+
+return_null:
+	xor job_rax, job_rax
+	jmp return
+
+	align 16
+end_loop:
+	mov job_rax, [lane_data + _job_in_lane_sha512]
+	mov unused_lanes, [state + _unused_lanes_sha512]
+	mov qword [lane_data + _job_in_lane_sha512], 0 ; lane no longer owns a job
+	or dword [job_rax + _status], STS_COMPLETED_HMAC
+	shl unused_lanes, 8
+	or unused_lanes, idx ; push idx as the low byte of the unused-lane list
+	mov [state + _unused_lanes_sha512], unused_lanes
+
+	mov p, [job_rax + _auth_tag_output]
+
+%if (SHA_X_DIGEST_SIZE != 384)
+	cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32
+	jne copy_full_digest ; any tag length other than 32 gets the full 64-byte digest
+%else
+	cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24
+	jne copy_full_digest ; any tag length other than 24 gets the full 48-byte digest
+%endif
+	;; copy 32 bytes for SHA512 / 24 bytes for SHA384
+	mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+%endif
+	bswap QWORD(tmp)
+	bswap QWORD(tmp2)
+	bswap QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+	bswap QWORD(tmp4)
+%endif
+	mov [p + 0*8], QWORD(tmp)
+	mov [p + 1*8], QWORD(tmp2)
+	mov [p + 2*8], QWORD(tmp3)
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov [p + 3*8], QWORD(tmp4)
+%endif
+	jmp clear_ret
+
+copy_full_digest:
+	;; copy 64 bytes for SHA512 / 48 bytes for SHA384
+	mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE]
+	bswap QWORD(tmp)
+	bswap QWORD(tmp2)
+	bswap QWORD(tmp3)
+	bswap QWORD(tmp4)
+	mov [p + 0*8], QWORD(tmp)
+	mov [p + 1*8], QWORD(tmp2)
+	mov [p + 2*8], QWORD(tmp3)
+	mov [p + 3*8], QWORD(tmp4)
+
+	mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 4*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 5*SHA512_DIGEST_ROW_SIZE]
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE]
+	mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE]
+%endif
+	bswap QWORD(tmp)
+	bswap QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+	bswap QWORD(tmp3)
+	bswap QWORD(tmp4)
+%endif
+	mov [p + 4*8], QWORD(tmp)
+	mov [p + 5*8], QWORD(tmp2)
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov [p + 6*8], QWORD(tmp3)
+	mov [p + 7*8], QWORD(tmp4)
+%endif
+
+clear_ret:
+
+%ifdef SAFE_DATA
+	;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job
+%assign J 0
+%rep 6
+	mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + J*SHA512_DIGEST_ROW_SIZE], 0
+%assign J (J+1)
+%endrep
+%if (SHA_X_DIGEST_SIZE != 384)
+	mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 6*SHA512_DIGEST_ROW_SIZE], 0 ; rows 6 and 7 use the SHA-512 row stride like rows 0-5
+	mov qword [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 7*SHA512_DIGEST_ROW_SIZE], 0
+%endif
+
+	vpxor xmm0, xmm0
+	imul lane_data, idx, _SHA512_LANE_DATA_size
+	lea lane_data, [state + _ldata_sha512 + lane_data]
+	;; Clear first 128 bytes of extra_block (SHA-512 lane field names, matching every other access in this routine)
+%assign offset 0
+%rep 8
+	vmovdqa [lane_data + _extra_block_sha512 + offset], xmm0
+%assign offset (offset + 16)
+%endrep
+
+	;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block, i.e. the buffer filled in proc_outer
+	vmovdqa [lane_data + _outer_block_sha512], xmm0
+	vmovdqa [lane_data + _outer_block_sha512 + 16], xmm0
+	vmovdqa [lane_data + _outer_block_sha512 + 32], xmm0
+%if (SHA_X_DIGEST_SIZE != 384)
+	vmovdqa [lane_data + _outer_block_sha512 + 48], xmm0
+%endif
+%endif ;; SAFE_DATA
+
+return:
+	mov rbx, [rsp + _gpr_save + 8*0]
+	mov rbp, [rsp + _gpr_save + 8*1]
+%ifndef LINUX
+	mov rsi, [rsp + _gpr_save + 8*2]
+	mov rdi, [rsp + _gpr_save + 8*3]
+%endif
+	mov rsp, [rsp + _rsp_save] ; original SP
+
+	ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm
new file mode 100644
index 000000000..418f0bc43
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm
@@ -0,0 +1,358 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "include/reg_sizes.asm" +%include "include/memcpy.asm" +%include "include/const.inc" + +extern sha1_mult_avx + +section .data +default rel +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine clobbers rdi, rsi, rbx, rbp +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +; JOB* submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(submit_job_hmac_avx,function,internal) +submit_job_hmac_avx: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 
+ + vmovdqa xmm0, [state + _lens] + XVPINSRW xmm0, xmm1, p, lane, tmp, scale_x16 + vmovdqa [state + _lens], xmm0 + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr + PTR_SZ*lane], p + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + vmovdqu xmm0, [p - 64 + 0*16] + vmovdqu xmm1, [p - 64 + 1*16] + vmovdqu xmm2, [p - 64 + 2*16] + vmovdqu xmm3, [p - 64 + 3*16] + vmovdqa [lane_data + _extra_block + 0*16], xmm0 + vmovdqa [lane_data + _extra_block + 1*16], xmm1 + vmovdqa [lane_data + _extra_block + 2*16], xmm2 + vmovdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + vmovdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + vmovdqa xmm0, [state + _lens] + XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 + vmovdqa [state + _lens], xmm0 + + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*lane], tmp + mov 
dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens] + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshuflw xmm1, xmm1, 0 + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mult_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + + vmovdqa xmm0, [state + _lens] + XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16 + vmovdqa [state + _lens], xmm0 + + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + + vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 + vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 + vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 + vpshufb xmm0, xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + vmovdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + + mov tmp, [job + _auth_key_xor_opad] + vmovdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + vmovd 
[state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + + vmovdqa xmm0, [state + _lens] + XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 + vmovdqa [state + _lens], xmm0 + + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) + mov [p + 0*SHA1_DIGEST_WORD_SIZE], 
DWORD(tmp) + mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) + mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) + + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 12 + je clear_ret + + ;; copy remaining 8 bytes to return 20 byte digest + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + mov [p + 3*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + mov [p + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) + +clear_ret: + +%ifdef SAFE_DATA + ;; Clear digest (20B), outer_block (20B) and extra_block (64B) of returned job + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 0 + mov dword [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], 0 + + vpxor xmm0, xmm0 + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + ;; Clear first 64 bytes of extra_block +%assign offset 0 +%rep 4 + vmovdqa [lane_data + _extra_block + offset], xmm0 +%assign offset (offset + 16) +%endrep + + ;; Clear first 20 bytes of outer_block + vmovdqa [lane_data + _outer_block], xmm0 + mov dword [lane_data + _outer_block + 16], 0 +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm b/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm new file mode 100644 index 
000000000..1aa2c2600 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm @@ -0,0 +1,716 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +;; code to compute octal MD5 using AVX + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp +;; +;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp +;; +;; clobbers xmm0-15 + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +extern MD5_TABLE + +section .data +default rel +align 64 +ONES: + dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff + +section .text + +%ifdef LINUX +;; Linux Registers +%define arg1 rdi +%define arg2 rsi +%define mem1 rcx +%define mem2 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define mem1 rdi +%define mem2 rsi +%endif + +;; rbp is not clobbered + +%define state arg1 +%define num_blks arg2 + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 +%define inp4 r12 +%define inp5 r13 +%define inp6 r14 +%define inp7 r15 + +%define TBL rax +%define IDX rbx + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 ; tmp +%define F xmm5 ; tmp + +%define A2 xmm6 +%define B2 xmm7 +%define C2 xmm8 +%define D2 xmm9 + + +%define FUN E +%define TMP F +%define FUN2 xmm10 +%define TMP2 xmm11 + +%define T0 xmm10 +%define T1 xmm11 +%define T2 xmm12 +%define T3 xmm13 +%define T4 xmm14 +%define T5 xmm15 + +; Stack Layout +; +; 470 DD2 +; 460 CC2 +; 450 BB2 +; 440 AA2 +; 430 DD +; 420 CC +; 410 BB +; 400 AA +; +; 3F0 data2[15] for lanes 7...4 \ +; ... \ +; 300 data2[0] for lanes 7...4 \ +; 2F0 data2[15] for lanes 3...0 > mem block 2 +; ... / +; 210 data2[1] for lanes 3...0 / +; 200 data2[0] for lanes 3...0 / +; +; 1F0 data1[15] for lanes 7...4 \ +; ... \ +; 100 data1[0] for lanes 7...4 \ +; F0 data1[15] for lanes 3...0 > mem block 1 +; ... 
/ +; 10 data1[1] for lanes 3...0 / +; 0 data1[0] for lanes 3...0 / + +; stack size must be an odd multiple of 8 bytes in size +; (presumably: rsp is 8 mod 16 on entry because of the pushed return address, +; so subtracting an odd multiple of 8 makes rsp 16-byte aligned for the +; vmovdqa accesses to the stack frame) +struc STACK +_DATA: reso 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs +_DIGEST: reso 8 ; stores AA-DD, AA2-DD2 + resb 8 ; for alignment +endstruc +%define STACK_SIZE STACK_size + +%define AA rsp + _DIGEST + 16*0 +%define BB rsp + _DIGEST + 16*1 +%define CC rsp + _DIGEST + 16*2 +%define DD rsp + _DIGEST + 16*3 +%define AA2 rsp + _DIGEST + 16*4 +%define BB2 rsp + _DIGEST + 16*5 +%define CC2 rsp + _DIGEST + 16*6 +%define DD2 rsp + _DIGEST + 16*7 + +;; +;; MD5 left rotations (number of bits) +;; +rot11 equ 7 +rot12 equ 12 +rot13 equ 17 +rot14 equ 22 +rot21 equ 5 +rot22 equ 9 +rot23 equ 14 +rot24 equ 20 +rot31 equ 4 +rot32 equ 11 +rot33 equ 16 +rot34 equ 23 +rot41 equ 6 +rot42 equ 10 +rot43 equ 15 +rot44 equ 21 + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t1} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +; r2 and t1 are overwritten with intermediate values +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + +;; +;; Magic functions defined in RFC 1321 +;; +; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) +%macro MAGIC_F 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + vpxor %%F,%%Z, %%Y + vpand %%F,%%F,%%X + vpxor 
%%F,%%F,%%Z +%endmacro + +; macro MAGIC_G F,X,Y,Z ;; F = MAGIC_F((Z),(X),(Y)) = ((Y) ^ ((Z) & ((X) ^ (Y)))) +%macro MAGIC_G 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + MAGIC_F %%F,%%Z,%%X,%%Y +%endmacro + +; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) +%macro MAGIC_H 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + vpxor %%F,%%Z, %%Y + vpxor %%F,%%F, %%X +%endmacro + +; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) +%macro MAGIC_I 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + vpxor %%F,%%Z,[rel ONES] ; %%F = ~(Z) (XOR with all-ones) + vpor %%F,%%F,%%X + vpxor %%F,%%F,%%Y +%endmacro + +; PROLD reg, imm, tmp +; rotate each dword of reg left by imm bits; tmp is clobbered +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-%%imm) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +;; +;; single MD5 step on two register sets (A..D and A2..D2) +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +;; data for the second set is read from [data + 16*16]; +;; FUN and TMP are scratch registers shared by both sets +;; +; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot +%macro MD5_STEP1 14 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%data %12 +%define %%MD5const %13 +%define %%nrot %14 + + vpaddd %%A, %%A, %%MD5const + vpaddd %%A2, %%A2, %%MD5const + vpaddd %%A, %%A, [%%data] + vpaddd %%A2, %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + vpaddd %%A, %%A, %%FUN + %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2 + vpaddd %%A2, %%A2, %%FUN + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP + vpaddd %%A, %%A, %%B + vpaddd %%A2, %%A2, %%B2 +%endmacro + +;; +;; single MD5 step on two register sets, with separate scratch registers +;; (FUN/TMP and FUN2/TMP2) for each set +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data, +; MD5const, nrot +%macro MD5_STEP 16 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 
+%define %%TMP %11 +%define %%FUN2 %12 +%define %%TMP2 %13 +%define %%data %14 +%define %%MD5const %15 +%define %%nrot %16 + + vmovdqa %%TMP,[%%data] + vmovdqa %%TMP2,[%%data + 16*16] + vpaddd %%A, %%A, %%MD5const + vpaddd %%A2, %%A2, %%MD5const + vpaddd %%A, %%A, %%TMP + vpaddd %%A2, %%A2, %%TMP2 + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2 + vpaddd %%A, %%A, %%FUN + vpaddd %%A2, %%A2, %%FUN2 + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP2 + vpaddd %%A, %%A, %%B + vpaddd %%A2, %%A2, %%B2 +%endmacro + +; void md5_x4x2_avx(MD5_ARGS *args, UINT64 num_blks) +; arg 1 : pointer to MD5_ARGS structure +; arg 2 : number of blocks (>=1) +; +align 32 +MKGLOBAL(md5_x4x2_avx,function,internal) +md5_x4x2_avx: + + sub rsp, STACK_SIZE + + ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2 + ;; Initialize digests + vmovdqa A,[state + 0*16 + 0*MD5_DIGEST_ROW_SIZE] + vmovdqa B,[state + 0*16 + 1*MD5_DIGEST_ROW_SIZE] + vmovdqa C,[state + 0*16 + 2*MD5_DIGEST_ROW_SIZE] + vmovdqa D,[state + 0*16 + 3*MD5_DIGEST_ROW_SIZE] + + vmovdqa A2,[state + 1*16 + 0*MD5_DIGEST_ROW_SIZE] + vmovdqa B2,[state + 1*16 + 1*MD5_DIGEST_ROW_SIZE] + vmovdqa C2,[state + 1*16 + 2*MD5_DIGEST_ROW_SIZE] + vmovdqa D2,[state + 1*16 + 3*MD5_DIGEST_ROW_SIZE] + + lea TBL, [rel MD5_TABLE] + + ;; load input pointers + mov inp0,[state+_data_ptr_md5 +0*PTR_SZ] + mov inp1,[state+_data_ptr_md5 +1*PTR_SZ] + mov inp2,[state+_data_ptr_md5 +2*PTR_SZ] + mov inp3,[state+_data_ptr_md5 +3*PTR_SZ] + mov inp4,[state+_data_ptr_md5 +4*PTR_SZ] + mov inp5,[state+_data_ptr_md5 +5*PTR_SZ] + mov inp6,[state+_data_ptr_md5 +6*PTR_SZ] + mov inp7,[state+_data_ptr_md5 +7*PTR_SZ] + xor IDX, IDX + + ; Make ping-pong pointers to the two memory blocks + mov mem1, rsp + lea mem2, [rsp + 16*16*2] + +;; Load first block of data and save back to stack +%assign I 0 +%rep 4 + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu 
T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem1+(I*4+0)*16],T0 + vmovdqa [mem1+(I*4+1)*16],T1 + vmovdqa [mem1+(I*4+2)*16],T2 + vmovdqa [mem1+(I*4+3)*16],T3 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem1+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem1+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem1+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem1+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) +%endrep + +lloop: + ; save old digests + vmovdqa [AA], A + vmovdqa [BB], B + vmovdqa [CC], C + vmovdqa [DD], D + ; save old digests + vmovdqa [AA2], A2 + vmovdqa [BB2], B2 + vmovdqa [CC2], C2 + vmovdqa [DD2], D2 + + add IDX, 4*16 + sub num_blks, 1 + je lastblock + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14 + +%assign I 0 + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP1 
MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14 + + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24 + + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 
8*16, [TBL+27*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34 + + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP1 MAGIC_H, 
A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44 + + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, 
[TBL+60*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + + vpaddd A,A,[AA] + vpaddd B,B,[BB] + vpaddd C,C,[CC] + vpaddd D,D,[DD] + + vpaddd A2,A2,[AA2] + vpaddd B2,B2,[BB2] + vpaddd C2,C2,[CC2] + vpaddd D2,D2,[DD2] + + ; swap mem1 and mem2 + xchg mem1, mem2 + + jmp lloop + +lastblock: + + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, 
A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14 + + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24 + + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, 
FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34 + + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, 
[TBL+54*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44 + + vpaddd A,A,[AA] + vpaddd B,B,[BB] + vpaddd C,C,[CC] + vpaddd D,D,[DD] + + vpaddd A2,A2,[AA2] + vpaddd B2,B2,[BB2] + vpaddd C2,C2,[CC2] + vpaddd D2,D2,[DD2] + + ; write out digests + vmovdqu [state + 0*16 + 0*MD5_DIGEST_ROW_SIZE ], A + vmovdqu [state + 0*16 + 1*MD5_DIGEST_ROW_SIZE ], B + vmovdqu [state + 0*16 + 2*MD5_DIGEST_ROW_SIZE ], C + vmovdqu [state + 0*16 + 3*MD5_DIGEST_ROW_SIZE ], D + vmovdqu [state + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2 + vmovdqu [state + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2 + vmovdqu [state + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2 + vmovdqu [state + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2 + + ;; update input pointers + add inp0, IDX + add inp1, IDX + add inp2, IDX + add inp3, IDX + add inp4, IDX + add inp5, IDX + add inp6, IDX + add inp7, IDX + mov [state +_data_ptr_md5 + 0*PTR_SZ], inp0 + mov [state +_data_ptr_md5 + 1*PTR_SZ], inp1 + mov [state +_data_ptr_md5 + 2*PTR_SZ], inp2 + mov [state +_data_ptr_md5 + 3*PTR_SZ], inp3 + mov [state +_data_ptr_md5 + 4*PTR_SZ], inp4 + mov [state +_data_ptr_md5 + 5*PTR_SZ], inp5 + mov [state +_data_ptr_md5 + 6*PTR_SZ], inp6 + mov [state +_data_ptr_md5 + 7*PTR_SZ], inp7 + 
+ ;; Clear stack frame (72*16 bytes) +%ifdef SAFE_DATA + vpxor xmm0, xmm0 +%assign i 0 +%rep (2*2*16+8) + vmovdqa [rsp + i*16], xmm0 +%assign i (i+1) +%endrep +%endif + + ;;;;;;;;;;;;;;;; + ;; Postamble + add rsp, STACK_SIZE + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/pon_avx.asm b/src/spdk/intel-ipsec-mb/avx/pon_avx.asm new file mode 100644 index 000000000..8510dc4a3 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/pon_avx.asm @@ -0,0 +1,1170 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%use smartalign + +%include "job_aes_hmac.asm" +%include "include/os.asm" +%include "include/memcpy.asm" + +;;; This is implementation of stitched algorithms: AES128-CTR + CRC32 + BIP +;;; This combination is required by PON/xPON/gPON standard. +;;; Note: BIP is running XOR of double words +;;; Order of operations: +;;; - encrypt: HEC update (XGEM header), CRC32 (Ethernet FCS), AES-CTR and BIP +;;; - decrypt: BIP, AES-CTR and CRC32 (Ethernet FCS) + +extern byteswap_const +extern ddq_add_1 + +section .data +default rel + +;;; Precomputed constants for CRC32 (Ethernet FCS) +;;; Details of the CRC algorithm and 4 byte buffer of +;;; {0x01, 0x02, 0x03, 0x04}: +;;; Result Poly Init RefIn RefOut XorOut +;;; 0xB63CFBCD 0x04C11DB7 0xFFFFFFFF true true 0xFFFFFFFF +align 16 +rk1: + dq 0x00000000ccaa009e, 0x00000001751997d0 + +align 16 +rk5: + dq 0x00000000ccaa009e, 0x0000000163cd6124 + +align 16 +rk7: + dq 0x00000001f7011640, 0x00000001db710640 + +align 16 +pshufb_shf_table: + ;; use these values for shift registers with the pshufb instruction + dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 + dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +align 16 +init_crc_value: + dq 0x00000000FFFFFFFF, 0x0000000000000000 + +align 16 +mask: + dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 + +align 16 +mask2: + dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF +align 16 +mask3: + dq 0x8080808080808080, 0x8080808080808080 + +align 16 +mask_out_top_bytes: + dq 
0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF + dq 0x0000000000000000, 0x0000000000000000 + +align 16 +ddq_add_1_1: + dq 0x1, 0x1 + +;; Precomputed constants for HEC calculation (XGEM header) +;; POLY 0x53900000: +;; k1 = 0xf9800000 +;; k2 = 0xa0900000 +;; k3 = 0x7cc00000 +;; q = 0x46b927ec +;; p_res = 0x53900000 + +align 16 +k3_q: + dq 0x7cc00000, 0x46b927ec + +align 16 +p_res: + dq 0x53900000, 0 + +align 16 +mask_out_top_64bits: + dq 0xffffffff_ffffffff, 0 + +section .text + +%define NUM_AES_ROUNDS 10 + +%define xcounter xmm0 +%define xbip xmm1 +%define xcrc xmm2 +%define xcrckey xmm3 +%define xtmp1 xmm4 +%define xtmp2 xmm5 +%define xtmp3 xmm6 +%define xtmp4 xmm7 +%define xtmp5 xmm8 +%define xtmp6 xmm9 +%define xtmp7 xmm10 +%define xtmp8 xmm11 +%define xtmp9 xmm12 +%define xtmp10 xmm13 +%define xtmp11 xmm14 + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%define tmp_1 r8 +%define tmp_2 r9 +%define tmp_3 r10 +%define tmp_4 r11 +%define tmp_5 r12 +%define tmp_6 r13 +%define tmp_7 r14 +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%define tmp_1 r10 +%define tmp_2 r11 +%define tmp_3 rax +%define tmp_4 r12 +%define tmp_5 r13 +%define tmp_6 r14 +%define tmp_7 r15 +%endif + +%define job arg1 + +%define p_in arg2 +%define p_keys arg3 +%define p_out arg4 + +%define num_bytes tmp_1 ; bytes to cipher +%define tmp tmp_2 +%define ctr_check tmp_3 ; counter block overflow check +%define bytes_to_crc tmp_4 ; number of bytes to crc ( < num_bytes) + +%define ethernet_fcs tmp_6 ; not used together with tmp3 +%define tmp2 tmp_5 +%define tmp3 tmp_6 + +%define write_back_crc tmp_7 +%define decrypt_not_done tmp_7 + +;;; ============================================================================ +;;; Does all AES encryption rounds +%macro AES_ENC_ROUNDS 3 +%define %%KP %1 ; [in] pointer to expanded keys +%define %%N_ROUNDS %2 ; [in] max rounds (128bit: 10, 12, 14) +%define %%BLOCK %3 ; [in/out] XMM with encrypted block + 
+%assign round 0 + vpxor %%BLOCK, %%BLOCK, [%%KP + (round * 16)] + +%rep (%%N_ROUNDS - 1) +%assign round (round + 1) + vaesenc %%BLOCK, %%BLOCK, [%%KP + (round * 16)] +%endrep + +%assign round (round + 1) + vaesenclast %%BLOCK, %%BLOCK, [%%KP + (round * 16)] + +%endmacro + +;;; ============================================================================ +;;; Does all AES encryption rounds on 4 blocks +%macro AES_ENC_ROUNDS_4 7 +%define %%KP %1 ; [in] pointer to expanded keys +%define %%N_ROUNDS %2 ; [in] max rounds (128bit: 10, 12, 14) +%define %%BLOCK1 %3 ; [in/out] XMM with encrypted block +%define %%BLOCK2 %4 ; [in/out] XMM with encrypted block +%define %%BLOCK3 %5 ; [in/out] XMM with encrypted block +%define %%BLOCK4 %6 ; [in/out] XMM with encrypted block +%define %%XT1 %7 ; [clobbered] temporary XMM register + +%assign round 0 + vmovdqa %%XT1, [%%KP + (round * 16)] + vpxor %%BLOCK1, %%BLOCK1, %%XT1 + vpxor %%BLOCK2, %%BLOCK2, %%XT1 + vpxor %%BLOCK3, %%BLOCK3, %%XT1 + vpxor %%BLOCK4, %%BLOCK4, %%XT1 + +%rep (%%N_ROUNDS - 1) +%assign round (round + 1) + vmovdqa %%XT1, [%%KP + (round * 16)] + vaesenc %%BLOCK1, %%BLOCK1, %%XT1 + vaesenc %%BLOCK2, %%BLOCK2, %%XT1 + vaesenc %%BLOCK3, %%BLOCK3, %%XT1 + vaesenc %%BLOCK4, %%BLOCK4, %%XT1 +%endrep + +%assign round (round + 1) + vmovdqa %%XT1, [%%KP + (round * 16)] + vaesenclast %%BLOCK1, %%BLOCK1, %%XT1 + vaesenclast %%BLOCK2, %%BLOCK2, %%XT1 + vaesenclast %%BLOCK3, %%BLOCK3, %%XT1 + vaesenclast %%BLOCK4, %%BLOCK4, %%XT1 +%endmacro + +;;; ============================================================================ +;;; CRC multiply before XOR against data block +%macro CRC_CLMUL 3 +%define %%XCRC_IN_OUT %1 ; [in/out] XMM with CRC (can be anything if "no_crc" below) +%define %%XCRC_MUL %2 ; [in] XMM with CRC constant (can be anything if "no_crc" below) +%define %%XTMP %3 ; [clobbered] temporary XMM + + vpclmulqdq %%XTMP, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01 + vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10 + vpxor 
%%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XTMP +%endmacro + +;;; ============================================================================ +;;; PON stitched algorithm round on a single AES block (16 bytes): +;;; AES-CTR (optional, depending on %%CIPH) +;;; - prepares counter block +;;; - encrypts counter block +;;; - loads text +;;; - xor's text against encrypted blocks +;;; - stores cipher text +;;; BIP +;;; - BIP update on 4 x 32-bits +;;; CRC32 +;;; - CRC32 calculation +;;; Note: via selection of no_crc, no_bip, no_load, no_store different macro +;;; behaviour can be achieved to match needs of the overall algorithm. +%macro DO_PON 15 +%define %%KP %1 ; [in] GP, pointer to expanded keys +%define %%N_ROUNDS %2 ; [in] number of AES rounds (10, 12 or 14) +%define %%CTR %3 ; [in/out] XMM with counter block +%define %%INP %4 ; [in/out] GP with input text pointer or "no_load" +%define %%OUTP %5 ; [in/out] GP with output text pointer or "no_store" +%define %%XBIP_IN_OUT %6 ; [in/out] XMM with BIP value or "no_bip" +%define %%XCRC_IN_OUT %7 ; [in/out] XMM with CRC (can be anything if "no_crc" below) +%define %%XCRC_MUL %8 ; [in] XMM with CRC constant (can be anything if "no_crc" below) +%define %%TXMM0 %9 ; [clobbered|out] XMM temporary or data out (no_store) +%define %%TXMM1 %10 ; [clobbered|in] XMM temporary or data in (no_load) +%define %%TXMM2 %11 ; [clobbered] XMM temporary +%define %%CRC_TYPE %12 ; [in] "first_crc" or "next_crc" or "no_crc" +%define %%DIR %13 ; [in] "ENC" or "DEC" +%define %%CIPH %14 ; [in] "CTR" or "NO_CTR" +%define %%CTR_CHECK %15 ; [in/out] GP with 64bit counter (to identify overflow) + +%ifidn %%CIPH, CTR + ;; prepare counter blocks for encryption + vpshufb %%TXMM0, %%CTR, [rel byteswap_const] + ;; perform 1 increment on whole 128 bits + add %%CTR_CHECK, 1 + jc %%_ctr_overflow + vpaddq %%CTR, %%CTR, [rel ddq_add_1] + jmp %%_ctr_overflow_done +%%_ctr_overflow: + vpaddq %%CTR, %%CTR, [rel ddq_add_1_1] +%%_ctr_overflow_done: +%endif + + ;; CRC 
calculation +%ifidn %%CRC_TYPE, next_crc + ;; CRC_MUL macro could be used here but its xor affects + ;; performance (blocks cipher xor's) so doing CLMUL + ;; only here and xor is done after the cipher. + vpclmulqdq %%TXMM2, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01 + vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10 +%endif + +%ifnidn %%INP, no_load + vmovdqu %%TXMM1, [%%INP] +%endif + +%ifidn %%CIPH, CTR + ;; AES rounds + AES_ENC_ROUNDS %%KP, %%N_ROUNDS, %%TXMM0 + + ;; xor plaintext/ciphertext against encrypted counter blocks + vpxor %%TXMM0, %%TXMM0, %%TXMM1 +%else ;; CIPH = NO_CTR + ;; register copy is needed as no_load/no_store options need it + vmovdqa %%TXMM0, %%TXMM1 +%endif ;; CIPH = CTR + +%ifnidn %%CRC_TYPE, no_crc +%ifidn %%CRC_TYPE, next_crc + ;; Finish split CRC_MUL() operation + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM2 +%endif +%ifidn %%CIPH, CTR + ;; CRC calculation for ENCRYPTION/DECRYPTION + ;; - always XOR against plaintext block +%ifidn %%DIR, ENC + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM1 +%else + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM0 +%endif ; DECRYPT +%else ;; CIPH = NO_CTR + ;; CRC calculation for NO CIPHER option + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM1 +%endif ;; CIPH = CTR +%endif ;; CRC_TYPE != NO_CRC + + ;; store the result in the output buffer +%ifnidn %%OUTP, no_store +%ifidn %%CIPH, CTR + vmovdqu [%%OUTP], %%TXMM0 +%else ;; CIPH = NO_CTR + vmovdqu [%%OUTP], %%TXMM1 +%endif ;; CIPH = CTR +%endif + + ;; update BIP value - always use cipher text for BIP +%ifnidn %%XBIP_IN_OUT, no_bip +%ifidn %%CIPH, CTR +%ifidn %%DIR, ENC + vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM0 +%else + vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM1 +%endif ; DECRYPT +%else ;; CIPH = NO_CTR + vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM1 +%endif ;; CIPH = CTR +%endif ;; !NO_BIP + + ;; increment in/out pointers +%ifnidn %%INP, no_load + add %%INP, 16 +%endif +%ifnidn %%OUTP, no_store + add %%OUTP, 16 +%endif +%endmacro ; DO_PON + +;;; 
============================================================================
+;;; PON stitched algorithm round on four AES blocks (64 bytes):
+;;;   AES-CTR (optional, depending on %%CIPH)
+;;;   - prepares four consecutive counter blocks
+;;;   - encrypts the counter blocks
+;;;   - loads 64 bytes of text
+;;;   - xor's text against encrypted blocks
+;;;   - stores 64 bytes of cipher text (NO_CTR stores the loaded text unchanged)
+;;;   BIP
+;;;   - BIP update with all four blocks
+;;;   CRC32
+;;;   - CRC32 calculation over all four blocks
+;;; Note: unlike DO_PON, this macro always loads, stores, updates BIP and
+;;;       updates CRC - no_load, no_store, no_bip and no_crc are not handled
+;;;       here. Both text pointers are advanced by 64 bytes.
+%macro DO_PON_4 23
+%define %%KP            %1      ; [in] GP, pointer to expanded keys
+%define %%N_ROUNDS      %2      ; [in] number of AES rounds (10, 12 or 14)
+%define %%CTR           %3      ; [in/out] XMM with counter block
+%define %%INP           %4      ; [in/out] GP with input text pointer (advanced by 64)
+%define %%OUTP          %5      ; [in/out] GP with output text pointer (advanced by 64)
+%define %%XBIP_IN_OUT   %6      ; [in/out] XMM with BIP value
+%define %%XCRC_IN_OUT   %7      ; [in/out] XMM with CRC
+%define %%XCRC_MUL      %8      ; [in] XMM with CRC constant
+%define %%T0            %9      ; [clobbered] XMM temporary
+%define %%T1            %10     ; [clobbered] XMM temporary
+%define %%T2            %11     ; [clobbered] XMM temporary
+%define %%T3            %12     ; [clobbered] XMM temporary
+%define %%T4            %13     ; [clobbered] XMM temporary
+%define %%T5            %14     ; [clobbered] XMM temporary
+%define %%T6            %15     ; [clobbered] XMM temporary
+%define %%T7            %16     ; [clobbered] XMM temporary
+%define %%T8            %17     ; [clobbered] XMM temporary
+%define %%T9            %18     ; [clobbered] XMM temporary
+%define %%T10           %19     ; [clobbered] XMM temporary
+%define %%CRC_TYPE      %20     ; [in] "first_crc" or "next_crc" (only next_crc is tested for)
+%define %%DIR           %21     ; [in] "ENC" or "DEC"
+%define %%CIPH          %22     ; [in] "CTR" or "NO_CTR"
+%define %%CTR_CHECK     %23     ; [in/out] GP with 64bit counter (to identify overflow)
+
+%define %%CTR1          %%T3
+%define %%CTR2          %%T4
+%define %%CTR3          %%T5
+%define %%CTR4          %%T6
+
+%define %%TXT1          %%T7
+%define 
%%TXT2 %%T8 +%define %%TXT3 %%T9 +%define %%TXT4 %%T10 + +%ifidn %%CIPH, CTR + ;; prepare counter blocks for encryption + vmovdqa %%T0, [rel ddq_add_1] + vmovdqa %%T2, [rel byteswap_const] + + ;; CTR1: copy saved CTR value as CTR1 + vmovdqa %%CTR1, %%CTR + + cmp %%CTR_CHECK, 0xffff_ffff_ffff_ffff - 4 + ja %%_ctr_will_overflow + + ;; case in which 64-bit counter will not overflow + vpaddq %%CTR2, %%CTR1, %%T0 + vpaddq %%CTR3, %%CTR2, %%T0 + vpaddq %%CTR4, %%CTR3, %%T0 + vpaddq %%CTR, %%CTR4, %%T0 + vpshufb %%CTR1, %%CTR1, %%T2 + vpshufb %%CTR2, %%CTR2, %%T2 + vpshufb %%CTR3, %%CTR3, %%T2 + vpshufb %%CTR4, %%CTR4, %%T2 + add %%CTR_CHECK, 4 + jmp %%_ctr_update_done + +%%_ctr_will_overflow: + vmovdqa %%T1, [rel ddq_add_1_1] + ;; CTR2: perform 1 increment on whole 128 bits + add %%CTR_CHECK, 1 + jc %%_ctr2_overflow + vpaddq %%CTR2, %%CTR1, %%T0 + jmp %%_ctr2_overflow_done +%%_ctr2_overflow: + vpaddq %%CTR2, %%CTR1, %%T1 +%%_ctr2_overflow_done: + vpshufb %%CTR1, %%CTR1, %%T2 + + ;; CTR3: perform 1 increment on whole 128 bits + add %%CTR_CHECK, 1 + jc %%_ctr3_overflow + vpaddq %%CTR3, %%CTR2, %%T0 + jmp %%_ctr3_overflow_done +%%_ctr3_overflow: + vpaddq %%CTR3, %%CTR2, %%T1 +%%_ctr3_overflow_done: + vpshufb %%CTR2, %%CTR2, %%T2 + + ;; CTR4: perform 1 increment on whole 128 bits + add %%CTR_CHECK, 1 + jc %%_ctr4_overflow + vpaddq %%CTR4, %%CTR3, %%T0 + jmp %%_ctr4_overflow_done +%%_ctr4_overflow: + vpaddq %%CTR4, %%CTR3, %%T1 +%%_ctr4_overflow_done: + vpshufb %%CTR3, %%CTR3, %%T2 + + ;; CTR: perform 1 increment on whole 128 bits (for the next iteration) + add %%CTR_CHECK, 1 + jc %%_ctr_overflow + vpaddq %%CTR, %%CTR4, %%T0 + jmp %%_ctr_overflow_done +%%_ctr_overflow: + vpaddq %%CTR, %%CTR4, %%T1 +%%_ctr_overflow_done: + vpshufb %%CTR4, %%CTR4, %%T2 +%%_ctr_update_done: +%endif + +%ifidn %%CRC_TYPE, next_crc + ;; CRC_MUL macro could be used here but its xor affects + ;; performance (blocks cipher xor's) so doing CLMUL + ;; only here and xor is done after the cipher. 
+ vpclmulqdq %%T2, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01 + vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10 +%endif + + ;; load plaintext/ciphertext + vmovdqu %%TXT1, [%%INP] + vmovdqu %%TXT2, [%%INP + 16] + vmovdqu %%TXT3, [%%INP + 32] + vmovdqu %%TXT4, [%%INP + 48] + +%ifidn %%CIPH, CTR + AES_ENC_ROUNDS_4 %%KP, %%N_ROUNDS, %%CTR1, %%CTR2, %%CTR3, %%CTR4, %%T0 + + ;; xor plaintext/ciphertext against encrypted counter blocks + vpxor %%CTR1, %%CTR1, %%TXT1 + vpxor %%CTR2, %%CTR2, %%TXT2 + vpxor %%CTR3, %%CTR3, %%TXT3 + vpxor %%CTR4, %%CTR4, %%TXT4 +%endif ;; CIPH = CTR + +%ifidn %%CRC_TYPE, next_crc + ;; Finish split CRC_MUL() operation + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%T2 +%endif +%ifidn %%CIPH, CTR +%ifidn %%DIR, ENC + ;; CRC calculation for ENCRYPTION (blocks 1 & 2) + ;; - XOR CRC against plaintext block + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT1 + + CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2 + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT2 + + CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2 +%else + ;; CRC calculation for DECRYPTION (blocks 1 & 2) + ;; - XOR CRC against plaintext block + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR1 + + CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2 + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR2 + + CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2 +%endif ; DECRYPT +%else ;; CIPH = NO_CTR + ;; CRC calculation for NO CIPHER option (blocks 1 & 2) + ;; - XOR CRC against plaintext block + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT1 + + CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2 + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT2 + + CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2 +%endif ;; CIPH = CTR + + ;; store ciphertext/plaintext +%ifidn %%CIPH, CTR + vmovdqu [%%OUTP], %%CTR1 + vmovdqu [%%OUTP + 16], %%CTR2 + vmovdqu [%%OUTP + 32], %%CTR3 + vmovdqu [%%OUTP + 48], %%CTR4 +%else ;; CIPH = NO_CTR + vmovdqu [%%OUTP], %%TXT1 + vmovdqu [%%OUTP + 16], %%TXT2 + vmovdqu [%%OUTP + 32], %%TXT3 + vmovdqu [%%OUTP + 48], %%TXT4 +%endif ;; CIPH = CTR + + ;; update BIP value 
+%ifidn %%CIPH, CTR + ;; - always use ciphertext for BIP +%ifidn %%DIR, ENC + vpxor %%T0, %%CTR1, %%CTR2 + vpxor %%T1, %%CTR3, %%CTR4 +%else + vpxor %%T0, %%TXT1, %%TXT2 + vpxor %%T1, %%TXT3, %%TXT4 +%endif ; DECRYPT +%else ;; CIPH = NO_CTR + vpxor %%T0, %%TXT1, %%TXT2 + vpxor %%T1, %%TXT3, %%TXT4 +%endif ;; CIPH = CTR + vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%T0 + vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%T1 + + ;; increment in/out pointers + add %%INP, 64 + add %%OUTP, 64 + +%ifidn %%CIPH, CTR +%ifidn %%DIR, ENC + ;; CRC calculation for ENCRYPTION (blocks 3 & 4) + ;; - XOR CRC against plaintext block + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT3 + + CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2 + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT4 +%else + ;; CRC calculation for DECRYPTION (blocks 3 & 4) + ;; - XOR CRC against plaintext block + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR3 + + CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2 + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR4 +%endif ; DECRYPT +%else ;; CIPH = NO_CTR + ;; CRC calculation for NO CIPHER option (blocks 3 & 4) + ;; - XOR CRC against plaintext block + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT3 + + CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2 + vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT4 +%endif ;; CIPH = CTR + +%endmacro ; DO_PON_4 + +;;; ============================================================================ +;;; CIPHER and BIP specified number of bytes +%macro CIPHER_BIP_REST 14 +%define %%NUM_BYTES %1 ; [in/clobbered] number of bytes to cipher +%define %%DIR %2 ; [in] "ENC" or "DEC" +%define %%CIPH %3 ; [in] "CTR" or "NO_CTR" +%define %%PTR_IN %4 ; [in/clobbered] GPR pointer to input buffer +%define %%PTR_OUT %5 ; [in/clobbered] GPR pointer to output buffer +%define %%PTR_KEYS %6 ; [in] GPR pointer to expanded keys +%define %%XBIP_IN_OUT %7 ; [in/out] XMM 128-bit BIP state +%define %%XCTR_IN_OUT %8 ; [in/out] XMM 128-bit AES counter block +%define %%XMMT1 %9 ; [clobbered] temporary XMM +%define %%XMMT2 %10 ; 
[clobbered] temporary XMM +%define %%XMMT3 %11 ; [clobbered] temporary XMM +%define %%CTR_CHECK %12 ; [in/out] GP with 64bit counter (to identify overflow) +%define %%GPT1 %13 ; [clobbered] temporary GP +%define %%GPT2 %14 ; [clobbered] temporary GP + + align 16 +%%_cipher_last_blocks: + cmp %%NUM_BYTES, 16 + jb %%_partial_block_left + + DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, %%PTR_IN, %%PTR_OUT, %%XBIP_IN_OUT, \ + no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK + sub %%NUM_BYTES, 16 + jz %%_bip_done + jmp %%_cipher_last_blocks + +%%_partial_block_left: + simd_load_avx_15_1 %%XMMT2, %%PTR_IN, %%NUM_BYTES + + ;; DO_PON() is not loading nor storing the data in this case: + ;; XMMT2 = data in + ;; XMMT1 = data out + DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, no_load, no_store, no_bip, \ + no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK + + ;; bip update for partial block (mask out bytes outside the message) + lea %%GPT1, [rel mask_out_top_bytes + 16] + sub %%GPT1, %%NUM_BYTES + vmovdqu %%XMMT3, [%%GPT1] + ;; put masked cipher text into XMMT2 for BIP update +%ifidn %%DIR, ENC + vpand %%XMMT2, %%XMMT1, %%XMMT3 +%else + vpand %%XMMT2, %%XMMT2, %%XMMT3 +%endif + vpxor %%XBIP_IN_OUT, %%XMMT2 + + ;; store partial bytes in the output buffer + simd_store_avx_15 %%PTR_OUT, %%XMMT1, %%NUM_BYTES, %%GPT1, %%GPT2 + +%%_bip_done: +%endmacro ; CIPHER_BIP_REST + +;; ============================================================================= +;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial + +%macro CRC32_REDUCE_128_TO_32 5 +%define %%CRC %1 ; [out] GP to store 32-bit Ethernet FCS value +%define %%XCRC %2 ; [in/clobbered] XMM with CRC +%define %%XT1 %3 ; [clobbered] temporary xmm register +%define %%XT2 %4 ; [clobbered] temporary xmm register +%define %%XT3 %5 ; [clobbered] temporary xmm register + +%define %%XCRCKEY %%XT3 + + ;; compute crc of a 128-bit value + vmovdqa 
%%XCRCKEY, [rel rk5] + + ;; 64b fold + vpclmulqdq %%XT1, %%XCRC, %%XCRCKEY, 0x00 + vpsrldq %%XCRC, %%XCRC, 8 + vpxor %%XCRC, %%XCRC, %%XT1 + + ;; 32b fold + vpslldq %%XT1, %%XCRC, 4 + vpclmulqdq %%XT1, %%XT1, %%XCRCKEY, 0x10 + vpxor %%XCRC, %%XCRC, %%XT1 + +%%_crc_barrett: + ;; Barrett reduction + vpand %%XCRC, [rel mask2] + vmovdqa %%XT1, %%XCRC + vmovdqa %%XT2, %%XCRC + vmovdqa %%XCRCKEY, [rel rk7] + + vpclmulqdq %%XCRC, %%XCRCKEY, 0x00 + vpxor %%XCRC, %%XT2 + vpand %%XCRC, [rel mask] + vmovdqa %%XT2, %%XCRC + vpclmulqdq %%XCRC, %%XCRCKEY, 0x10 + vpxor %%XCRC, %%XT2 + vpxor %%XCRC, %%XT1 + vpextrd DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value + not DWORD(%%CRC) +%endmacro + +;; ============================================================================= +;; Barrett reduction from 128-bits to 32-bits modulo 0x53900000 polynomial + +%macro HEC_REDUCE_128_TO_32 4 +%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out +%define %%XT1 %2 ; [clobbered] temporary xmm register +%define %%XT2 %3 ; [clobbered] temporary xmm register +%define %%XT3 %4 ; [clobbered] temporary xmm register + +%define %%K3_Q %%XT1 +%define %%P_RES %%XT2 +%define %%XTMP %%XT3 + + ;; 128 to 64 bit reduction + vmovdqa %%K3_Q, [k3_q] + vmovdqa %%P_RES, [p_res] + + vpclmulqdq %%XTMP, %%XMM_IN_OUT, %%K3_Q, 0x01 ; K3 + vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT + + vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x01 ; K3 + vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT + + vpand %%XMM_IN_OUT, [rel mask_out_top_64bits] + + ;; 64 to 32 bit reduction + vpsrldq %%XTMP, %%XMM_IN_OUT, 4 + vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x10 ; Q + vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT + vpsrldq %%XTMP, %%XTMP, 4 + + vpclmulqdq %%XTMP, %%XTMP, %%P_RES, 0x00 ; P + vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT +%endmacro + +;; ============================================================================= +;; Barrett reduction from 64-bits to 32-bits modulo 0x53900000 polynomial + +%macro HEC_REDUCE_64_TO_32 4 +%define %%XMM_IN_OUT %1 ; [in/out] 
xmm register with data in and out
+%define %%XT1           %2      ; [clobbered] temporary xmm register
+%define %%XT2           %3      ; [clobbered] temporary xmm register
+%define %%XT3           %4      ; [clobbered] temporary xmm register
+
+%define %%K3_Q          %%XT1
+%define %%P_RES         %%XT2
+%define %%XTMP          %%XT3
+
+        vmovdqa         %%K3_Q, [k3_q]
+        vmovdqa         %%P_RES, [p_res]
+
+        ;; 64 to 32 bit reduction
+        vpsrldq         %%XTMP, %%XMM_IN_OUT, 4
+        vpclmulqdq      %%XTMP, %%XTMP, %%K3_Q, 0x10 ; Q
+        vpxor           %%XTMP, %%XTMP, %%XMM_IN_OUT
+        vpsrldq         %%XTMP, %%XTMP, 4
+
+        vpclmulqdq      %%XTMP, %%XTMP, %%P_RES, 0x00 ; P
+        vpxor           %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
+%endmacro
+
+;; =============================================================================
+;; HEC compute and header update for 32-bit XGEM headers
+;; - works on the low 32 bits of %%HEC_IN_OUT (through DWORD() register views)
+;; - the 13 least significant header bits (current HEC) are dropped from the
+;;   CRC input and cleared in the header
+;; - the reduced CRC is merged back into the header and bit 0 is set to the
+;;   parity (popcnt & 1) of the merged value
+;; - takes 6 arguments: arguments 3 to 6 are the four XMM temporaries,
+;;   the same layout as HEC_COMPUTE_64
+%macro HEC_COMPUTE_32 6
+%define %%HEC_IN_OUT    %1      ; [in/out] GP register with HEC in LE format
+%define %%GT1           %2      ; [clobbered] temporary GP register
+%define %%XT1           %3      ; [clobbered] temporary xmm register
+%define %%XT2           %4      ; [clobbered] temporary xmm register
+%define %%XT3           %5      ; [clobbered] temporary xmm register
+%define %%XT4           %6      ; [clobbered] temporary xmm register
+
+        mov             DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+        ;; shift out 13 bits of HEC value for CRC computation
+        shr             DWORD(%%GT1), 13
+
+        ;; mask out current HEC value to merge with an updated HEC at the end
+        and             DWORD(%%HEC_IN_OUT), 0xffff_e000
+
+        ;; prepare the message for CRC computation
+        vmovd           %%XT1, DWORD(%%GT1)
+        vpslldq         %%XT1, 4 ; shift left by 32-bits
+
+        HEC_REDUCE_64_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+        ;; extract 32-bit value
+        ;; - normally perform 20 bit shift right but bit 0 is a parity bit
+        vmovd           DWORD(%%GT1), %%XT1
+        shr             DWORD(%%GT1), (20 - 1)
+
+        ;; merge header bytes with updated 12-bit CRC value and
+        ;; compute parity
+        or              DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+        popcnt          DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
+        and             DWORD(%%HEC_IN_OUT), 1
+        or              DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
+%endmacro
+
+;; =============================================================================
+;; 
HEC compute and header update for 64-bit XGEM headers
+;; - the 13 least significant header bits (current HEC) are dropped from the
+;;   CRC input and cleared in the header
+;; - the remaining bits are reduced with HEC_REDUCE_128_TO_32
+;; - the reduced CRC is merged back into the header and bit 0 is set to the
+;;   parity (popcnt & 1) of the merged value
+%macro HEC_COMPUTE_64 6
+%define %%HEC_IN_OUT    %1      ; [in/out] GP register with HEC in LE format
+%define %%GT1           %2      ; [clobbered] temporary GP register
+%define %%XT1           %3      ; [clobbered] temporary xmm register
+%define %%XT2           %4      ; [clobbered] temporary xmm register
+%define %%XT3           %5      ; [clobbered] temporary xmm register
+%define %%XT4           %6      ; [clobbered] temporary xmm register
+
+        mov             %%GT1, %%HEC_IN_OUT
+        ;; shift out 13 bits of HEC value for CRC computation
+        shr             %%GT1, 13
+
+        ;; mask out current HEC value to merge with an updated HEC at the end
+        and             %%HEC_IN_OUT, 0xffff_ffff_ffff_e000
+
+        ;; prepare the message for CRC computation
+        vmovq           %%XT1, %%GT1
+        vpslldq         %%XT1, 4 ; shift left by 32-bits
+
+        HEC_REDUCE_128_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+        ;; extract 32-bit value
+        ;; - normally perform 20 bit shift right but bit 0 is a parity bit
+        vmovd           DWORD(%%GT1), %%XT1
+        shr             DWORD(%%GT1), (20 - 1)
+
+        ;; merge header bytes with updated 12-bit CRC value and
+        ;; compute parity
+        or              %%GT1, %%HEC_IN_OUT
+        popcnt          %%HEC_IN_OUT, %%GT1
+        and             %%HEC_IN_OUT, 1
+        or              %%HEC_IN_OUT, %%GT1
+%endmacro
+
+;;; ============================================================================
+;;; PON stitched algorithm of AES128-CTR, CRC and BIP
+;;; - this is master macro that implements encrypt/decrypt API
+;;; - calls other macros and directly uses registers
+;;;   defined at the top of the file
+%macro AES128_CTR_PON 2
+%define %%DIR   %1              ; [in] direction "ENC" or "DEC"
+%define %%CIPH  %2              ; [in] cipher "CTR" or "NO_CTR"
+
+        ;; save r12-r14 (and r15 on non-Linux builds): these registers back
+        ;; the tmp_N aliases used below and are restored at the end of the macro
+        push            r12
+        push            r13
+        push            r14
+%ifndef LINUX
+        push            r15
+%endif
+
+%ifidn %%DIR, ENC
+        ;; by default write back CRC for encryption
+        mov             DWORD(write_back_crc), 1
+%else
+        ;; mark decryption as finished
+        ;; - note: despite its name, a non-zero decrypt_not_done means the
+        ;;   message gets decrypted on the CRC path and the final
+        ;;   CIPHER_BIP_REST pass is skipped; zero requests that final pass
+        mov             DWORD(decrypt_not_done), 1
+%endif
+        ;; START BIP (and update HEC if encrypt direction)
+        ;; - load XGEM header (8 bytes) for BIP (not part of encrypted payload)
+        ;; - convert it into LE
+        ;; - update HEC field in the 
header + ;; - convert it into BE + ;; - store back the header (with updated HEC) + ;; - start BIP + ;; (free to use tmp_1, tmp2 and tmp_3 at this stage) + mov tmp_2, [job + _src] + add tmp_2, [job + _hash_start_src_offset_in_bytes] + mov tmp_3, [tmp_2] +%ifidn %%DIR, ENC + bswap tmp_3 ; go to LE + HEC_COMPUTE_64 tmp_3, tmp_1, xtmp1, xtmp2, xtmp3, xtmp4 + mov bytes_to_crc, tmp_3 + shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits + bswap tmp_3 ; go back to BE + mov [tmp_2], tmp_3 + vmovq xbip, tmp_3 +%else + vmovq xbip, tmp_3 + mov bytes_to_crc, tmp_3 + bswap bytes_to_crc ; go to LE + shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits +%endif + cmp bytes_to_crc, 4 + ja %%_crc_not_zero + ;; XGEM payload shorter or equal to 4 bytes +%ifidn %%DIR, ENC + ;; On encryption, do not write Ethernet FCS back into the message + xor DWORD(write_back_crc), DWORD(write_back_crc) +%else + ;; Mark decryption as not finished + ;; - Ethernet FCS is not computed + ;; - decrypt + BIP to be done at the end + xor DWORD(decrypt_not_done), DWORD(decrypt_not_done) +%endif + mov DWORD(bytes_to_crc), 4 ; it will be zero after the next line (avoid jmp) +%%_crc_not_zero: + sub bytes_to_crc, 4 ; subtract size of the CRC itself + +%ifidn %%CIPH, CTR + ;; - read 16 bytes of IV + ;; - convert to little endian format + ;; - save least significant 8 bytes in GP register for overflow check + mov tmp, [job + _iv] + vmovdqu xcounter, [tmp] + vpshufb xcounter, [rel byteswap_const] + vmovq ctr_check, xcounter +%endif + + ;; get input buffer (after XGEM header) + mov p_in, [job + _src] + add p_in, [job + _cipher_start_src_offset_in_bytes] + + ;; get output buffer + mov p_out, [job + _dst] + +%ifidn %%CIPH, CTR + ;; get key pointers + mov p_keys, [job + _aes_enc_key_expanded] +%endif + + ;; initial CRC value + vmovdqa xcrc, [rel init_crc_value] + + ;; load CRC constants + vmovdqa xcrckey, [rel rk1] ; rk1 and rk2 in xcrckey + + ;; get number of bytes to cipher +%ifidn %%CIPH, CTR + mov num_bytes, [job + 
_msg_len_to_cipher_in_bytes] +%else + ;; Message length to cipher is 0 + ;; - length is obtained from message length to hash (BIP) minus XGEM header size + mov num_bytes, [job + _msg_len_to_hash_in_bytes] + sub num_bytes, 8 +%endif + or bytes_to_crc, bytes_to_crc + jz %%_crc_done + + cmp bytes_to_crc, 32 + jae %%_at_least_32_bytes + +%ifidn %%DIR, DEC + ;; decrypt the buffer first + mov tmp, num_bytes + CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \ + xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3 + + ;; correct in/out pointers - go back to start of the buffers + mov tmp, num_bytes + and tmp, -16 ; partial block handler doesn't increment pointers + sub p_in, tmp + sub p_out, tmp +%endif ; DECRYPTION + + ;; less than 32 bytes + cmp bytes_to_crc, 16 + je %%_exact_16_left + jl %%_less_than_16_left + ;; load the plaintext +%ifidn %%DIR, ENC + vmovdqu xtmp1, [p_in] +%else + vmovdqu xtmp1, [p_out] +%endif + vpxor xcrc, xtmp1 ; xor the initial crc value + jmp %%_crc_two_xmms + +%%_exact_16_left: +%ifidn %%DIR, ENC + vmovdqu xtmp1, [p_in] +%else + vmovdqu xtmp1, [p_out] +%endif + vpxor xcrc, xtmp1 ; xor the initial crc value + jmp %%_128_done + +%%_less_than_16_left: +%ifidn %%DIR, ENC + simd_load_avx_15_1 xtmp1, p_in, bytes_to_crc +%else + simd_load_avx_15_1 xtmp1, p_out, bytes_to_crc +%endif + vpxor xcrc, xtmp1 ; xor the initial crc value + + lea tmp, [rel pshufb_shf_table] + vmovdqu xtmp1, [tmp + bytes_to_crc] + vpshufb xcrc, xtmp1 + jmp %%_128_done + +%%_at_least_32_bytes: + cmp bytes_to_crc, 64 + jb %%_crc_below_64_bytes + + DO_PON_4 p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \ + xcrc, xcrckey, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, xtmp6, \ + xtmp7, xtmp8, xtmp9, xtmp10, xtmp11, first_crc, %%DIR, \ + %%CIPH, ctr_check + sub num_bytes, 64 + sub bytes_to_crc, 64 +%ifidn %%DIR, ENC + jz %%_128_done +%endif + + align 16 +%%_main_loop_64: + cmp bytes_to_crc, 64 + jb %%_main_loop + + DO_PON_4 p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, 
xbip, \ + xcrc, xcrckey, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, xtmp6, \ + xtmp7, xtmp8, xtmp9, xtmp10, xtmp11, next_crc, %%DIR, \ + %%CIPH, ctr_check + sub num_bytes, 64 + sub bytes_to_crc, 64 +%ifidn %%DIR, ENC + jz %%_128_done +%endif + jmp %%_main_loop_64 + +%%_crc_below_64_bytes: + DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \ + xcrc, xcrckey, xtmp1, xtmp2, xtmp3, first_crc, %%DIR, \ + %%CIPH, ctr_check + sub num_bytes, 16 + sub bytes_to_crc, 16 + + align 16 +%%_main_loop: + cmp bytes_to_crc, 16 + jb %%_exit_loop + DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \ + xcrc, xcrckey, xtmp1, xtmp2, xtmp3, next_crc, %%DIR, \ + %%CIPH, ctr_check + sub num_bytes, 16 + sub bytes_to_crc, 16 +%ifidn %%DIR, ENC + jz %%_128_done +%endif + jmp %%_main_loop + +%%_exit_loop: + +%ifidn %%DIR, DEC + ;; decrypt rest of the message including CRC and optional padding + mov tmp, num_bytes + + CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \ + xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3 + + mov tmp, num_bytes ; correct in/out pointers - to point before cipher & BIP + and tmp, -16 ; partial block handler doesn't increment pointers + sub p_in, tmp + sub p_out, tmp + + or bytes_to_crc, bytes_to_crc + jz %%_128_done +%endif ; DECRYPTION + + ;; Partial bytes left - complete CRC calculation +%%_crc_two_xmms: + lea tmp, [rel pshufb_shf_table] + vmovdqu xtmp2, [tmp + bytes_to_crc] + ;; @note: in case of in-place operation (default) this load is + ;; creating store-to-load problem. + ;; However, there is no easy way to address it at the moment. 
+%ifidn %%DIR, ENC + vmovdqu xtmp1, [p_in - 16 + bytes_to_crc] ; xtmp1 = data for CRC +%else + vmovdqu xtmp1, [p_out - 16 + bytes_to_crc] ; xtmp1 = data for CRC +%endif + vmovdqa xtmp3, xcrc + vpshufb xcrc, xtmp2 ; top num_bytes with LSB xcrc + vpxor xtmp2, [rel mask3] + vpshufb xtmp3, xtmp2 ; bottom (16 - num_bytes) with MSB xcrc + + ;; data bytes_to_crc (top) blended with MSB bytes of CRC (bottom) + vpblendvb xtmp3, xtmp1, xtmp2 + + ;; final CRC calculation + vpclmulqdq xtmp1, xcrc, xcrckey, 0x01 + vpclmulqdq xcrc, xcrc, xcrckey, 0x10 + vpxor xcrc, xtmp3 + vpxor xcrc, xtmp1 + +%%_128_done: + CRC32_REDUCE_128_TO_32 ethernet_fcs, xcrc, xtmp1, xtmp2, xcrckey + +%%_crc_done: + ;; @todo - store-to-load problem in ENC case (to be fixed later) + ;; - store CRC in input buffer and authentication tag output + ;; - encrypt remaining bytes +%ifidn %%DIR, ENC + or DWORD(write_back_crc), DWORD(write_back_crc) + jz %%_skip_crc_write_back + mov [p_in + bytes_to_crc], DWORD(ethernet_fcs) +%%_skip_crc_write_back: +%endif + mov tmp, [job + _auth_tag_output] + mov [tmp + 4], DWORD(ethernet_fcs) + + or num_bytes, num_bytes + jz %%_do_not_cipher_the_rest + + ;; encrypt rest of the message + ;; - partial bytes including CRC and optional padding + ;; decrypt rest of the message + ;; - this may only happen when XGEM payload is short and padding is added +%ifidn %%DIR, DEC + or DWORD(decrypt_not_done), DWORD(decrypt_not_done) + jnz %%_do_not_cipher_the_rest +%endif + CIPHER_BIP_REST num_bytes, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \ + xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3 + +%%_do_not_cipher_the_rest: + + ;; finalize BIP + vpsrldq xtmp1, xbip, 4 + vpsrldq xtmp2, xbip, 8 + vpsrldq xtmp3, xbip, 12 + vpxor xtmp1, xtmp1, xtmp2 + vpxor xbip, xbip, xtmp3 + vpxor xbip, xbip, xtmp1 + vmovd [tmp], xbip ; tmp already holds _auth_tag_output + + ;; set job status + or dword [job + _status], STS_COMPLETED + + ;; return job + mov rax, job + +%ifndef LINUX + pop r15 +%endif + pop r14 
+ pop r13 + pop r12 +%endmacro ; AES128_CTR_PON + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; submit_job_pon_enc_avx(JOB_AES_HMAC *job) +align 64 +MKGLOBAL(submit_job_pon_enc_avx,function,internal) +submit_job_pon_enc_avx: + AES128_CTR_PON ENC, CTR + ret + +;;; submit_job_pon_dec_avx(JOB_AES_HMAC *job) +align 64 +MKGLOBAL(submit_job_pon_dec_avx,function,internal) +submit_job_pon_dec_avx: + AES128_CTR_PON DEC, CTR + ret + +;;; submit_job_pon_enc_no_ctr_avx(JOB_AES_HMAC *job) +align 64 +MKGLOBAL(submit_job_pon_enc_no_ctr_avx,function,internal) +submit_job_pon_enc_no_ctr_avx: + AES128_CTR_PON ENC, NO_CTR + ret + +;;; submit_job_pon_dec_no_ctr_avx(JOB_AES_HMAC *job) +align 64 +MKGLOBAL(submit_job_pon_dec_no_ctr_avx,function,internal) +submit_job_pon_dec_no_ctr_avx: + AES128_CTR_PON DEC, NO_CTR + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm new file mode 100644 index 000000000..b850a227b --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm @@ -0,0 +1,434 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +section .data +default rel + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +section .text + +;; code to compute quad SHA1 using AVX +;; derived from ...\sha1_multiple\sha1_quad4.asm +;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact +;; rbx, rsi, rdi, rbp, r12-r15 left intact +;; This version is not safe to call from C/C++ + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi 
rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t1} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro +;; +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF, %%regC,%%regD + vpand %%regF, %%regF,%%regB + vpxor %%regF, %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF,%%regD,%%regC + vpxor %%regF,%%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpor %%regF,%%regB,%%regC + vpand %%regT,%%regB,%%regC + vpand %%regF,%%regF,%%regD + vpor %%regF,%%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define
%%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-(%%imm)) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PROLD_nd reg, imm, tmp, src +%macro PROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsrld %%tmp, %%src, (32-(%%imm)) + vpslld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + vpaddd %%regE, %%regE,[rsp + (%%memW * 16)] + PROLD_nd %%regT,5, %%regF,%%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE, %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16] + vpxor W16, W16, W14 + vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16] + vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16] + + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; FRAMESZ must be an odd multiple of 8 +%define FRAMESZ 16*16 + 8 + +%define VMOVPS vmovdqu + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define IDX rax + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + +align 32 + +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +; void sha1_mult_avx(SHA1_ARGS *args, UINT32 size_in_blocks); +; arg 1 : rcx (Windows) / rdi (Linux) : pointer to args +; arg 2 : rdx (Windows) / rsi (Linux) : size (in blocks) ;; assumed to be >= 1 +MKGLOBAL(sha1_mult_avx,function,internal) +sha1_mult_avx: + + sub rsp, FRAMESZ + + ;; Initialize digests + vmovdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE] + vmovdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE] + vmovdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE] + vmovdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE] + vmovdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE] + + ;; transpose input onto stack + mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ] + mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ] + mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ] + mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ] + + xor IDX, IDX +lloop: + vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK] +%assign I 0 +%rep 4 + VMOVPS T2,[inp0+IDX] + VMOVPS T1,[inp1+IDX] + VMOVPS T4,[inp2+IDX] + VMOVPS T3,[inp3+IDX] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vpshufb T0, T0, F + vmovdqa [rsp+(I*4+0)*16],T0 + vpshufb T1, T1, F + vmovdqa [rsp+(I*4+1)*16],T1 + vpshufb T2, T2, F + vmovdqa [rsp+(I*4+2)*16],T2 + vpshufb T3, T3, F + vmovdqa [rsp+(I*4+3)*16],T3 + add IDX, 4*4 +%assign I (I+1) +%endrep + + ; save old digests + vmovdqa AA, A + vmovdqa BB, B + vmovdqa CC, C + vmovdqa DD, D + vmovdqa EE, E + +;; +;; perform 0-79 steps +;; + vmovdqa K, [rel K00_19] +;; do rounds 0...15 +%assign I 0 +%rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + vmovdqa W16, [rsp + ((16 - 16) & 15) * 16] + vmovdqa W15, [rsp + ((16 - 15) & 15) * 16] +%rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 20...39 + vmovdqa K, [rel K20_39] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 40...59 + vmovdqa K, [rel K40_59] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS
+%assign I (I+1) +%endrep + +;; do rounds 60...79 + vmovdqa K, [rel K60_79] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS +%assign I (I+1) +%endrep + + vpaddd A,A,AA + vpaddd B,B,BB + vpaddd C,C,CC + vpaddd D,D,DD + vpaddd E,E,EE + + sub arg2, 1 + jne lloop + + ; write out digests + vmovdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A + vmovdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B + vmovdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C + vmovdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D + vmovdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E + + ; update input pointers + add inp0, IDX + mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1 + add inp2, IDX + mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2 + add inp3, IDX + mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + ;; Clear all stack containing part of message +%ifdef SAFE_DATA + vpxor xmm0, xmm0 +%assign i 0 +%rep 16 + vmovdqa [rsp + i*16], xmm0 +%assign i (i+1) +%endrep +%endif + + add rsp, FRAMESZ + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm new file mode 100644 index 000000000..090285e54 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm @@ -0,0 +1,501 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; SHA1 code, hybrid, rolled, interleaved +; Uses AVX instructions +%include "include/os.asm" + +section .data +default rel +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +section .text + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +%ifdef LINUX +%define INP rdi ; 1st arg +%define CTX rsi ; 2nd arg +%define REG3 edx +%define REG4 ecx +%else +%define INP rcx ; 1st arg +%define CTX rdx ; 2nd arg +%define REG3 edi +%define REG4 esi +%endif + +%define FRAMESZ 3*16 + 1*8 +%define _RSP FRAMESZ-1*8 + rsp + +%define a eax +%define b ebx 
+%define c REG3 +%define d REG4 +%define e r8d +%define T1 r9d +%define f r10d +%define RND r11d +%define g r12d +%define h r13d + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XK xmm2 + +%xdefine X0 xmm3 +%xdefine X1 xmm4 +%xdefine X2 xmm5 +%xdefine X3 xmm6 +%xdefine X4 xmm7 + +%define XFER xmm8 + +%define SZ 4 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X4 +%xdefine X4 X_ +%endmacro + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + + +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regC + xor %%regF,%%regD + and %%regF,%%regB + xor %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regD + xor %%regF,%%regC + xor %%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regB + mov %%regT,%%regB + or %%regF,%%regC + and %%regT,%%regC + and %%regF,%%regD + or %%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +;; input is T1 +%macro ROUND 1 +%define %%MAGIC %1 + add e,T1 + mov T1,a + rol T1,5 + add e,T1 + %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + rol b,30 + add h,e +ROTATE_ARGS +%endmacro + +%macro do_4i 1 + vpaddd XFER, XK, X0 + vpextrd T1, XFER, 0 + ;ROUND %1 + 
add e,T1 + ;SCHEDULE_4 + vpalignr XTMP0, X1, X0, 8 ; XTMP0 = W[-14] + mov T1,a + rol T1,5 + vpxor XTMP1, X2, X0 ; XTMP1 = W[-8] ^ W[-16] + add e,T1 + vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16] + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + + ;; Finish low half + rol b,30 + vpsrldq X4, X3, 4 ; X4 = W[-3] {xxBA} + add h,e +ROTATE_ARGS + vpextrd T1, XFER, 1 + ;ROUND %1 + add e,T1 + vpxor X4, X4, XTMP0 + mov T1,a + rol T1,5 + ;; rotate X4 left 1 + vpsrld XTMP1, X4, (32-1) + add e,T1 + vpslld X4, X4, 1 + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + vpxor X4, X4, XTMP1 ; X4 = W[0] {xxBA} + rol b,30 + add h,e +ROTATE_ARGS + vpextrd T1, XFER, 2 + ;ROUND %1 + add e,T1 + mov T1,a + + ;; Finish high half + vpalignr XTMP1, X4, X3, 4 ; XTMP1 = w[-3] {DCxx} + rol T1,5 + add e,T1 + vpxor XTMP0, XTMP0, XTMP1 + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + ;; rotate XTMP0 left 1 + vpsrld XTMP1, XTMP0, (32-1) + rol b,30 + add h,e +ROTATE_ARGS + vpextrd T1, XFER, 3 + ;ROUND %1 + add e,T1 + mov T1,a + vpslld XTMP0, XTMP0, 1 + rol T1,5 + add e,T1 + vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx} + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + ;; COMBINE HALVES + vshufps X4, X4, XTMP0, 11100100b ; X4 = X[0] {DCBA} + rol b,30 + add h,e + + rotate_Xs +ROTATE_ARGS +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha1_block_avx(void *input_data, UINT32 digest[5]) +;; arg 1 : (in) pointer to input data +;; arg 2 : (in/out) pointer to read/write digest +MKGLOBAL(sha1_block_avx,function,internal) +align 32 +sha1_block_avx: + push rbx + push rsi + push rdi + push r12 + push r13 + + vmovdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK] + +%ifndef LINUX + mov rax,rsp ; copy rsp + sub rsp,FRAMESZ + and rsp,-16 ; align stack frame + mov [_RSP],rax ; save copy of rsp + vmovdqa [rsp + 0 * 16], xmm6 + vmovdqa [rsp + 1 * 16], xmm7 + vmovdqa [rsp + 2 * 16], xmm8 +%endif + + VMOVDQ X0, [INP + 
0*16] + VMOVDQ X1, [INP + 1*16] + + ;; load next message block + VMOVDQ X2, [INP + 2*16] + VMOVDQ X3, [INP + 3*16] + + ;; set up a-f based on h0-h4 + ;; byte swap first 16 dwords + mov a, [SZ*0 + CTX] + vpshufb X0, XTMP0 + mov b, [SZ*1 + CTX] + vpshufb X1, XTMP0 + mov c, [SZ*2 + CTX] + vpshufb X2, XTMP0 + mov d, [SZ*3 + CTX] + vpshufb X3, XTMP0 + mov e, [SZ*4 + CTX] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 00-19 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqa XK, [rel K00_19] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop1_5 +align 16 +loop1: + + do_4i MAGIC_F0 + +loop1_5: + do_4i MAGIC_F0 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + vmovdqa X0, X2 + vmovdqa X2, X4 + vmovdqa X4, X1 + vmovdqa X1, X3 + + sub RND, 1 + jne loop1 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 00-19 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 20-39 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqa XK, [rel K20_39] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop2_5 +align 16 +loop2: + + do_4i MAGIC_F1 + +loop2_5: + do_4i MAGIC_F1 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + vmovdqa X0, X2 + vmovdqa X2, X4 + vmovdqa X4, X1 + vmovdqa X1, X3 + + sub RND, 1 + jne loop2 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 20-39 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 40-59 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + 
vmovdqa XK, [rel K40_59] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop3_5 +align 16 +loop3: + + do_4i MAGIC_F2 + +loop3_5: + do_4i MAGIC_F2 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + vmovdqa X0, X2 + vmovdqa X2, X4 + vmovdqa X4, X1 + vmovdqa X1, X3 + + sub RND, 1 + jne loop3 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 40-59 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 60-79 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqa XK, [rel K60_79] + + do_4i MAGIC_F3 + + vpaddd XFER, XK, X0 + vpextrd T1, XFER, 0 + ROUND MAGIC_F3 + vpextrd T1, XFER, 1 + ROUND MAGIC_F3 + vpextrd T1, XFER, 2 + ROUND MAGIC_F3 + vpextrd T1, XFER, 3 + ROUND MAGIC_F3 + + vpaddd XFER, XK, X1 + vpextrd T1, XFER, 0 + ROUND MAGIC_F3 + vpextrd T1, XFER, 1 + ROUND MAGIC_F3 + vpextrd T1, XFER, 2 + ROUND MAGIC_F3 + vpextrd T1, XFER, 3 + ROUND MAGIC_F3 + + vpaddd XFER, XK, X2 + vpextrd T1, XFER, 0 + ROUND MAGIC_F3 + vpextrd T1, XFER, 1 + ROUND MAGIC_F3 + vpextrd T1, XFER, 2 + ROUND MAGIC_F3 + vpextrd T1, XFER, 3 + ROUND MAGIC_F3 + + vpaddd XFER, XK, X3 + vpextrd T1, XFER, 0 + ROUND MAGIC_F3 + vpextrd T1, XFER, 1 + ROUND MAGIC_F3 + vpextrd T1, XFER, 2 + ROUND MAGIC_F3 + vpextrd T1, XFER, 3 + ROUND MAGIC_F3 + + ;; update result digest h0-h4 + add [SZ*0 + CTX], a + add [SZ*1 + CTX], b + add [SZ*2 + CTX], c + add [SZ*3 + CTX], d + add [SZ*4 + CTX], e + +%ifndef LINUX + vmovdqa xmm8, [rsp + 2 * 16] + vmovdqa xmm7, [rsp + 1 * 16] + vmovdqa xmm6, [rsp + 0 * 16] + +%ifdef SAFE_DATA + ;; Clear potential sensitive data stored in stack + vpxor xmm0, xmm0 + vmovdqa [rsp + 0 * 16], xmm0 + vmovdqa [rsp + 1 * 16], xmm0 + vmovdqa [rsp + 2 * 16], xmm0 +%endif + + mov rsp,[_RSP] +%endif ;; LINUX + + pop r13 + pop 
r12 + pop rdi + pop rsi + pop rbx + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm new file mode 100644 index 000000000..57d997dd3 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm @@ -0,0 +1,33 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define FUNC sha224_block_avx + +%include "avx/sha256_one_block_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm new file mode 100644 index 000000000..9c96f036b --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm @@ -0,0 +1,553 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%include "include/os.asm" + +section .data +default rel +align 64 +K256: + dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +; shuffle xBxA -> 00BA +_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 + dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF + +; shuffle xDxC -> DC00 +_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF + dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 + +section .text + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +%macro MY_ROR 2 + shld %1,%1,(32-(%2)) +%endm + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + VMOVDQ %1, %2 + vpshufb %1, %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XTMP4 xmm8 +%define XFER xmm9 +%define XTMP5 xmm11 + +%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA +%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00 +%define BYTE_FLIP_MASK xmm13 + +%ifdef LINUX +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c ecx +%define d r8d +%define e edx +%else +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c edi +%define d esi +%define e r8d + +%endif +%define TBL rbp +%define a eax +%define b ebx + +%define f r9d +%define g r10d +%define h r11d + +%define y0 r13d +%define y1 r14d +%define y2 r15d + + +struc STACK +%ifndef LINUX +_XMM_SAVE: reso 7 +%endif +_XFER: reso 1 +endstruc + +%ifndef FUNC +%define FUNC sha256_block_avx +%endif + +; rotate_Xs +; Rotate values of symbols X0...X3 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro FOUR_ROUNDS_AND_SCHED 0 + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + ;vmovdqa XTMP0, X3 + mov y0, e ; y0 = e + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7] + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + MY_ROR y0, (11-6) 
; y0 = (e >> (11-6)) ^ (e >> (25-6)) + ;vmovdqa XTMP1, X1 + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + ;; compute s0 + vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH + + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + + vpsrld XTMP2, XTMP1, 7 + + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + + vpslld XTMP3, XTMP1, (32-7) + + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + + vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 + + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + + mov y0, e ; y0 = e + mov y1, a ; y1 = a + + + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + + vpsrld XTMP2, XTMP1,18 + + xor y1, a ; y1 = a ^ (a >> (22-13) + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + + vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3 + + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + + vpslld XTMP1, XTMP1, (32-18) + + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + + vpxor XTMP3, XTMP3, XTMP1 + + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + + vpxor XTMP3, 
XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 + + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + + vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0 + + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + ;; compute low s1 + vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} + + mov y0, e ; y0 = e + mov y1, a ; y1 = a + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + + ;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} + + xor y0, e ; y0 = e ^ (e >> (25-11)) + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + + vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA} + + xor y2, g ; y2 = f^g + + vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA} + + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + + vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA} + + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpxor XTMP2, XTMP2, XTMP3 + add y2, y0 ; y2 = S1 + CH + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH + vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + 
w + S0 + ;; compute high s1 + vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} + mov y0, e ; y0 = e + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + ;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC} + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + + vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC} + + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + + vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC} + + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + + vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC} + + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + + vpxor XTMP2, XTMP2, XTMP3 + + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH + vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 4] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (25-11)) + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a 
>> (22-13) + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR y0, 6 ; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = ((a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void FUNC(void *input_data, UINT32 digest[8]) +;; arg 1 : pointer to input data (four 16-byte loads, i.e. 64 bytes, are read from it) +;; arg 2 : pointer to digest (eight 32-bit words, loaded here and added back in place at the end) +;; NOTE: no block count is read and there is no loop over blocks; each call handles one 64-byte input +section .text +MKGLOBAL(FUNC,function,internal) +align 32 +FUNC: + push rbx +%ifndef LINUX + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_size +%ifndef LINUX + ;; non-LINUX builds: spill xmm6-xmm13 (8 registers, slots 0..7 of _XMM_SAVE); restored at done_hash + vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 + vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 + vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 + vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 + vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 + vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 + vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 + vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 +%endif + + ;; load initial digest + mov a, [4*0 + CTX] + mov b, [4*1 + CTX] + mov c, [4*2 + CTX] + mov d, [4*3 + CTX] + mov e, [4*4 + CTX] + mov f, [4*5 + CTX] + mov g, [4*6 + CTX] + mov h, [4*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + vmovdqa SHUF_00BA, [rel _SHUF_00BA] + vmovdqa SHUF_DC00, [rel 
_SHUF_DC00] + + lea TBL,[rel K256] + + ;; byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + + ;; schedule 48 input dwords, by doing 3 rounds of 16 each + mov SRND, 3 +align 16 +loop1: + vpaddd XFER, X0, [TBL + 0*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 1*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 2*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 3*16] + vmovdqa [rsp + _XFER], XFER + add TBL, 4*16 + FOUR_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 +loop2: + vpaddd XFER, X0, [TBL + 0*16] + vmovdqa [rsp + _XFER], XFER + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vpaddd XFER, X1, [TBL + 1*16] + vmovdqa [rsp + _XFER], XFER + add TBL, 2*16 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vmovdqa X0, X2 + vmovdqa X1, X3 + + sub SRND, 1 + jne loop2 + + add [4*0 + CTX], a + add [4*1 + CTX], b + add [4*2 + CTX], c + add [4*3 + CTX], d + add [4*4 + CTX], e + add [4*5 + CTX], f + add [4*6 + CTX], g + add [4*7 + CTX], h + +done_hash: +%ifndef LINUX + vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] + vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] + vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] + vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] + vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] + vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] + vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] + vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] +%ifdef SAFE_DATA + ;; Clear potential sensitive data stored in stack + vpxor xmm0, xmm0 + vmovdqa [rsp + _XMM_SAVE + 0 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 1 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 2 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 3 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 4 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 5 * 16], xmm0 + vmovdqa [rsp + 
_XMM_SAVE + 6 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 7 * 16], xmm0 +%endif +%endif ;; LINUX + + add rsp, STACK_size + + pop r15 + pop r14 + pop r13 + pop rbp +%ifndef LINUX + pop rdi + pop rsi +%endif + pop rbx + + ret + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm new file mode 100644 index 000000000..dddc5df28 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm @@ -0,0 +1,33 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define FUNC sha384_block_avx + +%include "avx/sha512_one_block_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm new file mode 100644 index 000000000..040518e76 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm @@ -0,0 +1,473 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%include "include/os.asm" + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +%ifndef FUNC +%define FUNC sha512_block_avx +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +%macro MY_ROR 2 +shld %1,%1,(64-(%2)) +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + VMOVDQ %1, %2 + vpshufb %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 +%define X4 xmm8 +%define X5 xmm9 +%define X6 xmm10 +%define X7 xmm11 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XFER xmm13 + +%define BYTE_FLIP_MASK xmm12 + +%ifdef LINUX +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c rcx +%define d r8 +%define e rdx +%else +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c rdi +%define d rsi +%define e r8 + +%endif 
+%define TBL rbp +%define a rax +%define b rbx + +%define f r9 +%define g r10 +%define h r11 + +%define y0 r13 +%define y1 r14 +%define y2 r15 + +struc STACK +%ifndef LINUX +_XMM_SAVE: reso 8 +%endif +_XFER: reso 1 +endstruc + + +; rotate_Xs +; Rotate values of symbols X0...X7 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X4 +%xdefine X4 X5 +%xdefine X5 X6 +%xdefine X6 X7 +%xdefine X7 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro TWO_ROUNDS_AND_SCHED 0 + + vpalignr XTMP0, X5, X4, 8 ; XTMP0 = W[-7] + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + mov y0, e ; y0 = e + mov y1, a ; y1 = a + MY_ROR y0, (41-18) ; y0 = e >> (41-18) + vpaddq XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + xor y0, e ; y0 = e ^ (e >> (41-18)) + mov y2, f ; y2 = f + MY_ROR y1, (39-34) ; y1 = a >> (39-34) + ;; compute s0 + vpalignr XTMP1, X1, X0, 8 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (39-34) + MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + vpsllq XTMP2, XTMP1, (64-1) + xor y2, g ; y2 = f^g + MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + vpsrlq XTMP3, XTMP1, 1 + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) + and y2, e ; y2 = (f^g)&e + MY_ROR y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + vpor XTMP2, XTMP2, XTMP3 ; XTMP2 = W[-15] ror 1 + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + vpsrlq XTMP3, XTMP1, 8 + add y2, [rsp + _XFER + 0*8] ; y2 = k + w + S1 + CH + MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + mov y0, a ; y0 = a + vpsllq X0, XTMP1, (64-8) + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + vpor X0, X0, XTMP3 + add d, h ; d = d + t1 + and y2, c ; y2 = a&c 
+ and y0, b ; y0 = (a|c)&b + vpsrlq XTMP1, XTMP1, 7 ; X0 = W[-15] >> 7 + add h, y1 ; h = t1 + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + vpxor XTMP1, XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 ^ W[-15] ror 8 + add h, y0 ; h = t1 + S0 + MAJ + vpxor XTMP1, XTMP1, X0 ; XTMP1 = s0 + + +ROTATE_ARGS + ;; compute s1 + vpaddq XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + mov y0, e ; y0 = e + mov y1, a ; y1 = a + MY_ROR y0, (41-18) ; y0 = e >> (41-18) + vpsllq XTMP3, X7, (64-19) + xor y0, e ; y0 = e ^ (e >> (41-18)) + mov y2, f ; y2 = f + MY_ROR y1, (39-34) ; y1 = a >> (39-34) + vpsrlq X0, X7, 19 + xor y1, a ; y1 = a ^ (a >> (39-34) + MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + vpor XTMP3, XTMP3, X0 ; XTMP3 = W[-2] ror 19 + xor y2, g ; y2 = f^g + MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + vpsllq XTMP2, X7, (64-61) + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) + and y2, e ; y2 = (f^g)&e + MY_ROR y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + vpsrlq XTMP1, X7, 61 + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + vpor XTMP2, XTMP2, XTMP1 ; XTMP2 = W[-2] ror 61 + add y2, [rsp + _XFER + 1*8] ; y2 = k + w + S1 + CH + MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + mov y0, a ; y0 = a + vpsrlq XTMP1, X7, 6 ; XTMP1 = W[-2] >> 6 + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + vpxor XTMP1, XTMP1, XTMP2 + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + vpxor X0, XTMP3, XTMP1 ; X0 = s1 + add h, y1 ; h = t1 + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = t1 + S0 + MAJ + vpaddq X0, X0, XTMP0 ; X0 = {W[1], W[0]} + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 8] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + MY_ROR y0, (41-18) ; y0 = e >> (41-18) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (41-18)) + MY_ROR y1, (39-34) ; y1 = a >> (39-34) + mov y2, f ; y2 = f + xor y1, a ; 
y1 = a ^ (a >> (39-34) + MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (25-6)) + MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + MY_ROR y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + add y2, [rsp + _XFER + %1*8] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = t1 + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = t1 + S0 + MAJ + ROTATE_ARGS +%endm + +section .data +default rel +align 64 +K512: + dq 0x428a2f98d728ae22,0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + dq 0x3956c25bf348b538,0x59f111f1b605d019 + dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242,0x12835b0145706fbe + dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235,0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + dq 0x983e5152ee66dfab,0xa831c66d2db43210 + dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 + dq 0x06ca6351e003826f,0x142929670a0e6e70 + dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + dq 0x650a73548baf63de,0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6,0x92722c851482353b + dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 + dq 0xc24b8b70d0f89791,0xc76c51a30654be30 + dq 0xd192e819d6ef5218,0xd69906245565a910 + dq 0xf40e35855771202a,0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + dq 
0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc,0x78a5636f43172f60 + dq 0x84c87814a1f0ab72,0x8cc702081a6439ec + dq 0x90befffa23631e28,0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915,0xc67178f2e372532b + dq 0xca273eceea26619c,0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae,0x1b710b35131c471b + dq 0x28db77f523047d84,0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void FUNC(void *input_data, UINT64 digest[8]) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +section .text +MKGLOBAL(FUNC,function,internal) +align 32 +FUNC: + push rbx +%ifndef LINUX + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_size +%ifndef LINUX + vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 + vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 + vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 + vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 + vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 + vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 + vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 + vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 +%endif + + ;; load initial digest + mov a, [8*0 + CTX] + mov b, [8*1 + CTX] + mov c, [8*2 + CTX] + mov d, [8*3 + CTX] + mov e, [8*4 + CTX] + mov f, [8*5 + CTX] + mov g, [8*6 + CTX] + mov h, [8*7 + CTX] + + vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + + lea TBL,[rel K512] + + ;; byte swap first 16 qwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + 
COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X4, [INP + 4*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X5, [INP + 5*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X6, [INP + 6*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X7, [INP + 7*16], BYTE_FLIP_MASK + + ;; schedule 64 input qwords, by doing 4 iterations of 16 rounds + mov SRND, 4 +align 16 +loop1: + +%assign i 0 +%rep 7 + vpaddq XFER, X0, [TBL + i*16] + vmovdqa [rsp + _XFER], XFER + TWO_ROUNDS_AND_SCHED +%assign i (i+1) +%endrep + + vpaddq XFER, X0, [TBL + 7*16] + vmovdqa [rsp + _XFER], XFER + add TBL, 8*16 + TWO_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 + jmp loop2a +loop2: + vmovdqa X0, X4 + vmovdqa X1, X5 + vmovdqa X2, X6 + vmovdqa X3, X7 + +loop2a: + vpaddq X0, X0, [TBL + 0*16] + vmovdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + + vpaddq X1, X1, [TBL + 1*16] + vmovdqa [rsp + _XFER], X1 + DO_ROUND 0 + DO_ROUND 1 + + vpaddq X2, X2, [TBL + 2*16] + vmovdqa [rsp + _XFER], X2 + DO_ROUND 0 + DO_ROUND 1 + + vpaddq X3, X3, [TBL + 3*16] + vmovdqa [rsp + _XFER], X3 + add TBL, 4*16 + DO_ROUND 0 + DO_ROUND 1 + + sub SRND, 1 + jne loop2 + + add [8*0 + CTX], a + add [8*1 + CTX], b + add [8*2 + CTX], c + add [8*3 + CTX], d + add [8*4 + CTX], e + add [8*5 + CTX], f + add [8*6 + CTX], g + add [8*7 + CTX], h + +done_hash: +%ifndef LINUX + vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] + vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] + vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] + vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] + vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] + vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] + vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] + vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] +%ifdef SAFE_DATA + ;; Clear potential sensitive data stored in stack + vpxor xmm0, xmm0 + vmovdqa [rsp + _XMM_SAVE + 0 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 1 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 2 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 3 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 4 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 5 * 16], 
xmm0 + vmovdqa [rsp + _XMM_SAVE + 6 * 16], xmm0 + vmovdqa [rsp + _XMM_SAVE + 7 * 16], xmm0 +%endif +%endif ;; LINUX + + add rsp, STACK_size + + pop r15 + pop r14 + pop r13 + pop rbp +%ifndef LINUX + pop rdi + pop rsi +%endif + pop rbx + + ret + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm new file mode 100644 index 000000000..d7d712e2c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm @@ -0,0 +1,381 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; code to compute SHA512 by-2 using AVX +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, r8-r11 and arg2 (rdx on Windows, rsi on Linux); xmm0-15 +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" +extern K512_2 + +section .data +default rel + +align 32 +; one from sha512_rorx +; this does the big endian to little endian conversion +; over a quad word +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + ;ddq 0x18191a1b1c1d1e1f1011121314151617 + dq 0x1011121314151617, 0x18191a1b1c1d1e1f + +section .text + +%ifdef LINUX ; Linux definitions +%define arg1 rdi +%define arg2 rsi +%else ; Windows definitions +%define arg1 rcx +%define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND r8 +%define TBL r11 + +%define inp0 r9 +%define inp1 r10 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define 
TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + + + +%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 80*SZ2 + +; Define stack usage + +struc STACK +_DATA: resb SZ2 * 16 +_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define VMOVPD vmovupd + +; transpose r0, r1, t0 +; Input looks like {r0 r1} +; r0 = {a1 a0} +; r1 = {b1 b0} +; +; output looks like +; r0 = {b0, a0} +; t0 = {b1, a1} + +%macro TRANSPOSE 3 +%define %%r0 %1 +%define %%r1 %2 +%define %%t0 %3 + vshufpd %%t0, %%r0, %%r1, 11b ; t0 = b1 a1 + vshufpd %%r0, %%r0, %%r1, 00b ; r0 = b0 a0 +%endm + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORQ reg, imm, tmp +; packed-rotate-right-double +; does a rotate by doing two shifts and an or +%macro PRORQ 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsllq %%tmp, %%reg, (64-(%%imm)) + vpsrlq %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORQ_nd reg, imm, tmp, src +%macro PRORQ_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsllq %%tmp, %%src, (64-(%%imm)) + vpsrlq %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORQ dst/src, amt +%macro PRORQ 2 + PRORQ %1, %2, TMP +%endmacro + +; PRORQ_nd dst, src, amt +%macro PRORQ_nd 3 + PRORQ_nd %1, %3, TMP, %2 +%endmacro + + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41) + vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1 + vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e 
; sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) + vpaddq h, h, a2 ; h = h + ch + PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6) + vpaddq h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + vmovdqa %%T1, a ; maj: T1 = a + PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39) + vpxor %%T1, %%T1, c ; maj: T1 = a^c + add ROUND, SZ2 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddq h, h, a0 + + vpaddq d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddq h, h, a1 ; h = h + ch + W + K + maj + vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0 + ROTATE_ARGS + +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA] + vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA] + vmovdqa a0, %%T1 + PRORQ %%T1, 8-1 + vmovdqa a2, a1 + PRORQ a1, 61-19 + vpxor %%T1, %%T1, a0 + PRORQ %%T1, 1 + vpxor a1, a1, a2 + PRORQ a1, 19 + vpsrlq a0, a0, 7 + vpxor %%T1, %%T1, a0 + vpsrlq a2, a2, 6 + vpxor a1, a1, a2 + vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA] + vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA] + vpaddq %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i + +%endm + + + +;; SHA512_ARGS: +;; UINT128 digest[8]; // transposed digests +;; UINT8 *data_ptr[2]; +;; + +;; void sha512_x2_avx(SHA512_ARGS *args, UINT64 msg_size_in_blocks) +;; arg 1 : STATE : pointer args +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +MKGLOBAL(sha512_x2_avx,function,internal) +align 32 +sha512_x2_avx: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. 
+ vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE] + vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE] + vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE] + vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE] + vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE] + vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE] + vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE] + vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE] + + lea TBL,[rel K512_2] + + ;; load the address of each of the 2 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ] + + xor IDX, IDX +lloop: + + xor ROUND, ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ2], a + vmovdqa [rsp + _DIGEST + 1*SZ2], b + vmovdqa [rsp + _DIGEST + 2*SZ2], c + vmovdqa [rsp + _DIGEST + 3*SZ2], d + vmovdqa [rsp + _DIGEST + 4*SZ2], e + vmovdqa [rsp + _DIGEST + 5*SZ2], f + vmovdqa [rsp + _DIGEST + 6*SZ2], g + vmovdqa [rsp + _DIGEST + 7*SZ2], h + +%assign i 0 +%rep 8 + ;; load up the shuffler for little-endian to big-endian format + vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] + VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits + VMOVPD TT2,[inp1+IDX+i*16] + + TRANSPOSE TT0, TT2, TT1 + vpshufb TT0, TT0, TMP + vpshufb TT1, TT1, TMP + + ROUND_00_15 TT0,(i*2+0) + ROUND_00_15 TT1,(i*2+1) +%assign i (i+1) +%endrep + +;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes) + add IDX, 8 * 16 + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddq a, a, [rsp + _DIGEST + 0*SZ2] + vpaddq b, b, [rsp + _DIGEST + 1*SZ2] + vpaddq c, c, [rsp + _DIGEST + 2*SZ2] + vpaddq d, d, [rsp + _DIGEST + 3*SZ2] + vpaddq e, e, [rsp + _DIGEST + 4*SZ2] + vpaddq f, f, [rsp + _DIGEST + 5*SZ2] + vpaddq g, g, [rsp + _DIGEST + 6*SZ2] + vpaddq h, h, [rsp + _DIGEST + 7*SZ2] + + sub INP_SIZE, 1 ;; consumed one message 
block + jne lloop + + ; write back to memory (state object) the transposed digest + vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a + vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b + vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c + vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d + vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e + vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f + vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g + vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + ;; Clear stack frame ((16 + 8)*16 bytes) +%ifdef SAFE_DATA + vpxor xmm0, xmm0 +%assign i 0 +%rep (16+NUM_SHA512_DIGEST_WORDS) + vmovdqa [rsp + i*SZ2], xmm0 +%assign i (i+1) +%endrep +%endif + + add rsp, STACK_size + + ; outer calling routine restores XMM and other GP registers + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm new file mode 100644 index 000000000..c1895a3f5 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm @@ -0,0 +1,391 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; code to compute quad SHA256 using AVX +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12 +;; Windows preserves: rcx rsi rdi rbp r13 r14 r15 +;; +;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12 +;; Linux preserves: rcx rdx rdi rbp r13 r14 r15 +;; +;; clobbers xmm0-15 + +%include "include/os.asm" +%include "mb_mgr_datastruct.asm" + +extern K256_4 + +%ifdef LINUX + %define arg1 rdi + %define arg2 rsi +%else + ; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND rbx +%define TBL r12 + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8
+%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + +%define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 64*SZ4 + +; Define stack usage +struc STACK +_DATA: resb SZ4 * 16 +_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define VMOVPS vmovups + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, (32-(%%imm)) + vpsrld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORD_nd reg, imm, tmp, src +%macro PRORD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + ;vmovdqa %%tmp, %%reg + vpslld %%tmp, %%src, (32-(%%imm)) + vpsrld %%reg, %%src, %%imm + vpor 
%%reg, %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +; PRORD_nd dst, src, amt +%macro PRORD_nd 3 + PRORD_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa [SZ4*(%%i&0xf) + rsp + _DATA], %%T1 + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA] + vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA] + vmovdqa a0, %%T1 + PRORD %%T1, 18-7 + vmovdqa a2, a1 + PRORD a1, 19-17 + vpxor %%T1, %%T1, a0 + PRORD %%T1, 7 + vpxor a1, a1, a2 + PRORD a1, 17 + vpsrld a0, a0, 3 + vpxor %%T1, %%T1, a0 + vpsrld a2, a2, 10 + vpxor a1, a1, a2 + vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA] + vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA] + vpaddd %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + +section .data +default rel +align 16 
+PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +;; SHA256_ARGS: +;; UINT128 digest[8]; // transposed digests +;; UINT8 *data_ptr[4]; +;; + +;; void sha_256_mult_avx(SHA256_ARGS *args, UINT64 num_blocks); +;; arg 1 : STATE : pointer args +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +MKGLOBAL(sha_256_mult_avx,function,internal) +align 16 +sha_256_mult_avx: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. + vmovdqa a,[STATE+0*SHA256_DIGEST_ROW_SIZE] + vmovdqa b,[STATE+1*SHA256_DIGEST_ROW_SIZE] + vmovdqa c,[STATE+2*SHA256_DIGEST_ROW_SIZE] + vmovdqa d,[STATE+3*SHA256_DIGEST_ROW_SIZE] + vmovdqa e,[STATE+4*SHA256_DIGEST_ROW_SIZE] + vmovdqa f,[STATE+5*SHA256_DIGEST_ROW_SIZE] + vmovdqa g,[STATE+6*SHA256_DIGEST_ROW_SIZE] + vmovdqa h,[STATE+7*SHA256_DIGEST_ROW_SIZE] + + lea TBL,[rel K256_4] + + ;; load the address of each of the 4 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ] + mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ] + mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ4], a + vmovdqa [rsp + _DIGEST + 1*SZ4], b + vmovdqa [rsp + _DIGEST + 2*SZ4], c + vmovdqa [rsp + _DIGEST + 3*SZ4], d + vmovdqa [rsp + _DIGEST + 4*SZ4], e + vmovdqa [rsp + _DIGEST + 5*SZ4], f + vmovdqa [rsp + _DIGEST + 6*SZ4], g + vmovdqa [rsp + _DIGEST + 7*SZ4], h + +%assign i 0 +%rep 4 + vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] + VMOVPS TT2,[inp0+IDX+i*16] + VMOVPS TT1,[inp1+IDX+i*16] + VMOVPS TT4,[inp2+IDX+i*16] + VMOVPS TT3,[inp3+IDX+i*16] + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + vpshufb TT0, TT0, TMP + vpshufb TT1, TT1, TMP + vpshufb TT2, TT2, TMP + vpshufb TT3, 
TT3, TMP + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) +%assign i (i+1) +%endrep + add IDX, 4*4*4 + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddd a, a, [rsp + _DIGEST + 0*SZ4] + vpaddd b, b, [rsp + _DIGEST + 1*SZ4] + vpaddd c, c, [rsp + _DIGEST + 2*SZ4] + vpaddd d, d, [rsp + _DIGEST + 3*SZ4] + vpaddd e, e, [rsp + _DIGEST + 4*SZ4] + vpaddd f, f, [rsp + _DIGEST + 5*SZ4] + vpaddd g, g, [rsp + _DIGEST + 6*SZ4] + vpaddd h, h, [rsp + _DIGEST + 7*SZ4] + + sub INP_SIZE, 1 ;; unit is blocks + jne lloop + + ; write back to memory (state object) the transposed digest + vmovdqa [STATE+0*SHA256_DIGEST_ROW_SIZE],a + vmovdqa [STATE+1*SHA256_DIGEST_ROW_SIZE],b + vmovdqa [STATE+2*SHA256_DIGEST_ROW_SIZE],c + vmovdqa [STATE+3*SHA256_DIGEST_ROW_SIZE],d + vmovdqa [STATE+4*SHA256_DIGEST_ROW_SIZE],e + vmovdqa [STATE+5*SHA256_DIGEST_ROW_SIZE],f + vmovdqa [STATE+6*SHA256_DIGEST_ROW_SIZE],g + vmovdqa [STATE+7*SHA256_DIGEST_ROW_SIZE],h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha256 + 0*8], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha256 + 1*8], inp1 + add inp2, IDX + mov [STATE + _data_ptr_sha256 + 2*8], inp2 + add inp3, IDX + mov [STATE + _data_ptr_sha256 + 3*8], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + +%ifdef SAFE_DATA + ;; Clear stack frame ((16 + 8)*16 bytes) + vpxor xmm0, xmm0 +%assign i 0 +%rep (16+NUM_SHA256_DIGEST_WORDS) + vmovdqa [rsp + i*SZ4], xmm0 +%assign i (i+1) +%endrep +%endif + + add rsp, STACK_size + ; outer calling routine restores XMM and other GP registers + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/snow3g_avx.c b/src/spdk/intel-ipsec-mb/avx/snow3g_avx.c new file mode 100644 index 000000000..8c6995fb8 --- /dev/null +++ 
b/src/spdk/intel-ipsec-mb/avx/snow3g_avx.c @@ -0,0 +1,42 @@ +/******************************************************************************* + Copyright (c) 2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define AVX +#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_avx +#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_avx +#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_avx +#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_avx +#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_avx +#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_avx +#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_avx +#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_avx +#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_avx +#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_avx +#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_avx +#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx + +#include "include/snow3g_common.h" diff --git a/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm b/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm new file mode 100755 index 000000000..e7c6bad8a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm @@ -0,0 +1,1146 @@ +;; +;; Copyright (c) 2009-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/reg_sizes.asm" + +extern lookup_8bit_avx + +section .data +default rel +align 64 +S0: +db 0x3e,0x72,0x5b,0x47,0xca,0xe0,0x00,0x33,0x04,0xd1,0x54,0x98,0x09,0xb9,0x6d,0xcb +db 0x7b,0x1b,0xf9,0x32,0xaf,0x9d,0x6a,0xa5,0xb8,0x2d,0xfc,0x1d,0x08,0x53,0x03,0x90 +db 0x4d,0x4e,0x84,0x99,0xe4,0xce,0xd9,0x91,0xdd,0xb6,0x85,0x48,0x8b,0x29,0x6e,0xac +db 0xcd,0xc1,0xf8,0x1e,0x73,0x43,0x69,0xc6,0xb5,0xbd,0xfd,0x39,0x63,0x20,0xd4,0x38 +db 0x76,0x7d,0xb2,0xa7,0xcf,0xed,0x57,0xc5,0xf3,0x2c,0xbb,0x14,0x21,0x06,0x55,0x9b +db 0xe3,0xef,0x5e,0x31,0x4f,0x7f,0x5a,0xa4,0x0d,0x82,0x51,0x49,0x5f,0xba,0x58,0x1c +db 0x4a,0x16,0xd5,0x17,0xa8,0x92,0x24,0x1f,0x8c,0xff,0xd8,0xae,0x2e,0x01,0xd3,0xad +db 0x3b,0x4b,0xda,0x46,0xeb,0xc9,0xde,0x9a,0x8f,0x87,0xd7,0x3a,0x80,0x6f,0x2f,0xc8 +db 0xb1,0xb4,0x37,0xf7,0x0a,0x22,0x13,0x28,0x7c,0xcc,0x3c,0x89,0xc7,0xc3,0x96,0x56 +db 0x07,0xbf,0x7e,0xf0,0x0b,0x2b,0x97,0x52,0x35,0x41,0x79,0x61,0xa6,0x4c,0x10,0xfe +db 0xbc,0x26,0x95,0x88,0x8a,0xb0,0xa3,0xfb,0xc0,0x18,0x94,0xf2,0xe1,0xe5,0xe9,0x5d +db 0xd0,0xdc,0x11,0x66,0x64,0x5c,0xec,0x59,0x42,0x75,0x12,0xf5,0x74,0x9c,0xaa,0x23 +db 
0x0e,0x86,0xab,0xbe,0x2a,0x02,0xe7,0x67,0xe6,0x44,0xa2,0x6c,0xc2,0x93,0x9f,0xf1 +db 0xf6,0xfa,0x36,0xd2,0x50,0x68,0x9e,0x62,0x71,0x15,0x3d,0xd6,0x40,0xc4,0xe2,0x0f +db 0x8e,0x83,0x77,0x6b,0x25,0x05,0x3f,0x0c,0x30,0xea,0x70,0xb7,0xa1,0xe8,0xa9,0x65 +db 0x8d,0x27,0x1a,0xdb,0x81,0xb3,0xa0,0xf4,0x45,0x7a,0x19,0xdf,0xee,0x78,0x34,0x60 + +S1: +db 0x55,0xc2,0x63,0x71,0x3b,0xc8,0x47,0x86,0x9f,0x3c,0xda,0x5b,0x29,0xaa,0xfd,0x77 +db 0x8c,0xc5,0x94,0x0c,0xa6,0x1a,0x13,0x00,0xe3,0xa8,0x16,0x72,0x40,0xf9,0xf8,0x42 +db 0x44,0x26,0x68,0x96,0x81,0xd9,0x45,0x3e,0x10,0x76,0xc6,0xa7,0x8b,0x39,0x43,0xe1 +db 0x3a,0xb5,0x56,0x2a,0xc0,0x6d,0xb3,0x05,0x22,0x66,0xbf,0xdc,0x0b,0xfa,0x62,0x48 +db 0xdd,0x20,0x11,0x06,0x36,0xc9,0xc1,0xcf,0xf6,0x27,0x52,0xbb,0x69,0xf5,0xd4,0x87 +db 0x7f,0x84,0x4c,0xd2,0x9c,0x57,0xa4,0xbc,0x4f,0x9a,0xdf,0xfe,0xd6,0x8d,0x7a,0xeb +db 0x2b,0x53,0xd8,0x5c,0xa1,0x14,0x17,0xfb,0x23,0xd5,0x7d,0x30,0x67,0x73,0x08,0x09 +db 0xee,0xb7,0x70,0x3f,0x61,0xb2,0x19,0x8e,0x4e,0xe5,0x4b,0x93,0x8f,0x5d,0xdb,0xa9 +db 0xad,0xf1,0xae,0x2e,0xcb,0x0d,0xfc,0xf4,0x2d,0x46,0x6e,0x1d,0x97,0xe8,0xd1,0xe9 +db 0x4d,0x37,0xa5,0x75,0x5e,0x83,0x9e,0xab,0x82,0x9d,0xb9,0x1c,0xe0,0xcd,0x49,0x89 +db 0x01,0xb6,0xbd,0x58,0x24,0xa2,0x5f,0x38,0x78,0x99,0x15,0x90,0x50,0xb8,0x95,0xe4 +db 0xd0,0x91,0xc7,0xce,0xed,0x0f,0xb4,0x6f,0xa0,0xcc,0xf0,0x02,0x4a,0x79,0xc3,0xde +db 0xa3,0xef,0xea,0x51,0xe6,0x6b,0x18,0xec,0x1b,0x2c,0x80,0xf7,0x74,0xe7,0xff,0x21 +db 0x5a,0x6a,0x54,0x1e,0x41,0x31,0x92,0x35,0xc4,0x33,0x07,0x0a,0xba,0x7e,0x0e,0x34 +db 0x88,0xb1,0x98,0x7c,0xf3,0x3d,0x60,0x6c,0x7b,0xca,0xd3,0x1f,0x32,0x65,0x04,0x28 +db 0x64,0xbe,0x85,0x9b,0x2f,0x59,0x8a,0xd7,0xb0,0x25,0xac,0xaf,0x12,0x03,0xe2,0xf2 + +EK_d: +dw 0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF, +dw 0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC + +mask31: +dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF + +align 16 +bit_reverse_table_l: +db 0x00, 0x08, 0x04, 0x0c, 0x02, 0x0a, 0x06, 0x0e, 0x01, 0x09, 0x05, 0x0d, 
0x03, 0x0b, 0x07, 0x0f + +align 16 +bit_reverse_table_h: +db 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0 + +align 16 +bit_reverse_and_table: +db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f + +align 16 +data_mask_64bits: +dd 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 + +bit_mask_table: +db 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe + + +section .text +align 64 + +%define OFFSET_FR1 (16*4) +%define OFFSET_FR2 (17*4) +%define OFFSET_BRC_X0 (18*4) +%define OFFSET_BRC_X1 (19*4) +%define OFFSET_BRC_X2 (20*4) +%define OFFSET_BRC_X3 (21*4) + +%define MASK31 xmm12 + +%define OFS_R1 (16*(4*4)) +%define OFS_R2 (OFS_R1 + (4*4)) +%define OFS_X0 (OFS_R2 + (4*4)) +%define OFS_X1 (OFS_X0 + (4*4)) +%define OFS_X2 (OFS_X1 + (4*4)) +%define OFS_X3 (OFS_X2 + (4*4)) + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET XMM_STORAGE + +%macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 +%ifidn __OUTPUT_FORMAT__, win64 + push rdi + push rsi +%endif + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + vmovdqu [rsp + 0*16],xmm6 + vmovdqu [rsp + 1*16],xmm7 + vmovdqu [rsp + 2*16],xmm8 + vmovdqu [rsp + 3*16],xmm9 + vmovdqu [rsp + 4*16],xmm10 + vmovdqu [rsp + 5*16],xmm11 + vmovdqu [rsp + 6*16],xmm12 + vmovdqu [rsp + 7*16],xmm13 + vmovdqu [rsp + 8*16],xmm14 + vmovdqu [rsp + 9*16],xmm15 +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + 9*16] + vmovdqu xmm14, [rsp + 8*16] + vmovdqu xmm13, [rsp + 7*16] + vmovdqu xmm12, [rsp + 6*16] + vmovdqu xmm11, [rsp + 5*16] + vmovdqu xmm10, [rsp + 4*16] + vmovdqu xmm9, [rsp + 3*16] + vmovdqu xmm8, [rsp + 2*16] + vmovdqu xmm7, [rsp + 1*16] + vmovdqu xmm6, [rsp + 0*16] +%endif + mov rsp, r14 +%ifidn __OUTPUT_FORMAT__, win64 + 
pop rsi + pop rdi +%endif + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;; +;; make_u31() +;; +%macro make_u31 4 + +%define %%Rt %1 +%define %%Ke %2 +%define %%Ek %3 +%define %%Iv %4 + xor %%Rt, %%Rt + shrd %%Rt, %%Iv, 8 + shrd %%Rt, %%Ek, 15 + shrd %%Rt, %%Ke, 9 +%endmacro + + +; +; bits_reorg4() +; +; params +; %1 - round number +; rax - LFSR pointer +; uses +; +; return +; +%macro bits_reorg4 1 + ; + ; xmm15 = LFSR_S15 + ; xmm14 = LFSR_S14 + ; xmm11 = LFSR_S11 + ; xmm9 = LFSR_S9 + ; xmm7 = LFSR_S7 + ; xmm5 = LFSR_S5 + ; xmm2 = LFSR_S2 + ; xmm0 = LFSR_S0 + ; + vmovdqa xmm15, [rax + ((15 + %1) % 16)*16] + vmovdqa xmm14, [rax + ((14 + %1) % 16)*16] + vmovdqa xmm11, [rax + ((11 + %1) % 16)*16] + vmovdqa xmm9, [rax + (( 9 + %1) % 16)*16] + vmovdqa xmm7, [rax + (( 7 + %1) % 16)*16] + vmovdqa xmm5, [rax + (( 5 + %1) % 16)*16] + vmovdqa xmm2, [rax + (( 2 + %1) % 16)*16] + vmovdqa xmm0, [rax + (( 0 + %1) % 16)*16] + + vpxor xmm1, xmm1 + vpslld xmm15, 1 + vpblendw xmm3, xmm14, xmm1, 0xAA + vpblendw xmm15, xmm3, xmm15, 0xAA + + vmovdqa [rax + OFS_X0], xmm15 ; BRC_X0 + vpslld xmm11, 16 + vpsrld xmm9, 15 + vpor xmm11, xmm9 + vmovdqa [rax + OFS_X1], xmm11 ; BRC_X1 + vpslld xmm7, 16 + vpsrld xmm5, 15 + vpor xmm7, xmm5 + vmovdqa [rax + OFS_X2], xmm7 ; BRC_X2 + vpslld xmm2, 16 + vpsrld xmm0, 15 + vpor xmm2, xmm0 + vmovdqa [rax + OFS_X3], xmm2 ; BRC_X3 +%endmacro + +%macro lookup_single_sbox 2 +%define %%table %1 ; [in] Pointer to table to look up +%define %%idx_val %2 ; [in/out] Index to look up and returned value (rcx, rdx, r8, r9) + +%ifdef SAFE_LOOKUP + ;; Save all registers used in lookup_8bit (xmm0-5, r9,r10) + ;; and registers for param passing and return (4 regs, OS dependent) + ;; (6*16 + 6*8 = 144 bytes) + sub rsp, 144 + + vmovdqu [rsp], xmm0 + vmovdqu [rsp + 16], xmm1 + vmovdqu [rsp + 32], xmm2 + vmovdqu [rsp + 48], xmm3 + vmovdqu [rsp + 64], xmm4 + vmovdqu [rsp + 80], xmm5 + mov [rsp + 96], r9 + mov [rsp + 104], r10 + +%ifdef LINUX + mov [rsp + 112], rdi + 
mov [rsp + 120], rsi + mov [rsp + 128], rdx + mov rdi, %%table + mov rsi, %%idx_val + mov rdx, 256 +%else +%ifnidni %%idx_val, rcx + mov [rsp + 112], rcx +%endif +%ifnidni %%idx_val, rdx + mov [rsp + 120], rdx +%endif +%ifnidni %%idx_val, r8 + mov [rsp + 128], r8 +%endif + + mov rdx, %%idx_val + mov rcx, %%table + mov r8, 256 +%endif + mov [rsp + 136], rax + + call lookup_8bit_avx + + ;; Restore all registers + vmovdqu xmm0, [rsp] + vmovdqu xmm1, [rsp + 16] + vmovdqu xmm2, [rsp + 32] + vmovdqu xmm3, [rsp + 48] + vmovdqu xmm4, [rsp + 64] + vmovdqu xmm5, [rsp + 80] + mov r9, [rsp + 96] + mov r10, [rsp + 104] + +%ifdef LINUX + mov rdi, [rsp + 112] + mov rsi, [rsp + 120] + mov rdx, [rsp + 128] +%else +%ifnidni %%idx_val, rcx + mov rcx, [rsp + 112] +%endif +%ifnidni %%idx_val, rdx + mov rdx, [rsp + 120] +%endif + ;; r8 is saved above only when it is not the index register; mirror that guard +%ifnidni %%idx_val, r8 + mov r8, [rsp + 128] +%endif +%endif + + ;; Move returned value from lookup function, before restoring rax + mov DWORD(%%idx_val), eax + mov rax, [rsp + 136] + + add rsp, 144 + +%else ;; SAFE_LOOKUP + + movzx DWORD(%%idx_val), BYTE [%%table + %%idx_val] + +%endif ;; SAFE_LOOKUP +%endmacro + +; +; sbox_lkup() +; +; params +; %1 R1/R2 table offset +; %2 R1/R2 entry offset +; %3 xmm reg name +; uses +; rcx,rdx,r8,r9,r10,rsi,rdi +; return +; +%macro sbox_lkup 3 + vpextrb rcx, %3, (0 + (%2 * 4)) + lookup_single_sbox rsi, rcx + vpextrb rdx, %3, (1 + (%2 * 4)) + lookup_single_sbox rdi, rdx + + xor r10, r10 + vpextrb r8, %3, (2 + (%2 * 4)) + lookup_single_sbox rsi, r8 + vpextrb r9, %3, (3 + (%2 * 4)) + lookup_single_sbox rdi, r9 + + shrd r10d, ecx, 8 + shrd r10d, edx, 8 + shrd r10d, r8d, 8 + shrd r10d, r9d, 8 + mov [rax + %1 + (%2 * 4)], r10d +%endmacro + + +; +; rot_mod32() +; +; uses xmm7 +; +%macro rot_mod32 3 + vpslld %1, %2, %3 + vpsrld xmm7, %2, (32 - %3) + + vpor %1, xmm7 +%endmacro + + +; +; nonlin_fun4() +; +; params +; %1 == 1, then calculate W +; uses +; +; return +; xmm0 = W value, updates F_R1[] / F_R2[] +; +%macro nonlin_fun4 1 + +%if
(%1 == 1) + vmovdqa xmm0, [rax + OFS_X0] + vpxor xmm0, [rax + OFS_R1] + vpaddd xmm0, [rax + OFS_R2] ; W = (BRC_X0 ^ F_R1) + F_R2 +%endif + ; + vmovdqa xmm1, [rax + OFS_R1] + vmovdqa xmm2, [rax + OFS_R2] + vpaddd xmm1, [rax + OFS_X1] ; W1 = F_R1 + BRC_X1 + vpxor xmm2, [rax + OFS_X2] ; W2 = F_R2 ^ BRC_X2 + ; + + vpslld xmm3, xmm1, 16 + vpsrld xmm4, xmm1, 16 + vpslld xmm5, xmm2, 16 + vpsrld xmm6, xmm2, 16 + vpor xmm1, xmm3, xmm6 + vpor xmm2, xmm4, xmm5 + + ; + rot_mod32 xmm3, xmm1, 2 + rot_mod32 xmm4, xmm1, 10 + rot_mod32 xmm5, xmm1, 18 + rot_mod32 xmm6, xmm1, 24 + vpxor xmm1, xmm3 + vpxor xmm1, xmm4 + vpxor xmm1, xmm5 + vpxor xmm1, xmm6 ; XMM1 = U = L1(P) + + sbox_lkup OFS_R1, 0, xmm1 ; F_R1[0] + sbox_lkup OFS_R1, 1, xmm1 ; F_R1[1] + sbox_lkup OFS_R1, 2, xmm1 ; F_R1[2] + sbox_lkup OFS_R1, 3, xmm1 ; F_R1[3] + ; + rot_mod32 xmm3, xmm2, 8 + rot_mod32 xmm4, xmm2, 14 + rot_mod32 xmm5, xmm2, 22 + rot_mod32 xmm6, xmm2, 30 + vpxor xmm2, xmm3 + vpxor xmm2, xmm4 + vpxor xmm2, xmm5 + vpxor xmm2, xmm6 ; XMM2 = V = L2(Q) + ; + + sbox_lkup OFS_R2, 0, xmm2 ; F_R2[0] + sbox_lkup OFS_R2, 1, xmm2 ; F_R2[1] + sbox_lkup OFS_R2, 2, xmm2 ; F_R2[2] + sbox_lkup OFS_R2, 3, xmm2 ; F_R2[3] +%endmacro + + +; +; store_kstr4() +; +; params +; +; uses +; xmm0 as input +; return +; +%macro store_kstr4 0 + vpxor xmm0, [rax + OFS_X3] + vpextrd r15d, xmm0, 3 + pop r9 ; *pKeyStr4 + vpextrd r14d, xmm0, 2 + pop r8 ; *pKeyStr3 + vpextrd r13d, xmm0, 1 + pop rdx ; *pKeyStr2 + vpextrd r12d, xmm0, 0 + pop rcx ; *pKeyStr1 + mov [r9], r15d + mov [r8], r14d + mov [rdx], r13d + mov [rcx], r12d + add rcx, 4 + add rdx, 4 + add r8, 4 + add r9, 4 + push rcx + push rdx + push r8 + push r9 +%endmacro + + +; +; add_mod31() +; add two 32-bit args and reduce mod (2^31-1) +; params +; %1 - arg1/res +; %2 - arg2 +; uses +; xmm2 +; return +; %1 +%macro add_mod31 2 + vpaddd %1, %2 + vpsrld xmm2, %1, 31 + vpand %1, MASK31 + vpaddd %1, xmm2 +%endmacro + + +; +; rot_mod31() +; rotate (mult by pow of 2) 32-bit arg and reduce mod 
(2^31-1) +; params +; %1 - arg +; %2 - # of bits +; uses +; xmm2 +; return +; %1 +%macro rot_mod31 2 + + vpslld xmm2, %1, %2 + vpsrld %1, %1, (31 - %2) + + vpor %1, xmm2 + vpand %1, MASK31 +%endmacro + + +; +; lfsr_updt4() +; +; params +; %1 - round number +; uses +; xmm0 as input (ZERO or W) +; return +; +%macro lfsr_updt4 1 + ; + ; xmm1 = LFSR_S0 + ; xmm4 = LFSR_S4 + ; xmm10 = LFSR_S10 + ; xmm13 = LFSR_S13 + ; xmm15 = LFSR_S15 + ; + vpxor xmm3, xmm3 + vmovdqa xmm1, [rax + (( 0 + %1) % 16)*16] + vmovdqa xmm4, [rax + (( 4 + %1) % 16)*16] + vmovdqa xmm10, [rax + ((10 + %1) % 16)*16] + vmovdqa xmm13, [rax + ((13 + %1) % 16)*16] + vmovdqa xmm15, [rax + ((15 + %1) % 16)*16] + + ; Calculate LFSR feedback + add_mod31 xmm0, xmm1 + rot_mod31 xmm1, 8 + add_mod31 xmm0, xmm1 + rot_mod31 xmm4, 20 + add_mod31 xmm0, xmm4 + rot_mod31 xmm10, 21 + add_mod31 xmm0, xmm10 + rot_mod31 xmm13, 17 + add_mod31 xmm0, xmm13 + rot_mod31 xmm15, 15 + add_mod31 xmm0, xmm15 + + + + vmovdqa [rax + (( 0 + %1) % 16)*16], xmm0 + + ; LFSR_S16 = (LFSR_S15++) = eax +%endmacro + + +; +; key_expand_4() +; +%macro key_expand_4 2 + movzx r8d, byte [rdi + (%1 + 0)] + movzx r9d, word [rbx + ((%1 + 0)*2)] + movzx r10d, byte [rsi + (%1 + 0)] + make_u31 r11d, r8d, r9d, r10d + mov [rax + (((%1 + 0)*16)+(%2*4))], r11d + + movzx r12d, byte [rdi + (%1 + 1)] + movzx r13d, word [rbx + ((%1 + 1)*2)] + movzx r14d, byte [rsi + (%1 + 1)] + make_u31 r15d, r12d, r13d, r14d + mov [rax + (((%1 + 1)*16)+(%2*4))], r15d +%endmacro + + +MKGLOBAL(asm_ZucInitialization_4_avx,function,internal) +asm_ZucInitialization_4_avx: + +%ifdef LINUX + %define pKe rdi + %define pIv rsi + %define pState rdx +%else + %define pKe rcx + %define pIv rdx + %define pState r8 +%endif + + ; Save non-volatile registers + push rbx + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + push rdx + + lea rax, [pState] ; load pointer to LFSR + push pState ; Save LFSR Pointer to stack + + ; setup the key pointer for first buffer key expand + mov 
rbx, [pKe] ; load the pointer to the array of keys into rbx + + push pKe ; save rdi (key pointer) to the stack + lea rdi, [rbx] ; load the pointer to the first key into rdi + + + ; setup the IV pointer for first buffer key expand + mov rcx, [pIv] ; load the pointer to the array of IV's + push pIv ; save the IV pointer to the stack + lea rsi, [rcx] ; load the first IV pointer + + lea rbx, [EK_d] ; load D variables + + ; Expand key packet 1 + key_expand_4 0, 0 + key_expand_4 2, 0 + key_expand_4 4, 0 + key_expand_4 6, 0 + key_expand_4 8, 0 + key_expand_4 10, 0 + key_expand_4 12, 0 + key_expand_4 14, 0 + + + ;second packet key expand here - reset pointers + pop rdx ; get IV array pointer from Stack + mov rcx, [rdx+8] ; load offset to IV 2 in array + lea rsi, [rcx] ; load pointer to IV2 + + pop rbx ; get Key array pointer from Stack + mov rcx, [rbx+8] ; load offset to key 2 in array + lea rdi, [rcx] ; load pointer to Key 2 + + push rbx ; save Key pointer + push rdx ; save IV pointer + + lea rbx, [EK_d] + + ; Expand key packet 2 + key_expand_4 0, 1 + key_expand_4 2, 1 + key_expand_4 4, 1 + key_expand_4 6, 1 + key_expand_4 8, 1 + key_expand_4 10, 1 + key_expand_4 12, 1 + key_expand_4 14, 1 + + + + ;Third packet key expand here - reset pointers + pop rdx ; get IV array pointer from Stack + mov rcx, [rdx+16] ; load offset to IV 3 in array + lea rsi, [rcx] ; load pointer to IV3 + + pop rbx ; get Key array pointer from Stack + mov rcx, [rbx+16] ; load offset to key 3 in array + lea rdi, [rcx] ; load pointer to Key 3 + + push rbx ; save Key pointer + push rdx ; save IV pointer + lea rbx, [EK_d] + ; Expand key packet 3 + key_expand_4 0, 2 + key_expand_4 2, 2 + key_expand_4 4, 2 + key_expand_4 6, 2 + key_expand_4 8, 2 + key_expand_4 10, 2 + key_expand_4 12, 2 + key_expand_4 14, 2 + + + + ;fourth packet key expand here - reset pointers + pop rdx ; get IV array pointer from Stack + mov rcx, [rdx+24] ; load offset to IV 4 in array + lea rsi, [rcx] ; load pointer to IV4 + + pop rbx 
; get Key array pointer from Stack + mov rcx, [rbx+24] ; load offset to key 4 in array + lea rdi, [rcx] ; load pointer to Key 4 + lea rbx, [EK_d] + ; Expand key packet 4 + key_expand_4 0, 3 + key_expand_4 2, 3 + key_expand_4 4, 3 + key_expand_4 6, 3 + key_expand_4 8, 3 + key_expand_4 10, 3 + key_expand_4 12, 3 + key_expand_4 14, 3 + + ; Set R1 and R2 to zero + ;xor r10, r10 + ;xor r11, r11 + + + + ; Load read-only registers + lea rdi, [S0] ; used by sbox_lkup() macro + lea rsi, [S1] + vmovdqa xmm12, [mask31] + + ; Shift LFSR 32-times, update state variables +%assign N 0 +%rep 32 + pop rdx + lea rax, [rdx] + push rdx + + bits_reorg4 N + nonlin_fun4 1 + vpsrld xmm0,1 ; Shift out LSB of W + + pop rdx + lea rax, [rdx] + push rdx + + lfsr_updt4 N ; W (xmm0) used in LFSR update - not set to zero +%assign N N+1 +%endrep + + ; And once more, initial round from keygen phase = 33 times + pop rdx + lea rax, [rdx] + push rdx + + bits_reorg4 0 + nonlin_fun4 0 + + pop rdx + lea rax, [rdx] + + vpxor xmm0, xmm0 + lfsr_updt4 0 + + + + ; Restore non-volatile registers + pop rdx + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbx + + ret +; +; +; +;; +;; void asm_ZucGenKeystream64B_4_avx(state4_t *pSta, u32* pKeyStr1, u32* pKeyStr2, u32* pKeyStr3, u32* pKeyStr4); +;; +;; WIN64 +;; RCX - pSta +;; RDX - pKeyStr1 +;; R8 - pKeyStr2 +;; R9 - pKeyStr3 +;; Stack - pKeyStr4 +;; +;; LIN64 +;; RDI - pSta +;; RSI - pKeyStr1 +;; RDX - pKeyStr2 +;; RCX - pKeyStr3 +;; R8 - pKeyStr4 +;; +MKGLOBAL(asm_ZucGenKeystream64B_4_avx,function,internal) +asm_ZucGenKeystream64B_4_avx: + +%ifdef LINUX + %define pState rdi + %define pKS1 rsi + %define pKS2 rdx + %define pKS3 rcx + %define pKS4 r8 +%else + %define pState rcx + %define pKS1 rdx + %define pKS2 r8 + %define pKS3 r9 + %define pKS4 rax +%endif + +%ifndef LINUX + mov rax, [rsp + 8*5] ; 5th parameter from stack +%endif + + ; Save non-volatile registers + push rbx + push r12 + push r13 + push r14 + push r15 + +%ifndef LINUX + push rdi
+ push rsi +%endif + ; Store 4 keystream pointers on the stack + + push pKS1 + push pKS2 + push pKS3 + push pKS4 + + + ; Load state pointer in RAX + mov rax, pState + + + ; Load read-only registers + lea rdi, [S0] ; used by sbox_lkup() macro + lea rsi, [S1] + vmovdqa xmm12, [mask31] + + ; Generate 64B of keystream in 16 rounds +%assign N 1 +%rep 16 + bits_reorg4 N + nonlin_fun4 1 + store_kstr4 + vpxor xmm0, xmm0 + lfsr_updt4 N +%assign N N+1 +%endrep + + ; Take keystream pointers off (#push = #pops) + pop rax + pop rax + pop rax + pop rax + +%ifndef LINUX + pop rsi + pop rdi +%endif + + ; Restore non-volatile registers + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret + + +;; +;; extern uint32_t asm_Eia3RemainderAVX(const void *ks, const void *data, uint64_t n_bits) +;; +;; Returns authentication update value to be XOR'ed with current authentication tag +;; +;; WIN64 +;; RCX - KS (key stream pointer) +;; RDX - DATA (data pointer) +;; R8 - N_BITS (number data bits to process) +;; LIN64 +;; RDI - KS (key stream pointer) +;; RSI - DATA (data pointer) +;; RDX - N_BITS (number data bits to process) +;; +align 64 +MKGLOBAL(asm_Eia3RemainderAVX,function,internal) +asm_Eia3RemainderAVX: + +%ifdef LINUX + %define KS rdi + %define DATA rsi + %define N_BITS rdx +%else + %define KS rcx + %define DATA rdx + %define N_BITS r8 +%endif + FUNC_SAVE + + vmovdqa xmm5, [bit_reverse_table_l] + vmovdqa xmm6, [bit_reverse_table_h] + vmovdqa xmm7, [bit_reverse_and_table] + vmovdqa xmm10, [data_mask_64bits] + vpxor xmm9, xmm9 + +%rep 3 + cmp N_BITS, 128 + jb Eia3RoundsAVX_dq_end + + ;; read 16 bytes and reverse bits + vmovdqu xmm0, [DATA] + vmovdqa xmm1, xmm0 + vpand xmm1, xmm7 + + vmovdqa xmm2, xmm7 + vpandn xmm2, xmm0 + vpsrld xmm2, 4 + + vmovdqa xmm8, xmm6 ; bit reverse low nibbles (use high table) + vpshufb xmm8, xmm1 + + vmovdqa xmm4, xmm5 ; bit reverse high nibbles (use low table) + vpshufb xmm4, xmm2 + + vpor xmm8, xmm4 + ; xmm8 - bit reversed data bytes + + ;; ZUC 
authentication part + ;; - 4x32 data bits + ;; - set up KS + vmovdqu xmm3, [KS + (0*4)] + vmovdqu xmm4, [KS + (2*4)] + vpshufd xmm0, xmm3, 0x61 + vpshufd xmm1, xmm4, 0x61 + + ;; - set up DATA + vmovdqa xmm2, xmm8 + vpand xmm2, xmm10 + vpshufd xmm3, xmm2, 0xdc + vmovdqa xmm4, xmm3 + + vpsrldq xmm8, 8 + vpshufd xmm13, xmm8, 0xdc + vmovdqa xmm14, xmm13 + + ;; - clmul + ;; - xor the results from 4 32-bit words together + vpclmulqdq xmm3, xmm0, 0x00 + vpclmulqdq xmm4, xmm0, 0x11 + vpclmulqdq xmm13, xmm1, 0x00 + vpclmulqdq xmm14, xmm1, 0x11 + + vpxor xmm3, xmm4 + vpxor xmm13, xmm14 + vpxor xmm9, xmm3 + vpxor xmm9, xmm13 + lea DATA, [DATA + 16] + lea KS, [KS + 16] + sub N_BITS, 128 +%endrep +Eia3RoundsAVX_dq_end: + +%rep 3 + cmp N_BITS, 32 + jb Eia3RoundsAVX_dw_end + + ;; swap dwords in KS + vmovq xmm1, [KS] + vpshufd xmm4, xmm1, 0xf1 + + ;; bit-reverse 4 bytes of data + vmovdqa xmm2, xmm7 + vmovd xmm0, [DATA] + vmovdqa xmm1, xmm0 + vpand xmm1, xmm2 + + vpandn xmm2, xmm0 + vpsrld xmm2, 4 + + vmovdqa xmm0, xmm6 ; bit reverse low nibbles (use high table) + vpshufb xmm0, xmm1 + + vmovdqa xmm3, xmm5 ; bit reverse high nibbles (use low table) + vpshufb xmm3, xmm2 + + vpor xmm0, xmm3 + + ;; rol & xor + vpclmulqdq xmm0, xmm4, 0 + vpxor xmm9, xmm0 + + lea DATA, [DATA + 4] + lea KS, [KS + 4] + sub N_BITS, 32 +%endrep + +Eia3RoundsAVX_dw_end: + vmovq rax, xmm9 + shr rax, 32 + + or N_BITS, N_BITS + jz Eia3RoundsAVX_byte_loop_end + + ;; get 64-bit key stream for the last data bits (less than 32) + mov KS, [KS] + + ;; process remaining data bytes and bits +Eia3RoundsAVX_byte_loop: + or N_BITS, N_BITS + jz Eia3RoundsAVX_byte_loop_end + + cmp N_BITS, 8 + jb Eia3RoundsAVX_byte_partial + + movzx r11, byte [DATA] + sub N_BITS, 8 + jmp Eia3RoundsAVX_byte_read + +Eia3RoundsAVX_byte_partial: + ;; process remaining bits (up to 7) + lea r11, [bit_mask_table] + movzx r10, byte [r11 + N_BITS] + movzx r11, byte [DATA] + and r11, r10 + xor N_BITS, N_BITS +Eia3RoundsAVX_byte_read: + +%assign 
DATATEST 0x80 +%rep 8 + xor r10, r10 + test r11, DATATEST + cmovne r10, KS + xor rax, r10 + rol KS, 1 +%assign DATATEST (DATATEST >> 1) +%endrep ; byte boundary + lea DATA, [DATA + 1] + jmp Eia3RoundsAVX_byte_loop + +Eia3RoundsAVX_byte_loop_end: + + ;; eax - holds the return value at this stage + FUNC_RESTORE + + ret + +;; +;;extern uint32_t asm_Eia3Round64BAVX(uint32_t T, const void *KS, const void *DATA) +;; +;; Updates authentication tag T based on keystream KS and DATA. +;; - it processes 64 bytes of DATA +;; - reads data in 16 byte chunks and bit reverses them +;; - reads and re-arranges KS +;; - employs clmul for the XOR & ROL part +;; - copies top 64 butes of KS to bottom (for the next round) +;; +;; WIN64 +;; RCX - T +;; RDX - KS pointer to key stream (2 x 64 bytes) +;;; R8 - DATA pointer to data +;; LIN64 +;; RDI - T +;; RSI - KS pointer to key stream (2 x 64 bytes) +;; RDX - DATA pointer to data +;; +align 64 +MKGLOBAL(asm_Eia3Round64BAVX,function,internal) +asm_Eia3Round64BAVX: + +%ifdef LINUX + %define T edi + %define KS rsi + %define DATA rdx +%else + %define T ecx + %define KS rdx + %define DATA r8 +%endif + + FUNC_SAVE + + vmovdqa xmm5, [bit_reverse_table_l] + vmovdqa xmm6, [bit_reverse_table_h] + vmovdqa xmm7, [bit_reverse_and_table] + vmovdqa xmm10, [data_mask_64bits] + + vpxor xmm9, xmm9 +%assign I 0 +%rep 4 + ;; read 16 bytes and reverse bits + vmovdqu xmm0, [DATA + 16*I] + vpand xmm1, xmm0, xmm7 + + vpandn xmm2, xmm7, xmm0 + vpsrld xmm2, 4 + + vpshufb xmm8, xmm6, xmm1 ; bit reverse low nibbles (use high table) + vpshufb xmm4, xmm5, xmm2 ; bit reverse high nibbles (use low table) + + vpor xmm8, xmm4 + ; xmm8 - bit reversed data bytes + + ;; ZUC authentication part + ;; - 4x32 data bits + ;; - set up KS +%if I != 0 + vmovdqa xmm11, xmm12 + vmovdqu xmm12, [KS + (I*16) + (4*4)] +%else + vmovdqu xmm11, [KS + (I*16) + (0*4)] + vmovdqu xmm12, [KS + (I*16) + (4*4)] +%endif + vpalignr xmm13, xmm12, xmm11, 8 + vpshufd xmm2, xmm11, 0x61 + vpshufd xmm3, 
xmm13, 0x61 + + ;; - set up DATA + vpand xmm13, xmm10, xmm8 + vpshufd xmm0, xmm13, 0xdc + + vpsrldq xmm8, 8 + vpshufd xmm1, xmm8, 0xdc + + ;; - clmul + ;; - xor the results from 4 32-bit words together +%if I != 0 + vpclmulqdq xmm13, xmm0, xmm2, 0x00 + vpclmulqdq xmm14, xmm0, xmm2, 0x11 + vpclmulqdq xmm15, xmm1, xmm3, 0x00 + vpclmulqdq xmm8, xmm1, xmm3, 0x11 + + vpxor xmm13, xmm14 + vpxor xmm15, xmm8 + vpxor xmm9, xmm13 + vpxor xmm9, xmm15 +%else + vpclmulqdq xmm9, xmm0, xmm2, 0x00 + vpclmulqdq xmm13, xmm0, xmm2, 0x11 + vpclmulqdq xmm14, xmm1, xmm3, 0x00 + vpclmulqdq xmm15, xmm1, xmm3, 0x11 + + vpxor xmm14, xmm15 + vpxor xmm9, xmm13 + vpxor xmm9, xmm14 +%endif + + +%assign I (I + 1) +%endrep + + ;; - update T + vmovq rax, xmm9 + shr rax, 32 + xor eax, T + + FUNC_RESTORE + + ret + + +;---------------------------------------------------------------------------------------- +;---------------------------------------------------------------------------------------- + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/zuc_avx_top.c b/src/spdk/intel-ipsec-mb/avx/zuc_avx_top.c new file mode 100755 index 000000000..b3ba2de81 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/zuc_avx_top.c @@ -0,0 +1,548 @@ +/******************************************************************************* + Copyright (c) 2009-2019, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/*----------------------------------------------------------------------- +* zuc_avx.c +*----------------------------------------------------------------------- +* An implementation of ZUC, the core algorithm for the +* 3GPP Confidentiality and Integrity algorithms. 
+* +*-----------------------------------------------------------------------*/ + +#include <string.h> + +#include "include/zuc_internal.h" +#include "include/wireless_common.h" +#include "include/save_xmms.h" +#include "include/clear_regs_mem.h" +#include "intel-ipsec-mb.h" + +#define SAVE_XMMS save_xmms_avx +#define RESTORE_XMMS restore_xmms_avx +#define CLEAR_SCRATCH_SIMD_REGS clear_scratch_xmms_avx + +static inline +void _zuc_eea3_1_buffer_avx(const void *pKey, + const void *pIv, + const void *pBufferIn, + void *pBufferOut, + const uint32_t length) +{ + DECLARE_ALIGNED(ZucState_t zucState, 64); + DECLARE_ALIGNED(uint8_t keyStream[64], 64); + /* buffer to store 64 bytes of keystream */ + DECLARE_ALIGNED(uint8_t tempSrc[64], 64); + DECLARE_ALIGNED(uint8_t tempDst[64], 64); + + const uint64_t *pIn64 = NULL; + const uint8_t *pIn8 = NULL; + uint8_t *pOut8 = NULL; + uint64_t *pOut64 = NULL, *pKeyStream64 = NULL; + uint64_t *pTemp64 = NULL, *pdstTemp64 = NULL; + + uint32_t numKeyStreamsPerPkt = length/ ZUC_KEYSTR_LEN; + uint32_t numBytesLeftOver = length % ZUC_KEYSTR_LEN; + + /* need to set the LFSR state to zero */ + memset(&zucState, 0, sizeof(ZucState_t)); + + /* initialize the zuc state */ + asm_ZucInitialization(pKey, pIv, &(zucState)); + + /* Loop Over all the Quad-Words in input buffer and XOR with the 64bits + * of generated keystream */ + pOut64 = (uint64_t *) pBufferOut; + pIn64 = (const uint64_t *) pBufferIn; + + while (numKeyStreamsPerPkt--) { + /* Generate the key stream 64 bytes at a time */ + asm_ZucGenKeystream64B((uint32_t *) &keyStream[0], &zucState); + + /* XOR The Keystream generated with the input buffer here */ + pKeyStream64 = (uint64_t *) keyStream; + asm_XorKeyStream64B_avx(pIn64, pOut64, pKeyStream64); + pIn64 += 8; + pOut64 += 8; + } + + /* Check for remaining 0 to 63 bytes */ + pIn8 = (const uint8_t *) pBufferIn; + pOut8 = (uint8_t *) pBufferOut; + if(numBytesLeftOver) { + asm_ZucGenKeystream64B((uint32_t *) &keyStream[0], &zucState); + + /* 
copy the remaining bytes into temporary buffer and XOR with + * the 64-bytes of keystream. Then copy on the valid bytes back + * to the output buffer */ + + memcpy(&tempSrc[0], &pIn8[length - numBytesLeftOver], + numBytesLeftOver); + pKeyStream64 = (uint64_t *) &keyStream[0]; + pTemp64 = (uint64_t *) &tempSrc[0]; + pdstTemp64 = (uint64_t *) &tempDst[0]; + + asm_XorKeyStream64B_avx(pTemp64, pdstTemp64, pKeyStream64); + memcpy(&pOut8[length - numBytesLeftOver], &tempDst[0], + numBytesLeftOver); + + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(keyStream, sizeof(keyStream)); + clear_mem(&zucState, sizeof(zucState)); +#endif +} + +static inline +void _zuc_eea3_4_buffer_avx(const void * const pKey[4], + const void * const pIv[4], + const void * const pBufferIn[4], + void *pBufferOut[4], + const uint32_t length[4]) +{ + DECLARE_ALIGNED(ZucState4_t state, 64); + DECLARE_ALIGNED(ZucState_t singlePktState, 64); + unsigned int i = 0; + /* Calculate the minimum input packet size */ + uint32_t bytes1 = (length[0] < length[1] ? + length[0] : length[1]); + uint32_t bytes2 = (length[2] < length[3] ? + length[2] : length[3]); + /* min number of bytes */ + uint32_t bytes = (bytes1 < bytes2) ? 
bytes1 : bytes2; + uint32_t numKeyStreamsPerPkt = bytes/ZUC_KEYSTR_LEN; + uint32_t remainBytes[4] = {0}; + DECLARE_ALIGNED(uint8_t keyStr1[64], 64); + DECLARE_ALIGNED(uint8_t keyStr2[64], 64); + DECLARE_ALIGNED(uint8_t keyStr3[64], 64); + DECLARE_ALIGNED(uint8_t keyStr4[64], 64); + DECLARE_ALIGNED(uint8_t tempSrc[64], 64); + DECLARE_ALIGNED(uint8_t tempDst[64], 64); + /* structure to store the 4 keys */ + DECLARE_ALIGNED(ZucKey4_t keys, 64); + /* structure to store the 4 IV's */ + DECLARE_ALIGNED(ZucIv4_t ivs, 64); + uint32_t numBytesLeftOver = 0; + const uint8_t *pTempBufInPtr = NULL; + uint8_t *pTempBufOutPtr = NULL; + + const uint64_t *pIn64_0 = NULL; + const uint64_t *pIn64_1 = NULL; + const uint64_t *pIn64_2 = NULL; + const uint64_t *pIn64_3 = NULL; + uint64_t *pOut64_0 = NULL; + uint64_t *pOut64_1 = NULL; + uint64_t *pOut64_2 = NULL; + uint64_t *pOut64_3 = NULL; + uint64_t *pTempSrc64 = NULL; + uint64_t *pTempDst64 = NULL; + uint64_t *pKeyStream64 = NULL; + + /* rounded down minimum length */ + bytes = numKeyStreamsPerPkt * ZUC_KEYSTR_LEN; + + /* Need to set the LFSR state to zero */ + memset(&state, 0, sizeof(ZucState4_t)); + + /* Calculate the number of bytes left over for each packet */ + for (i=0; i< 4; i++) + remainBytes[i] = length[i] - bytes; + + /* Setup the Keys */ + keys.pKey1 = pKey[0]; + keys.pKey2 = pKey[1]; + keys.pKey3 = pKey[2]; + keys.pKey4 = pKey[3]; + + /* setup the IV's */ + ivs.pIv1 = pIv[0]; + ivs.pIv2 = pIv[1]; + ivs.pIv3 = pIv[2]; + ivs.pIv4 = pIv[3]; + + asm_ZucInitialization_4_avx( &keys, &ivs, &state); + + pOut64_0 = (uint64_t *) pBufferOut[0]; + pOut64_1 = (uint64_t *) pBufferOut[1]; + pOut64_2 = (uint64_t *) pBufferOut[2]; + pOut64_3 = (uint64_t *) pBufferOut[3]; + + pIn64_0 = (const uint64_t *) pBufferIn[0]; + pIn64_1 = (const uint64_t *) pBufferIn[1]; + pIn64_2 = (const uint64_t *) pBufferIn[2]; + pIn64_3 = (const uint64_t *) pBufferIn[3]; + + /* Loop for 64 bytes at a time generating 4 key-streams per loop */ + while 
(numKeyStreamsPerPkt) { + /* Generate 64 bytes at a time */ + asm_ZucGenKeystream64B_4_avx(&state, + (uint32_t *) keyStr1, + (uint32_t *) keyStr2, + (uint32_t *) keyStr3, + (uint32_t *) keyStr4); + + /* XOR the KeyStream with the input buffers and store in output + * buffer*/ + pKeyStream64 = (uint64_t *) keyStr1; + asm_XorKeyStream64B_avx(pIn64_0, pOut64_0, pKeyStream64); + pIn64_0 += 8; + pOut64_0 += 8; + + pKeyStream64 = (uint64_t *) keyStr2; + asm_XorKeyStream64B_avx(pIn64_1, pOut64_1, pKeyStream64); + pIn64_1 += 8; + pOut64_1 += 8; + + pKeyStream64 = (uint64_t *) keyStr3; + asm_XorKeyStream64B_avx(pIn64_2, pOut64_2, pKeyStream64); + pIn64_2 += 8; + pOut64_2 += 8; + + pKeyStream64 = (uint64_t *) keyStr4; + asm_XorKeyStream64B_avx(pIn64_3, pOut64_3, pKeyStream64); + pIn64_3 += 8; + pOut64_3 += 8; + + /* Update keystream count */ + numKeyStreamsPerPkt--; + + } + + /* process each packet separately for the remaining bytes */ + for (i = 0; i < 4; i++) { + if (remainBytes[i]) { + /* need to copy the zuc state to single packet state */ + singlePktState.lfsrState[0] = state.lfsrState[0][i]; + singlePktState.lfsrState[1] = state.lfsrState[1][i]; + singlePktState.lfsrState[2] = state.lfsrState[2][i]; + singlePktState.lfsrState[3] = state.lfsrState[3][i]; + singlePktState.lfsrState[4] = state.lfsrState[4][i]; + singlePktState.lfsrState[5] = state.lfsrState[5][i]; + singlePktState.lfsrState[6] = state.lfsrState[6][i]; + singlePktState.lfsrState[7] = state.lfsrState[7][i]; + singlePktState.lfsrState[8] = state.lfsrState[8][i]; + singlePktState.lfsrState[9] = state.lfsrState[9][i]; + singlePktState.lfsrState[10] = state.lfsrState[10][i]; + singlePktState.lfsrState[11] = state.lfsrState[11][i]; + singlePktState.lfsrState[12] = state.lfsrState[12][i]; + singlePktState.lfsrState[13] = state.lfsrState[13][i]; + singlePktState.lfsrState[14] = state.lfsrState[14][i]; + singlePktState.lfsrState[15] = state.lfsrState[15][i]; + + singlePktState.fR1 = state.fR1[i]; + 
singlePktState.fR2 = state.fR2[i]; + + singlePktState.bX0 = state.bX0[i]; + singlePktState.bX1 = state.bX1[i]; + singlePktState.bX2 = state.bX2[i]; + singlePktState.bX3 = state.bX3[i]; + + numKeyStreamsPerPkt = remainBytes[i] / ZUC_KEYSTR_LEN; + numBytesLeftOver = remainBytes[i] % ZUC_KEYSTR_LEN; + + pTempBufInPtr = pBufferIn[i]; + pTempBufOutPtr = pBufferOut[i]; + + /* update the output and input pointers here to point + * to the i'th buffers */ + pOut64_0 = (uint64_t *) &pTempBufOutPtr[length[i] - + remainBytes[i]]; + pIn64_0 = (const uint64_t *) &pTempBufInPtr[length[i] - + remainBytes[i]]; + + while (numKeyStreamsPerPkt--) { + /* Generate the key stream 64 bytes at a time */ + asm_ZucGenKeystream64B((uint32_t *) keyStr1, + &singlePktState); + pKeyStream64 = (uint64_t *) keyStr1; + asm_XorKeyStream64B_avx(pIn64_0, pOut64_0, + pKeyStream64); + pIn64_0 += 8; + pOut64_0 += 8; + } + + + /* Check for remaining 0 to 63 bytes */ + if (numBytesLeftOver) { + asm_ZucGenKeystream64B((uint32_t *) &keyStr1, + &singlePktState); + uint32_t offset = length[i] - numBytesLeftOver; + + /* copy the remaining bytes into temporary + * buffer and XOR with the 64-bytes of + * keystream. 
Then copy on the valid bytes back + * to the output buffer */ + memcpy(&tempSrc[0], &pTempBufInPtr[offset], + numBytesLeftOver); + memset(&tempSrc[numBytesLeftOver], 0, + 64 - numBytesLeftOver); + + pKeyStream64 = (uint64_t *) &keyStr1[0]; + pTempSrc64 = (uint64_t *) &tempSrc[0]; + pTempDst64 = (uint64_t *) &tempDst[0]; + asm_XorKeyStream64B_avx(pTempSrc64, pTempDst64, + pKeyStream64); + + memcpy(&pTempBufOutPtr[offset], + &tempDst[0], numBytesLeftOver); + } + } + } +#ifdef SAFE_DATA + /* Clear sensitive data in stack */ + clear_mem(keyStr1, sizeof(keyStr1)); + clear_mem(keyStr2, sizeof(keyStr2)); + clear_mem(keyStr3, sizeof(keyStr3)); + clear_mem(keyStr4, sizeof(keyStr4)); + clear_mem(&singlePktState, sizeof(singlePktState)); + clear_mem(&state, sizeof(state)); + clear_mem(&keys, sizeof(keys)); + clear_mem(&ivs, sizeof(ivs)); +#endif +} + +void zuc_eea3_1_buffer_avx(const void *pKey, + const void *pIv, + const void *pBufferIn, + void *pBufferOut, + const uint32_t length) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pKey == NULL || pIv == NULL || pBufferIn == NULL || + pBufferOut == NULL) + return; + + /* Check input data is in range of supported length */ + if (length < ZUC_MIN_LEN || length > ZUC_MAX_LEN) + return; +#endif + _zuc_eea3_1_buffer_avx(pKey, pIv, pBufferIn, pBufferOut, length); + +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void zuc_eea3_4_buffer_avx(const void * const pKey[4], + const void * const pIv[4], + const void * const pBufferIn[4], + void *pBufferOut[4], + const uint32_t length[4]) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif +#ifdef SAFE_PARAM + unsigned int i; + + /* Check for NULL pointers */ + if (pKey == NULL || pIv == NULL || pBufferIn == NULL || + 
pBufferOut == NULL || length == NULL) + return; + + for (i = 0; i < 4; i++) { + if (pKey[i] == NULL || pIv[i] == NULL || + pBufferIn[i] == NULL || pBufferOut[i] == NULL) + return; + + /* Check input data is in range of supported length */ + if (length[i] < ZUC_MIN_LEN || length[i] > ZUC_MAX_LEN) + return; + } +#endif + + _zuc_eea3_4_buffer_avx(pKey, pIv, pBufferIn, pBufferOut, length); + +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +void zuc_eea3_n_buffer_avx(const void * const pKey[], const void * const pIv[], + const void * const pBufferIn[], void *pBufferOut[], + const uint32_t length[], + const uint32_t numBuffers) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif + + unsigned int i; + unsigned int packetCount = numBuffers; + +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pKey == NULL || pIv == NULL || pBufferIn == NULL || + pBufferOut == NULL || length == NULL) + return; + + for (i = 0; i < numBuffers; i++) { + if (pKey[i] == NULL || pIv[i] == NULL || + pBufferIn[i] == NULL || pBufferOut[i] == NULL) + return; + + /* Check input data is in range of supported length */ + if (length[i] < ZUC_MIN_LEN || length[i] > ZUC_MAX_LEN) + return; + } +#endif + i = 0; + + while(packetCount >= 4) { + packetCount -=4; + _zuc_eea3_4_buffer_avx(&pKey[i], + &pIv[i], + &pBufferIn[i], + &pBufferOut[i], + &length[i]); + i+=4; + } + + while(packetCount--) { + _zuc_eea3_1_buffer_avx(pKey[i], + pIv[i], + pBufferIn[i], + pBufferOut[i], + length[i]); + i++; + } +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} + +static inline uint64_t rotate_left(uint64_t u, size_t r) +{ + return (((u) << (r)) | ((u) >> (64 - (r)))); +} + +static inline uint64_t load_uint64(const void *ptr) +{ + 
return *((const uint64_t *)ptr); +} + +void zuc_eia3_1_buffer_avx(const void *pKey, + const void *pIv, + const void *pBufferIn, + const uint32_t lengthInBits, + uint32_t *pMacI) +{ +#ifndef LINUX + DECLARE_ALIGNED(uint128_t xmm_save[10], 16); + + SAVE_XMMS(xmm_save); +#endif + DECLARE_ALIGNED(ZucState_t zucState, 64); + DECLARE_ALIGNED(uint32_t keyStream[16 * 2], 64); + const uint32_t keyStreamLengthInBits = ZUC_KEYSTR_LEN * 8; + /* generate a key-stream 2 words longer than the input message */ + const uint32_t N = lengthInBits + (2 * ZUC_WORD); + uint32_t L = (N + 31) / ZUC_WORD; + uint32_t *pZuc = (uint32_t *) &keyStream[0]; + uint32_t remainingBits = lengthInBits; + uint32_t T = 0; + const uint8_t *pIn8 = (const uint8_t *) pBufferIn; + +#ifdef SAFE_PARAM + /* Check for NULL pointers */ + if (pKey == NULL || pIv == NULL || pBufferIn == NULL || pMacI == NULL) + return; + + /* Check input data is in range of supported length */ + if (lengthInBits < ZUC_MIN_LEN || lengthInBits > ZUC_MAX_LEN) + return; +#endif + + memset(&zucState, 0, sizeof(ZucState_t)); + + asm_ZucInitialization(pKey, pIv, &(zucState)); + asm_ZucGenKeystream64B(pZuc, &zucState); + + /* loop over the message bits */ + while (remainingBits >= keyStreamLengthInBits) { + remainingBits -= keyStreamLengthInBits; + L -= (keyStreamLengthInBits / 32); + /* Generate the next key stream 8 bytes or 64 bytes */ + if (!remainingBits) + asm_ZucGenKeystream8B(&keyStream[16], &zucState); + else + asm_ZucGenKeystream64B(&keyStream[16], &zucState); + T = asm_Eia3Round64BAVX(T, &keyStream[0], pIn8); + memcpy(&keyStream[0], &keyStream[16], 16 * sizeof(uint32_t)); + pIn8 = &pIn8[ZUC_KEYSTR_LEN]; + } + + /* + * If remaining bits has more than 14 ZUC WORDS (double words), + * keystream needs to have up to another 2 ZUC WORDS (8B) + */ + if (remainingBits > (14 * 32)) + asm_ZucGenKeystream8B(&keyStream[16], &zucState); + T ^= asm_Eia3RemainderAVX(&keyStream[0], pIn8, remainingBits); + T ^= 
rotate_left(load_uint64(&keyStream[remainingBits / 32]), + remainingBits % 32); + + /* save the final MAC-I result */ + uint32_t keyBlock = keyStream[L - 1]; + *pMacI = bswap4(T ^ keyBlock); + +#ifdef SAFE_DATA + /* Clear sensitive data (in registers and stack) */ + clear_mem(keyStream, sizeof(keyStream)); + clear_mem(&zucState, sizeof(zucState)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +#ifndef LINUX + RESTORE_XMMS(xmm_save); +#endif +} |