| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
| commit | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch) | |
| tree | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf | /src/spdk/intel-ipsec-mb/avx |
| parent | Initial commit. (diff) | |
| download | ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip | |
Adding upstream version 14.2.21. (refs: upstream/14.2.21, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx')
46 files changed, 14000 insertions, 0 deletions
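The diff below imports the AVX code path of Intel's intel-ipsec-mb library: AES CBC decrypt, CBC encrypt/CBC-MAC and counter-mode kernels that work "by8", i.e. on up to eight 16-byte blocks per pass. As a reading aid (not part of the patch), the control flow shared by these kernels looks roughly like the C sketch below, where `process_blocks()` is a hypothetical stand-in for one expansion of the `do_aes_load`/`do_aes_noload` macro and the length is assumed block-aligned, as in the CBC decrypt case:

```c
#include <stdint.h>

/* Hypothetical stand-in for one do_aes_load/do_aes_noload expansion. */
void process_blocks(const uint8_t *in, uint8_t *out, unsigned nblocks);

/* Shared "by8" dispatch: handle the 1..7 leftover blocks first, then loop
 * over full 8-block batches (mirrors `and tmp, 7*16` / `and num_bytes, ~7*16`). */
void by8_dispatch(const uint8_t *in, uint8_t *out, uint64_t num_bytes)
{
    uint64_t rem = num_bytes & (7 * 16);     /* 1..7 leftover blocks, if any */
    if (rem) {
        process_blocks(in, out, (unsigned)(rem / 16));
        in  += rem;
        out += rem;
        num_bytes -= rem;
    }
    while (num_bytes) {                      /* now a multiple of 8 blocks   */
        process_blocks(in, out, 8);
        in  += 8 * 16;
        out += 8 * 16;
        num_bytes -= 8 * 16;
    }
}
```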
diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm new file mode 100644 index 00000000..d1fefc96 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_dec_by8_avx.asm @@ -0,0 +1,306 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
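The first file, aes128_cbc_dec_by8_avx.asm, decrypts in CBC mode. Unlike CBC encryption, decryption has no serial dependency between blocks: each plaintext block is the AES decryption of its ciphertext block XORed with the previous ciphertext block (the IV for the first), which is what lets the macro below work on up to eight blocks at once. A minimal per-block reference in C, with `aes128_decrypt_block()` as a hypothetical single-block primitive:

```c
#include <stdint.h>
#include <string.h>

/* Hypothetical single-block AES-128 decrypt using an expanded key schedule. */
void aes128_decrypt_block(const uint8_t in[16], uint8_t out[16],
                          const uint8_t *round_keys);

void cbc_dec_ref(const uint8_t *in, const uint8_t iv[16],
                 const uint8_t *keys, uint8_t *out, uint64_t num_bytes)
{
    uint8_t prev[16], tmp[16];
    memcpy(prev, iv, 16);
    for (uint64_t off = 0; off < num_bytes; off += 16) {
        aes128_decrypt_block(in + off, tmp, keys);
        for (int i = 0; i < 16; i++)
            out[off + i] = tmp[i] ^ prev[i];   /* P[n] = D(C[n]) ^ C[n-1] */
        memcpy(prev, in + off, 16);            /* next chaining value     */
    }
}
```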
+;; + +; routine to do AES128 CBC decrypt "by8" + +;; clobbers xmm0-15 + +%include "os.asm" + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xIV xmm8 +%define xkey0 xmm9 +%define xkey2 xmm10 +%define xkey4 xmm11 +%define xkey6 xmm12 +%define xkey8 xmm13 +%define xkey10 xmm14 +%define xkeytmp xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes rax +%endif + +%define tmp r10 + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + +%assign i 0 +%rep %%by + VMOVDQ CONCAT(xdata,i), [p_in + i*16] +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey2, [p_keys + 2*16] +%endif +%assign i 0 +%rep %%by + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + + vmovdqa xkeytmp, [p_keys + 1*16] +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey4, [p_keys + 4*16] +%endif +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey2 +%assign i (i+1) +%endrep + + vmovdqa xkeytmp, [p_keys + 3*16] +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey6, [p_keys + 6*16] +%endif +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey4 +%assign i (i+1) +%endrep + + vmovdqa xkeytmp, [p_keys + 5*16] +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey8, [p_keys + 8*16] +%endif +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 +%assign i (i+1) +%endrep + + vmovdqa xkeytmp, [p_keys + 7*16] +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey10, [p_keys + 10*16] +%endif +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey8 +%assign i (i+1) +%endrep + + vmovdqa xkeytmp, [p_keys + 9*16] +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeytmp +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkey10 +%assign i (i+1) +%endrep + + vpxor xdata0, xdata0, xIV +%assign i 1 +%if (%%by > 1) +%rep (%%by - 1) + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV +%assign i (i+1) +%endrep +%endif + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text + +;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +MKGLOBAL(aes_cbc_dec_128_avx,function,internal) +aes_cbc_dec_128_avx: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + vmovdqu 
xIV, [p_IV] + + mov tmp, num_bytes + and tmp, 7*16 + jz mult_of_8_blks + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg gt4 + je eq4 + +lt4: + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq2: + do_aes_load 2 + add p_out, 2*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq3: + do_aes_load 3 + add p_out, 3*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq4: + do_aes_load 4 + add p_out, 4*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +gt4: + cmp tmp, 6*16 + jg eq7 + je eq6 + +eq5: + do_aes_load 5 + add p_out, 5*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq6: + do_aes_load 6 + add p_out, 6*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq7: + do_aes_load 7 + add p_out, 7*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +mult_of_8_blks: + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey2, [p_keys + 2*16] + vmovdqa xkey4, [p_keys + 4*16] + vmovdqa xkey6, [p_keys + 6*16] + vmovdqa xkey8, [p_keys + 8*16] + vmovdqa xkey10, [p_keys + 10*16] + +main_loop2: + ; num_bytes is a multiple of 8 and >0 + do_aes_noload 8 + add p_out, 8*16 + sub num_bytes, 8*16 + jne main_loop2 + +do_return2: +; Don't write back IV +; vmovdqu [p_IV], xIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm new file mode 100644 index 00000000..45228dd4 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes128_cbc_mac_x8.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2017-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; Routine to compute CBC-MAC. It is based on 128 bit CBC AES encrypt code. 
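aes128_cbc_mac_x8.asm is only a thin wrapper: the two lines that follow set CBC_MAC and include the CBC encrypt code, because a CBC-MAC tag is simply the final chaining value of a CBC encryption whose intermediate ciphertext blocks are discarded (the CBC_MAC define suppresses the ciphertext write-back later in this excerpt). A reference sketch, with `aes128_encrypt_block()` as a hypothetical single-block primitive:

```c
#include <stdint.h>
#include <string.h>

/* Hypothetical single-block AES-128 encrypt using an expanded key schedule. */
void aes128_encrypt_block(const uint8_t in[16], uint8_t out[16],
                          const uint8_t *round_keys);

void cbc_mac_ref(const uint8_t *msg, uint64_t num_bytes,
                 const uint8_t *keys, const uint8_t iv[16], uint8_t tag[16])
{
    uint8_t state[16];
    memcpy(state, iv, 16);
    for (uint64_t off = 0; off < num_bytes; off += 16) {
        for (int i = 0; i < 16; i++)
            state[i] ^= msg[off + i];             /* XOR message block in      */
        aes128_encrypt_block(state, state, keys); /* chain through the cipher  */
    }
    memcpy(tag, state, 16);                       /* tag = last chaining value */
}
```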
+ +%define CBC_MAC 1 +%include "aes_cbc_enc_128_x8.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm new file mode 100644 index 00000000..ae829325 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes128_cntr_by8_avx.asm @@ -0,0 +1,392 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "memcpy.asm" + +; routine to do AES128 CNTR enc/decrypt "by8" +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level +section .data +default rel + +MKGLOBAL(byteswap_const,data,internal) +MKGLOBAL(ddq_add_1,data,internal) +MKGLOBAL(ddq_add_2,data,internal) +MKGLOBAL(ddq_add_3,data,internal) +MKGLOBAL(ddq_add_4,data,internal) +MKGLOBAL(ddq_add_5,data,internal) +MKGLOBAL(ddq_add_6,data,internal) +MKGLOBAL(ddq_add_7,data,internal) +MKGLOBAL(ddq_add_8,data,internal) + +align 16 +byteswap_const: ;DDQ 0x000102030405060708090A0B0C0D0E0F + DQ 0x08090A0B0C0D0E0F, 0x0001020304050607 +ddq_add_1: ;DDQ 0x00000000000000000000000000000001 + DQ 0x0000000000000001, 0x0000000000000000 +ddq_add_2: ;DDQ 0x00000000000000000000000000000002 + DQ 0x0000000000000002, 0x0000000000000000 +ddq_add_3: ;DDQ 0x00000000000000000000000000000003 + DQ 0x0000000000000003, 0x0000000000000000 +ddq_add_4: ;DDQ 0x00000000000000000000000000000004 + DQ 0x0000000000000004, 0x0000000000000000 +ddq_add_5: ;DDQ 0x00000000000000000000000000000005 + DQ 0x0000000000000005, 0x0000000000000000 +ddq_add_6: ;DDQ 0x00000000000000000000000000000006 + DQ 0x0000000000000006, 0x0000000000000000 +ddq_add_7: ;DDQ 0x00000000000000000000000000000007 + DQ 0x0000000000000007, 0x0000000000000000 +ddq_add_8: ;DDQ 0x00000000000000000000000000000008 + DQ 0x0000000000000008, 0x0000000000000000 + +section .text + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xbyteswap xmm9 +%define xkey0 xmm10 +%define xkey3 xmm11 +%define xkey6 xmm12 +%define xkey9 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define p_ivlen r9 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define p_ivlen qword [rsp + 8*6] +%endif + +%define tmp r11 +%define p_tmp rsp + _buffer + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + + vpshufb xdata0, xcounter, xbyteswap +%assign i 1 +%rep (%%by - 1) + vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)] + vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + vmovdqa xkeyA, [p_keys + 1*16] + + vpxor xdata0, xkey0 + vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] +%assign i 1 +%rep (%%by - 1) + vpxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + vmovdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey3, [p_keys + 3*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + + vmovdqa xkeyB, [p_keys + 4*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey3 ; key 3 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 4 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey6, [p_keys + 6*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 
7*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey6 ; key 6 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 8*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey9, [p_keys + 9*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 8 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey9 ; key 9 +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + VMOVDQ xkeyB, [p_in + j*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA + vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%endif + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +struc STACK +_buffer: resq 2 +_rsp_save: resq 1 +endstruc + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +;; aes_cntr_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +align 32 +MKGLOBAL(aes_cntr_128_avx,function,internal) +aes_cntr_128_avx: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] ; arg5 +%endif + + vmovdqa xbyteswap, [rel byteswap_const] + test p_ivlen, 16 + jnz iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + vpinsrq xcounter, [p_IV], 0 + vpinsrd xcounter, [p_IV + 8], 2 + vpinsrd xcounter, DWORD(tmp), 3 +bswap_iv: + vpshufb xcounter, xbyteswap + + mov tmp, num_bytes + and tmp, 7*16 + jz chk ; x8 > or < 15 (not 7 lines) + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg gt4 + je eq4 + +lt4: + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + jmp chk + +eq2: + do_aes_load 2 + add p_out, 2*16 + jmp chk + +eq3: + do_aes_load 3 + add p_out, 3*16 + jmp chk + +eq4: + do_aes_load 4 + add p_out, 4*16 + jmp chk + +gt4: + cmp tmp, 6*16 + jg eq7 + je eq6 + +eq5: + do_aes_load 5 + add p_out, 5*16 + jmp chk + +eq6: + do_aes_load 6 + add p_out, 6*16 + jmp chk + +eq7: + do_aes_load 7 + add p_out, 7*16 + ; fall through to chk +chk: + and num_bytes, ~(7*16) + jz do_return2 + + cmp num_bytes, 16 + jb last + + ; process multiples of 8 blocks + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey3, [p_keys + 3*16] + vmovdqa xkey6, [p_keys + 6*16] + vmovdqa xkey9, [p_keys + 9*16] + jmp main_loop2 + +align 32 +main_loop2: + ; num_bytes is a multiple of 8 blocks + partial bytes + do_aes_noload 8 + add p_out, 8*16 + sub num_bytes, 8*16 + cmp num_bytes, 8*16 + jae main_loop2 + + test num_bytes, 15 ; partial bytes to be processed? 
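When the length is not a multiple of 16, the counter-mode code falls through to `last`, which stages the tail in a 16-byte aligned scratch buffer on the stack, encrypts one counter block, XORs, and copies only the remaining `num_bytes` (now < 16) back out. Roughly, in C, with `aes128_encrypt_block()` again hypothetical:

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical single-block AES-128 encrypt using an expanded key schedule. */
void aes128_encrypt_block(const uint8_t in[16], uint8_t out[16],
                          const uint8_t *round_keys);

/* Sketch of the `last` path below: tail_len is 1..15 bytes. */
void ctr_partial_tail(const uint8_t *in, uint8_t *out, size_t tail_len,
                      const uint8_t counter_block[16], const uint8_t *keys)
{
    uint8_t scratch[16] = {0}, keystream[16];

    memcpy(scratch, in, tail_len);                    /* stage input in scratch  */
    aes128_encrypt_block(counter_block, keystream, keys);
    for (size_t i = 0; i < 16; i++)
        scratch[i] ^= keystream[i];                   /* XOR keystream           */
    memcpy(out, scratch, tail_len);                   /* copy only tail_len back */
}
```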
+ jnz last + +do_return2: +; don't return updated IV +; vpshufb xcounter, xcounter, xbyteswap +; vmovdqu [p_IV], xcounter + ret + +last: + ;; Code dealing with the partial block cases + ; reserve 16 byte aligned buffer on stack + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax ; save SP + + ; copy input bytes into scratch buffer + memcpy_avx_16_1 p_tmp, p_in, num_bytes, tmp, rax + ; Encryption of a single partial block (p_tmp) + vpshufb xcounter, xbyteswap + vmovdqa xdata0, xcounter + vpxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 9 + vaesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + vaesenclast xdata0, [p_keys + 16*i] + ; xor keystream with the message (scratch) + vpxor xdata0, [p_tmp] + vmovdqa [p_tmp], xdata0 + ; copy result into the output buffer + memcpy_avx_16_1 p_out, p_tmp, num_bytes, tmp, rax + ; remove the stack frame + mov rsp, [rsp + _rsp_save] ; original SP + jmp do_return2 + +iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + vmovdqu xcounter, [p_IV] + jmp bswap_iv + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm new file mode 100644 index 00000000..2d0bb204 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes192_cbc_dec_by8_avx.asm @@ -0,0 +1,328 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES192 CBC decrypt "by8" + +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level +%include "os.asm" + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xIV xmm8 +%define xkey0 xmm9 +%define xkey3 xmm10 +%define xkey6 xmm11 +%define xkey9 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes rax +%endif + +%define tmp r10 + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + +%assign i 0 +%rep %%by + VMOVDQ CONCAT(xdata,i), [p_in + i*16] +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 1*16] + +%assign i 0 +%rep %%by + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 2*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + vmovdqa xkey3, [p_keys + 3*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 4*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey3 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 5*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey6, [p_keys + 6*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 7*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 8*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey9, [p_keys + 9*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 10*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey9 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 11*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey12, [p_keys + 12*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkey12 +%assign i (i+1) +%endrep + + vpxor xdata0, xdata0, xIV +%assign i 1 +%if (%%by > 1) +%rep (%%by - 1) + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV +%assign i (i+1) +%endrep +%endif + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text + 
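The CBC decrypt entry points all share the five-argument prototype shown in the comment that follows. Under the System V ABI the arguments arrive in rdi/rsi/rdx/rcx/r8; the Windows ABI passes the first four in rcx/rdx/r8/r9 and num_bytes on the stack, hence the `mov num_bytes, [rsp + 8*5]` under `%ifndef LINUX`. In C terms (assuming the library's UINT64 is a plain 64-bit unsigned integer):

```c
#include <stdint.h>

typedef uint64_t UINT64;  /* assumption: matches the library's UINT64 typedef */

void aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes);
void aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes);
void aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes);
```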
+;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +MKGLOBAL(aes_cbc_dec_192_avx,function,internal) +aes_cbc_dec_192_avx: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + vmovdqu xIV, [p_IV] + + mov tmp, num_bytes + and tmp, 7*16 + jz mult_of_8_blks + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg gt4 + je eq4 + +lt4: + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq2: + do_aes_load 2 + add p_out, 2*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq3: + do_aes_load 3 + add p_out, 3*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq4: + do_aes_load 4 + add p_out, 4*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +gt4: + cmp tmp, 6*16 + jg eq7 + je eq6 + +eq5: + do_aes_load 5 + add p_out, 5*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq6: + do_aes_load 6 + add p_out, 6*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq7: + do_aes_load 7 + add p_out, 7*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +mult_of_8_blks: + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey3, [p_keys + 3*16] + vmovdqa xkey6, [p_keys + 6*16] + vmovdqa xkey9, [p_keys + 9*16] + vmovdqa xkey12, [p_keys + 12*16] + +main_loop2: + ; num_bytes is a multiple of 8 and >0 + do_aes_noload 8 + add p_out, 8*16 + sub num_bytes, 8*16 + jne main_loop2 + +do_return2: +; Don't write back IV +; vmovdqu [p_IV], xIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm new file mode 100644 index 00000000..436ec01e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes192_cntr_by8_avx.asm @@ -0,0 +1,378 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "memcpy.asm" + +; routine to do AES192 CNTR enc/decrypt "by8" +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level + +extern byteswap_const +extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 +extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xbyteswap xmm9 +%define xkey0 xmm10 +%define xkey4 xmm11 +%define xkey8 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define p_ivlen r9 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define p_ivlen qword [rsp + 8*6] +%endif + +%define tmp r11 +%define p_tmp rsp + _buffer + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + + vpshufb xdata0, xcounter, xbyteswap +%assign i 1 +%rep (%%by - 1) + vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)] + vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + vmovdqa xkeyA, [p_keys + 1*16] + + vpxor xdata0, xkey0 + vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] +%assign i 1 +%rep (%%by - 1) + vpxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + vmovdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 3*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + vmovdqa xkey4, [p_keys + 4*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 3 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 6*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 6 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey8, [p_keys + 8*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 9*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 9 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 11*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey12, [p_keys + 12*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 11 +%assign i (i+1) +%endrep + + +%assign i 0 +%rep %%by + vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; key 12 +%assign i (i+1) +%endrep + + +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + VMOVDQ xkeyB, [p_in + j*16 - 16*%%by] + vpxor 
CONCAT(xdata,i), CONCAT(xdata,i), xkeyA + vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%endif + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +struc STACK +_buffer: resq 2 +_rsp_save: resq 1 +endstruc + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text + +;; aes_cntr_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) +align 32 +MKGLOBAL(aes_cntr_192_avx,function,internal) +aes_cntr_192_avx: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + vmovdqa xbyteswap, [rel byteswap_const] + test p_ivlen, 16 + jnz iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + vpinsrq xcounter, [p_IV], 0 + vpinsrd xcounter, [p_IV + 8], 2 + vpinsrd xcounter, DWORD(tmp), 3 +bswap_iv: + vpshufb xcounter, xbyteswap + + mov tmp, num_bytes + and tmp, 7*16 + jz chk ; x8 > or < 15 (not 7 lines) + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg gt4 + je eq4 + +lt4: + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + jmp chk + +eq2: + do_aes_load 2 + add p_out, 2*16 + jmp chk + +eq3: + do_aes_load 3 + add p_out, 3*16 + jmp chk + +eq4: + do_aes_load 4 + add p_out, 4*16 + jmp chk + +gt4: + cmp tmp, 6*16 + jg eq7 + je eq6 + +eq5: + do_aes_load 5 + add p_out, 5*16 + jmp chk + +eq6: + do_aes_load 6 + add p_out, 6*16 + jmp chk + +eq7: + do_aes_load 7 + add p_out, 7*16 + ; fall through to chk +chk: + and num_bytes, ~(7*16) + jz do_return2 + + cmp num_bytes, 16 + jb last + + ; process multiples of 8 blocks + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey4, [p_keys + 4*16] + vmovdqa xkey8, [p_keys + 8*16] + vmovdqa xkey12, [p_keys + 12*16] + jmp main_loop2 + +align 32 +main_loop2: + ; num_bytes is a multiple of 8 blocks + partial bytes + do_aes_noload 8 + add p_out, 8*16 + sub num_bytes, 8*16 + cmp num_bytes, 8*16 + jae main_loop2 + + test num_bytes, 15 ; partial bytes to be processed? 
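All three counter-mode files build their initial counter block the same way (the `vpinsrq`/`vpinsrd` sequence before `bswap_iv` earlier in this routine): if iv_len is not 16, a 12-byte nonce + ESP IV is padded with a 32-bit big-endian block counter starting at 1; otherwise the full 16-byte IV is loaded as-is. In memory-order terms:

```c
#include <stdint.h>
#include <string.h>

/* Sketch of the initial counter-block layout used when iv_len != 16:
 * 12-byte nonce+IV followed by a big-endian 32-bit block counter of 1. */
void build_counter_block(const uint8_t iv[12], uint8_t counter_block[16])
{
    memcpy(counter_block, iv, 12);     /* nonce + ESP IV                    */
    counter_block[12] = 0x00;          /* 32-bit block counter, big endian, */
    counter_block[13] = 0x00;          /* starting at 1                     */
    counter_block[14] = 0x00;
    counter_block[15] = 0x01;
}
```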
+ jnz last +do_return2: +; don't return updated IV +; vpshufb xcounter, xcounter, xbyteswap +; vmovdqu [p_IV], xcounter + ret + +last: + ;; Code dealing with the partial block cases + ; reserve 16 byte aligned buffer on stack + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax ; save SP + + ; copy input bytes into scratch buffer + memcpy_avx_16_1 p_tmp, p_in, num_bytes, tmp, rax + ; Encryption of a single partial block (p_tmp) + vpshufb xcounter, xbyteswap + vmovdqa xdata0, xcounter + vpxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 11 + vaesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + vaesenclast xdata0, [p_keys + 16*i] + ; xor keystream with the message (scratch) + vpxor xdata0, [p_tmp] + vmovdqa [p_tmp], xdata0 + ; copy result into the output buffer + memcpy_avx_16_1 p_out, p_tmp, num_bytes, tmp, rax + ; remove the stack frame + mov rsp, [rsp + _rsp_save] ; original SP + jmp do_return2 + +iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + vmovdqu xcounter, [p_IV] + jmp bswap_iv + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm new file mode 100644 index 00000000..6bcd164d --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm @@ -0,0 +1,344 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES256 CBC decrypt "by8" + +; XMM registers are clobbered. 
Saving/restoring must be done at a higher level +%include "os.asm" + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xIV xmm8 +%define xkey0 xmm9 +%define xkey3 xmm10 +%define xkey6 xmm11 +%define xkey9 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes rax +%endif + +%define tmp r10 + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + +%assign i 0 +%rep %%by + VMOVDQ CONCAT(xdata,i), [p_in + i*16] +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 1*16] + +%assign i 0 +%rep %%by + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkey0 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 2*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + vmovdqa xkey3, [p_keys + 3*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 4*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey3 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 5*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey6, [p_keys + 6*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 7*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey6 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 8*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey9, [p_keys + 9*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 10*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey9 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 11*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey12, [p_keys + 12*16] +%endif + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 13*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkey12 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 14*16] + +%assign i 0 +%rep %%by + vaesdec CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + vaesdeclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB +%assign i (i+1) +%endrep + + vpxor xdata0, xdata0, xIV +%assign i 1 +%if (%%by > 1) +%rep (%%by - 1) + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xIV +%assign i (i+1) +%endrep +%endif + VMOVDQ xIV, [p_in + (i-1)*16 - 16*%%by] + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) 
+%assign i (i+1) +%endrep +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text + +;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes) +MKGLOBAL(aes_cbc_dec_256_avx,function,internal) +aes_cbc_dec_256_avx: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + vmovdqu xIV, [p_IV] + + mov tmp, num_bytes + and tmp, 7*16 + jz mult_of_8_blks + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg gt4 + je eq4 + +lt4: + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq2: + do_aes_load 2 + add p_out, 2*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq3: + do_aes_load 3 + add p_out, 3*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq4: + do_aes_load 4 + add p_out, 4*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +gt4: + cmp tmp, 6*16 + jg eq7 + je eq6 + +eq5: + do_aes_load 5 + add p_out, 5*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq6: + do_aes_load 6 + add p_out, 6*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq7: + do_aes_load 7 + add p_out, 7*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +mult_of_8_blks: + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey3, [p_keys + 3*16] + vmovdqa xkey6, [p_keys + 6*16] + vmovdqa xkey9, [p_keys + 9*16] + vmovdqa xkey12, [p_keys + 12*16] + +main_loop2: + ; num_bytes is a multiple of 8 and >0 + do_aes_noload 8 + add p_out, 8*16 + sub num_bytes, 8*16 + jne main_loop2 + +do_return2: +; Don't write back IV +; vmovdqu [p_IV], xIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm new file mode 100644 index 00000000..80c30f4b --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes256_cntr_by8_avx.asm @@ -0,0 +1,391 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "memcpy.asm" + +; routine to do AES256 CNTR enc/decrypt "by8" +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +extern byteswap_const +extern ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 +extern ddq_add_5, ddq_add_6, ddq_add_7, ddq_add_8 + +%define CONCAT(a,b) a %+ b +%define VMOVDQ vmovdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xbyteswap xmm9 +%define xkey0 xmm10 +%define xkey4 xmm11 +%define xkey8 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define p_ivlen r9 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define p_ivlen qword [rsp + 8*6] +%endif + +%define tmp r11 +%define p_tmp rsp + _buffer + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + vmovdqa xkey0, [p_keys + 0*16] +%endif + + vpshufb xdata0, xcounter, xbyteswap +%assign i 1 +%rep (%%by - 1) + vpaddd CONCAT(xdata,i), xcounter, [rel CONCAT(ddq_add_,i)] + vpshufb CONCAT(xdata,i), CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + vmovdqa xkeyA, [p_keys + 1*16] + + vpxor xdata0, xkey0 + vpaddd xcounter, xcounter, [rel CONCAT(ddq_add_,%%by)] +%assign i 1 +%rep (%%by - 1) + vpxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + vmovdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 3*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + vmovdqa xkey4, [p_keys + 4*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 3 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey4 ; key 4 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 6*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 6 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey8, [p_keys + 8*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 9*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey8 ; key 8 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep 
%%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 9 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 11*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%if (%%load_keys) + vmovdqa xkey12, [p_keys + 12*16] +%endif +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 11 +%assign i (i+1) +%endrep + + vmovdqa xkeyA, [p_keys + 13*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkey12 ; key 12 +%assign i (i+1) +%endrep + + vmovdqa xkeyB, [p_keys + 14*16] +%assign i 0 +%rep %%by + vaesenc CONCAT(xdata,i), CONCAT(xdata,i), xkeyA ; key 13 +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + vaesenclast CONCAT(xdata,i), CONCAT(xdata,i), xkeyB ; key 14 +%assign i (i+1) +%endrep + +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + VMOVDQ xkeyB, [p_in + j*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA + vpxor CONCAT(xdata,j), CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + VMOVDQ xkeyA, [p_in + i*16 - 16*%%by] + vpxor CONCAT(xdata,i), CONCAT(xdata,i), xkeyA +%endif + +%assign i 0 +%rep %%by + VMOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +struc STACK +_buffer: resq 2 +_rsp_save: resq 1 +endstruc + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +section .text + +;; aes_cntr_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) +align 32 +MKGLOBAL(aes_cntr_256_avx,function,internal) +aes_cntr_256_avx: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + vmovdqa xbyteswap, [rel byteswap_const] + test p_ivlen, 16 + jnz iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + vpinsrq xcounter, [p_IV], 0 + vpinsrd xcounter, [p_IV + 8], 2 + vpinsrd xcounter, DWORD(tmp), 3 +bswap_iv: + vpshufb xcounter, xbyteswap + + mov tmp, num_bytes + and tmp, 7*16 + jz chk ; x8 > or < 15 (not 7 lines) + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg gt4 + je eq4 + +lt4: + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + jmp chk + +eq2: + do_aes_load 2 + add p_out, 2*16 + jmp chk + +eq3: + do_aes_load 3 + add p_out, 3*16 + jmp chk + +eq4: + do_aes_load 4 + add p_out, 4*16 + jmp chk + +gt4: + cmp tmp, 6*16 + jg eq7 + je eq6 + +eq5: + do_aes_load 5 + add p_out, 5*16 + jmp chk + +eq6: + do_aes_load 6 + add p_out, 6*16 + jmp chk + +eq7: + do_aes_load 7 + add p_out, 7*16 + ; fall through to chk +chk: + and num_bytes, ~(7*16) + jz do_return2 + + cmp num_bytes, 16 + jb last + + ; process multiples of 8 blocks + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey4, [p_keys + 4*16] + vmovdqa xkey8, [p_keys + 8*16] + vmovdqa xkey12, [p_keys + 12*16] + jmp main_loop2 + +align 32 +main_loop2: + ; num_bytes is a multiple of 8 and >0 + do_aes_noload 8 + add p_out, 8*16 + sub num_bytes, 8*16 + cmp num_bytes, 8*16 + jae main_loop2 + + test num_bytes, 15 ; partial bytes to be processed? 
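Apart from which round keys are cached in xmm registers, the 128-, 192- and 256-bit counter variants differ only in round count (10, 12 or 14), which is why their single-block `last` paths use `%rep 9`, `%rep 11` and `%rep 13` middle rounds before `vaesenclast`. A generic single-block keystream sketch, with `aes_round()`/`aes_last_round()` as hypothetical stand-ins for vaesenc/vaesenclast:

```c
#include <stdint.h>

/* Hypothetical stand-ins for one vaesenc / vaesenclast step. */
void aes_round(uint8_t state[16], const uint8_t round_key[16]);
void aes_last_round(uint8_t state[16], const uint8_t round_key[16]);

void ctr_keystream_block(uint8_t block[16], const uint8_t *round_keys,
                         int nrounds /* 10, 12 or 14 */)
{
    for (int i = 0; i < 16; i++)
        block[i] ^= round_keys[i];                    /* round 0: AddRoundKey */
    for (int r = 1; r < nrounds; r++)
        aes_round(block, round_keys + 16 * r);        /* rounds 1..nrounds-1  */
    aes_last_round(block, round_keys + 16 * nrounds); /* final round          */
}
```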
+ jnz last + +do_return2: +; don't return updated IV +; vpshufb xcounter, xcounter, xbyteswap +; vmovdqu [p_IV], xcounter + ret + +last: + ;; Code dealing with the partial block cases + ; reserve 16 byte aligned buffer on stack + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax ; save SP + + ; copy input bytes into scratch buffer + memcpy_avx_16_1 p_tmp, p_in, num_bytes, tmp, rax + ; Encryption of a single partial block (p_tmp) + vpshufb xcounter, xbyteswap + vmovdqa xdata0, xcounter + vpxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 13 + vaesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + vaesenclast xdata0, [p_keys + 16*i] + ; xor keystream with the message (scratch) + vpxor xdata0, [p_tmp] + vmovdqa [p_tmp], xdata0 + ; copy result into the output buffer + memcpy_avx_16_1 p_out, p_tmp, num_bytes, tmp, rax + ; remove the stack frame + mov rsp, [rsp + _rsp_save] ; original SP + jmp do_return2 + +iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + vmovdqu xcounter, [p_IV] + jmp bswap_iv + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm new file mode 100644 index 00000000..4c3ba209 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_128_x8.asm @@ -0,0 +1,494 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
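The last file in this excerpt, aes_cbc_enc_128_x8.asm, has a different structure: CBC encryption is serial within a single buffer, so instead of eight blocks of one stream it interleaves one block from each of eight independent buffers per iteration, driven by an AES_ARGS_x8 structure of per-lane in/out/key/IV pointers described in the file's header comment below. A C-level sketch of that interface, based only on that comment (field layout and alignment in the real mb_mgr_datastruct.asm may differ):

```c
#include <stdint.h>

typedef uint64_t UINT64;                         /* assumed 64-bit unsigned  */
typedef struct { uint8_t b[16]; } UINT128;       /* assumed 16-byte value    */

typedef struct {
    void    *in[8];    /* per-lane input pointers           */
    void    *out[8];   /* per-lane output pointers          */
    UINT128 *keys[8];  /* per-lane expanded key schedules   */
    UINT128  IV[8];    /* per-lane chaining values (IVs)    */
} AES_ARGS_x8;

void aes_cbc_enc_128_x8(AES_ARGS_x8 *args, UINT64 len_in_bytes);
```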
+;; + +;;; routine to do a 128 bit CBC AES encrypt and CBC MAC + +;; clobbers all registers except for ARG1 and rbp + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +%macro VPXOR2 2 + vpxor %1, %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS_x8 { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_128_x8(AES_ARGS_x8 *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS_x8 structure +;; arg 2: LEN : len (in units of bytes) + +struc STACK +_gpr_save: resq 8 +_len: resq 1 +endstruc + +%define GPR_SAVE_AREA rsp + _gpr_save +%define LEN_AREA rsp + _len + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rcx +%define arg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 rdi +%define arg4 rsi +%endif + +%define ARG arg1 +%define LEN arg2 + +%define IDX rax +%define TMP rbx + +%define KEYS0 arg3 +%define KEYS1 arg4 +%define KEYS2 rbp +%define KEYS3 r8 +%define KEYS4 r9 +%define KEYS5 r10 +%define KEYS6 r11 +%define KEYS7 r12 + +%define IN0 r13 +%define IN2 r14 +%define IN4 r15 +%define IN6 LEN + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XDATA4 xmm4 +%define XDATA5 xmm5 +%define XDATA6 xmm6 +%define XDATA7 xmm7 + +%define XKEY0_3 xmm8 +%define XKEY1_4 xmm9 +%define XKEY2_5 xmm10 +%define XKEY3_6 xmm11 +%define XKEY4_7 xmm12 +%define XKEY5_8 xmm13 +%define XKEY6_9 xmm14 +%define XTMP xmm15 + +section .text +%ifdef CBC_MAC +MKGLOBAL(aes128_cbc_mac_x8,function,internal) +aes128_cbc_mac_x8: +%else +MKGLOBAL(aes_cbc_enc_128_x8,function,internal) +aes_cbc_enc_128_x8: +%endif + sub rsp, STACK_size + mov [GPR_SAVE_AREA + 8*0], rbp +%ifdef CBC_MAC + mov [GPR_SAVE_AREA + 8*1], rbx + mov [GPR_SAVE_AREA + 8*2], r12 + mov [GPR_SAVE_AREA + 8*3], r13 + mov [GPR_SAVE_AREA + 8*4], r14 + mov [GPR_SAVE_AREA + 8*5], r15 +%ifndef LINUX + mov [GPR_SAVE_AREA + 8*6], rsi + mov [GPR_SAVE_AREA + 8*7], rdi +%endif +%endif + + mov IDX, 16 + mov [LEN_AREA], LEN + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN4, [ARG + _aesarg_in + 8*4] + mov IN6, [ARG + _aesarg_in + 8*6] + + mov TMP, [ARG + _aesarg_in + 8*1] + VMOVDQ XDATA0, [IN0] ; load first block of plain text + VMOVDQ XDATA1, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VMOVDQ XDATA2, [IN2] ; load first block of plain text + VMOVDQ XDATA3, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VMOVDQ XDATA4, [IN4] ; load first block of plain text + VMOVDQ XDATA5, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*7] + VMOVDQ XDATA6, [IN6] ; load first block of plain text + VMOVDQ XDATA7, [TMP] ; load first block of plain text + + + VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV + VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV + VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV + VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + 
_aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + mov KEYS4, [ARG + _aesarg_keys + 8*4] + mov KEYS5, [ARG + _aesarg_keys + 8*5] + mov KEYS6, [ARG + _aesarg_keys + 8*6] + mov KEYS7, [ARG + _aesarg_keys + 8*7] + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key + vaesenc XDATA5, XKEY5_8 ; 8. 
ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC + +%ifndef CBC_MAC + VMOVDQ [TMP], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP], XDATA7 ; write back ciphertext +%endif + cmp [LEN_AREA], IDX + je done + +main_loop: + mov TMP, [ARG + _aesarg_in + 8*1] + VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text + VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text + VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text + VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*7] + VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text + VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. 
ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. 
ENC + +%ifndef CBC_MAC + ;; no ciphertext write back for CBC-MAC + VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext +%endif + add IDX, 16 + cmp [LEN_AREA], IDX + jne main_loop + +done: + ;; update IV for AES128-CBC / store digest for CBC-MAC + vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0 + vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1 + vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2 + vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3 + vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4 + vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5 + vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6 + vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7 + + ;; update IN and OUT + vmovd xmm0, [LEN_AREA] + vpshufd xmm0, xmm0, 0x44 + vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0] + vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1] + vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2] + vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3] + vmovdqa [ARG + _aesarg_in + 16*0], xmm1 + vmovdqa [ARG + _aesarg_in + 16*1], xmm2 + vmovdqa [ARG + _aesarg_in + 16*2], xmm3 + vmovdqa [ARG + _aesarg_in + 16*3], xmm4 +%ifndef CBC_MAC + vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0] + vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1] + vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2] + vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3] + vmovdqa [ARG + _aesarg_out + 16*0], xmm5 + vmovdqa [ARG + _aesarg_out + 16*1], xmm6 + vmovdqa [ARG + _aesarg_out + 16*2], xmm7 + vmovdqa [ARG + _aesarg_out + 16*3], xmm8 +%endif + + ;; XMMs are saved at a higher level + mov rbp, [GPR_SAVE_AREA + 8*0] +%ifdef CBC_MAC + mov rbx, [GPR_SAVE_AREA + 8*1] + mov r12, [GPR_SAVE_AREA + 8*2] + mov r13, [GPR_SAVE_AREA + 8*3] + mov r14, [GPR_SAVE_AREA + 8*4] + mov r15, [GPR_SAVE_AREA + 8*5] +%ifndef LINUX + mov rsi, [GPR_SAVE_AREA + 8*6] + mov rdi, [GPR_SAVE_AREA + 8*7] +%endif +%endif + + add rsp, STACK_size + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm new file mode 100644 index 00000000..c17f6d6a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_192_x8.asm @@ -0,0 +1,501 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
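For reference, the multi-buffer pattern implemented by the aes_cbc_enc_128_x8 / aes128_cbc_mac_x8 routine above (and by the 192- and 256-bit variants that follow, which differ mainly in the number of AES rounds) can be modelled in a few lines of C. The sketch below is illustrative only: the args_x8 struct is a hypothetical stand-in for the AES_ARGS_x8 layout described in the comments, OpenSSL's AES_encrypt replaces the AES-NI round sequence (the real routine reads its expanded round keys directly from the per-lane keys[] pointers), and all eight buffers are assumed to share the same length, a multiple of 16 bytes.

#include <stdint.h>
#include <string.h>
#include <openssl/aes.h>

#define LANES 8

/* Hypothetical mirror of the AES_ARGS_x8 layout described in the comments. */
struct args_x8 {
    const uint8_t *in[LANES];    /* plaintext pointer per lane            */
    uint8_t *out[LANES];         /* ciphertext pointer per lane           */
    const AES_KEY *keys[LANES];  /* key schedule per lane (OpenSSL model) */
    uint8_t iv[LANES][16];       /* running CBC state per lane            */
};

/* Functional model of aes_cbc_enc_128_x8(): encrypt len bytes (a multiple
 * of 16) from each of the 8 lanes, chaining within each lane only. */
static void cbc_enc_128_x8_model(struct args_x8 *a, uint64_t len)
{
    for (uint64_t off = 0; off < len; off += 16) {
        for (int lane = 0; lane < LANES; lane++) {
            uint8_t blk[16];

            for (int i = 0; i < 16; i++)              /* plaintext XOR IV / prev C */
                blk[i] = a->in[lane][off + i] ^ a->iv[lane][i];
            AES_encrypt(blk, a->iv[lane], a->keys[lane]);  /* rounds 0..10 */
            memcpy(&a->out[lane][off], a->iv[lane], 16);   /* CBC-MAC skips this store */
        }
    }
    /* equivalent of the done: label - the IV slots already hold the last
     * ciphertext block (or the CBC-MAC digest); advance in/out pointers */
    for (int lane = 0; lane < LANES; lane++) {
        a->in[lane] += len;
        a->out[lane] += len;
    }
}

In this model each keys[lane] would be prepared with AES_set_encrypt_key() before the call. The assembly keeps all eight lanes resident in XDATA0-XDATA7 and interleaves their AES rounds instruction by instruction, so the eight independent CBC chains hide the vaesenc latency; the model above makes the same per-lane data flow explicit but runs the lanes one after another.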
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; routine to do a 192 bit CBC AES encrypt + +;; clobbers all registers except for ARG1 and rbp + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +%macro VPXOR2 2 + vpxor %1, %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS_x8 { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_192_x8(AES_ARGS_x8 *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS_x8 structure +;; arg 2: LEN : len (in units of bytes) + +struc STACK +_gpr_save: resq 1 +_len: resq 1 +endstruc + +%define GPR_SAVE_AREA rsp + _gpr_save +%define LEN_AREA rsp + _len + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax +%define TMP rbx + +%define KEYS0 REG3 +%define KEYS1 REG4 +%define KEYS2 rbp +%define KEYS3 r8 +%define KEYS4 r9 +%define KEYS5 r10 +%define KEYS6 r11 +%define KEYS7 r12 + +%define IN0 r13 +%define IN2 r14 +%define IN4 r15 +%define IN6 LEN + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XDATA4 xmm4 +%define XDATA5 xmm5 +%define XDATA6 xmm6 +%define XDATA7 xmm7 + +%define XKEY0_3 xmm8 +%define XKEY1_4 xmm9 +%define XKEY2_5 xmm10 +%define XKEY3_6 xmm11 +%define XKEY4_7 xmm12 +%define XKEY5_8 xmm13 +%define XKEY6_9 xmm14 +%define XTMP xmm15 + +section .text + +MKGLOBAL(aes_cbc_enc_192_x8,function,internal) +aes_cbc_enc_192_x8: + + sub rsp, STACK_size + mov [GPR_SAVE_AREA + 8*0], rbp + + mov IDX, 16 + mov [LEN_AREA], LEN + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN4, [ARG + _aesarg_in + 8*4] + mov IN6, [ARG + _aesarg_in + 8*6] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov TMP, [ARG + _aesarg_in + 8*1] + VMOVDQ XDATA0, [IN0] ; load first block of plain text + VMOVDQ XDATA1, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VMOVDQ XDATA2, [IN2] ; load first block of plain text + VMOVDQ XDATA3, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VMOVDQ XDATA4, [IN4] ; load first block of plain text + VMOVDQ XDATA5, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*7] + VMOVDQ XDATA6, [IN6] ; load first block of plain text + VMOVDQ XDATA7, [TMP] ; load first block of plain text + + + VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR 
IV + VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV + VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV + VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV + VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + mov KEYS4, [ARG + _aesarg_keys + 8*4] + mov KEYS5, [ARG + _aesarg_keys + 8*5] + mov KEYS6, [ARG + _aesarg_keys + 8*6] + mov KEYS7, [ARG + _aesarg_keys + 8*7] + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. 
ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC + + vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC + vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC + vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC + vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC + + + vaesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC + vaesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC + vaesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC + vaesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC + vaesenclast XDATA4, [KEYS4 + 16*12] ; 12. ENC + vaesenclast XDATA5, [KEYS5 + 16*12] ; 12. ENC + vaesenclast XDATA6, [KEYS6 + 16*12] ; 12. ENC + vaesenclast XDATA7, [KEYS7 + 16*12] ; 12. ENC + + VMOVDQ [TMP], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP], XDATA7 ; write back ciphertext + + cmp [LEN_AREA], IDX + je done + +main_loop: + mov TMP, [ARG + _aesarg_in + 8*1] + VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text + VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text + VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text + VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*7] + VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text + VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text + + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. 
ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC + + vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + vaesenc XDATA1, [KEYS1 + 16*11] ; 11. 
ENC + vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC + vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC + vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC + vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC + + vaesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC + vaesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC + vaesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC + vaesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC + vaesenclast XDATA4, [KEYS4 + 16*12] ; 12. ENC + vaesenclast XDATA5, [KEYS5 + 16*12] ; 12. ENC + vaesenclast XDATA6, [KEYS6 + 16*12] ; 12. ENC + vaesenclast XDATA7, [KEYS7 + 16*12] ; 12. ENC + + + VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext + + add IDX, 16 + cmp [LEN_AREA], IDX + jne main_loop + +done: + ;; update IV + vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0 + vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1 + vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2 + vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3 + vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4 + vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5 + vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6 + vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7 + + ;; update IN and OUT + vmovd xmm0, [LEN_AREA] + vpshufd xmm0, xmm0, 0x44 + vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0] + vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1] + vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2] + vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3] + vmovdqa [ARG + _aesarg_in + 16*0], xmm1 + vmovdqa [ARG + _aesarg_in + 16*1], xmm2 + vmovdqa [ARG + _aesarg_in + 16*2], xmm3 + vmovdqa [ARG + _aesarg_in + 16*3], xmm4 + vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0] + vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1] + vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2] + vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3] + vmovdqa [ARG + _aesarg_out + 16*0], xmm5 + vmovdqa [ARG + _aesarg_out + 16*1], xmm6 + vmovdqa [ARG + _aesarg_out + 16*2], xmm7 + vmovdqa [ARG + _aesarg_out + 16*3], xmm8 + +;; XMMs are saved at a higher level + mov rbp, [GPR_SAVE_AREA + 8*0] + + add rsp, STACK_size + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm new file mode 100644 index 00000000..ab52bcd5 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_cbc_enc_256_x8.asm @@ -0,0 +1,536 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; routine to do a 256 bit CBC AES encrypt + +;; clobbers all registers except for ARG1 and rbp + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +%macro VPXOR2 2 + vpxor %1, %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS_x8 { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_256_x8(AES_ARGS_x8 *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS_x8 structure +;; arg 2: LEN : len (in units of bytes) + +struc STACK +_gpr_save: resq 1 +_len: resq 1 +endstruc + +%define GPR_SAVE_AREA rsp + _gpr_save +%define LEN_AREA rsp + _len + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax +%define TMP rbx + +%define KEYS0 REG3 +%define KEYS1 REG4 +%define KEYS2 rbp +%define KEYS3 r8 +%define KEYS4 r9 +%define KEYS5 r10 +%define KEYS6 r11 +%define KEYS7 r12 + +%define IN0 r13 +%define IN2 r14 +%define IN4 r15 +%define IN6 LEN + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XDATA4 xmm4 +%define XDATA5 xmm5 +%define XDATA6 xmm6 +%define XDATA7 xmm7 + +%define XKEY0_3 xmm8 +%define XKEY1_4 xmm9 +%define XKEY2_5 xmm10 +%define XKEY3_6 xmm11 +%define XKEY4_7 xmm12 +%define XKEY5_8 xmm13 +%define XKEY6_9 xmm14 +%define XTMP xmm15 + +section .text +MKGLOBAL(aes_cbc_enc_256_x8,function,internal) +aes_cbc_enc_256_x8: + + sub rsp, STACK_size + mov [GPR_SAVE_AREA + 8*0], rbp + + mov IDX, 16 + mov [LEN_AREA], LEN + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN4, [ARG + _aesarg_in + 8*4] + mov IN6, [ARG + _aesarg_in + 8*6] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov TMP, [ARG + _aesarg_in + 8*1] + VMOVDQ XDATA0, [IN0] ; load first block of plain text + VMOVDQ XDATA1, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VMOVDQ XDATA2, [IN2] ; load first block of plain text + VMOVDQ XDATA3, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VMOVDQ XDATA4, [IN4] ; load 
first block of plain text + VMOVDQ XDATA5, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesarg_in + 8*7] + VMOVDQ XDATA6, [IN6] ; load first block of plain text + VMOVDQ XDATA7, [TMP] ; load first block of plain text + + + VPXOR2 XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + VPXOR2 XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + VPXOR2 XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + VPXOR2 XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + VPXOR2 XDATA4, [ARG + _aesarg_IV + 16*4] ; plaintext XOR IV + VPXOR2 XDATA5, [ARG + _aesarg_IV + 16*5] ; plaintext XOR IV + VPXOR2 XDATA6, [ARG + _aesarg_IV + 16*6] ; plaintext XOR IV + VPXOR2 XDATA7, [ARG + _aesarg_IV + 16*7] ; plaintext XOR IV + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + mov KEYS4, [ARG + _aesarg_keys + 8*4] + mov KEYS5, [ARG + _aesarg_keys + 8*5] + mov KEYS6, [ARG + _aesarg_keys + 8*6] + mov KEYS7, [ARG + _aesarg_keys + 8*7] + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. 
ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC + + vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC + vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC + vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC + vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC + + + vaesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC + vaesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC + vaesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC + vaesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC + vaesenc XDATA4, [KEYS4 + 16*12] ; 12. ENC + vaesenc XDATA5, [KEYS5 + 16*12] ; 12. ENC + vaesenc XDATA6, [KEYS6 + 16*12] ; 12. ENC + vaesenc XDATA7, [KEYS7 + 16*12] ; 12. ENC + + vaesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC + vaesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC + vaesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC + vaesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC + vaesenc XDATA4, [KEYS4 + 16*13] ; 13. ENC + vaesenc XDATA5, [KEYS5 + 16*13] ; 13. ENC + vaesenc XDATA6, [KEYS6 + 16*13] ; 13. ENC + vaesenc XDATA7, [KEYS7 + 16*13] ; 13. ENC + + vaesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC + vaesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC + vaesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC + vaesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC + vaesenclast XDATA4, [KEYS4 + 16*14] ; 14. ENC + vaesenclast XDATA5, [KEYS5 + 16*14] ; 14. ENC + vaesenclast XDATA6, [KEYS6 + 16*14] ; 14. ENC + vaesenclast XDATA7, [KEYS7 + 16*14] ; 14. 
ENC + + VMOVDQ [TMP], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP], XDATA7 ; write back ciphertext + + cmp [LEN_AREA], IDX + je done + +main_loop: + mov TMP, [ARG + _aesarg_in + 8*1] + VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text + VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*3] + VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text + VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*5] + VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text + VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesarg_in + 8*7] + VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text + VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text + + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. 
ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + mov TMP, [ARG + _aesarg_out + 8*0] + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenc XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenc XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenc XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenc XDATA7, [KEYS7 + 16*10] ; 10. ENC + + vaesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + vaesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + vaesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + vaesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + vaesenc XDATA4, [KEYS4 + 16*11] ; 11. ENC + vaesenc XDATA5, [KEYS5 + 16*11] ; 11. ENC + vaesenc XDATA6, [KEYS6 + 16*11] ; 11. ENC + vaesenc XDATA7, [KEYS7 + 16*11] ; 11. ENC + + vaesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC + vaesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC + vaesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC + vaesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC + vaesenc XDATA4, [KEYS4 + 16*12] ; 12. ENC + vaesenc XDATA5, [KEYS5 + 16*12] ; 12. ENC + vaesenc XDATA6, [KEYS6 + 16*12] ; 12. ENC + vaesenc XDATA7, [KEYS7 + 16*12] ; 12. ENC + + vaesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC + vaesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC + vaesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC + vaesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC + vaesenc XDATA4, [KEYS4 + 16*13] ; 13. ENC + vaesenc XDATA5, [KEYS5 + 16*13] ; 13. ENC + vaesenc XDATA6, [KEYS6 + 16*13] ; 13. ENC + vaesenc XDATA7, [KEYS7 + 16*13] ; 13. ENC + + vaesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC + vaesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC + vaesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC + vaesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC + vaesenclast XDATA4, [KEYS4 + 16*14] ; 14. ENC + vaesenclast XDATA5, [KEYS5 + 16*14] ; 14. ENC + vaesenclast XDATA6, [KEYS6 + 16*14] ; 14. ENC + vaesenclast XDATA7, [KEYS7 + 16*14] ; 14. 
ENC + + + VMOVDQ [TMP + IDX], XDATA0 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*1] + VMOVDQ [TMP + IDX], XDATA1 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*2] + VMOVDQ [TMP + IDX], XDATA2 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*3] + VMOVDQ [TMP + IDX], XDATA3 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*4] + VMOVDQ [TMP + IDX], XDATA4 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*5] + VMOVDQ [TMP + IDX], XDATA5 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*6] + VMOVDQ [TMP + IDX], XDATA6 ; write back ciphertext + mov TMP, [ARG + _aesarg_out + 8*7] + VMOVDQ [TMP + IDX], XDATA7 ; write back ciphertext + + add IDX, 16 + cmp [LEN_AREA], IDX + jne main_loop + +done: + ;; update IV + vmovdqa [ARG + _aesarg_IV + 16*0], XDATA0 + vmovdqa [ARG + _aesarg_IV + 16*1], XDATA1 + vmovdqa [ARG + _aesarg_IV + 16*2], XDATA2 + vmovdqa [ARG + _aesarg_IV + 16*3], XDATA3 + vmovdqa [ARG + _aesarg_IV + 16*4], XDATA4 + vmovdqa [ARG + _aesarg_IV + 16*5], XDATA5 + vmovdqa [ARG + _aesarg_IV + 16*6], XDATA6 + vmovdqa [ARG + _aesarg_IV + 16*7], XDATA7 + + ;; update IN and OUT + vmovd xmm0, [LEN_AREA] + vpshufd xmm0, xmm0, 0x44 + vpaddq xmm1, xmm0, [ARG + _aesarg_in + 16*0] + vpaddq xmm2, xmm0, [ARG + _aesarg_in + 16*1] + vpaddq xmm3, xmm0, [ARG + _aesarg_in + 16*2] + vpaddq xmm4, xmm0, [ARG + _aesarg_in + 16*3] + vmovdqa [ARG + _aesarg_in + 16*0], xmm1 + vmovdqa [ARG + _aesarg_in + 16*1], xmm2 + vmovdqa [ARG + _aesarg_in + 16*2], xmm3 + vmovdqa [ARG + _aesarg_in + 16*3], xmm4 + vpaddq xmm5, xmm0, [ARG + _aesarg_out + 16*0] + vpaddq xmm6, xmm0, [ARG + _aesarg_out + 16*1] + vpaddq xmm7, xmm0, [ARG + _aesarg_out + 16*2] + vpaddq xmm8, xmm0, [ARG + _aesarg_out + 16*3] + vmovdqa [ARG + _aesarg_out + 16*0], xmm5 + vmovdqa [ARG + _aesarg_out + 16*1], xmm6 + vmovdqa [ARG + _aesarg_out + 16*2], xmm7 + vmovdqa [ARG + _aesarg_out + 16*3], xmm8 + +;; XMMs are saved at a higher level + mov rbp, [GPR_SAVE_AREA + 8*0] + + add rsp, STACK_size + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm new file mode 100644 index 00000000..c3d7f4a1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_cfb_128_avx.asm @@ -0,0 +1,161 @@ +;; +;; Copyright (c) 2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "memcpy.asm" + +;;; Routine to do 128 bit CFB AES encrypt/decrypt operations on one block only. +;;; It processes only one buffer at a time. +;;; It is designed to manage partial blocks of DOCSIS 3.1 SEC BPI + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RAX R9 R10 R11 +;; Windows preserves: RBX RCX RDX RBP RSI RDI R8 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RAX R9 R10 +;; Linux preserves: RBX RCX RDX RBP RSI RDI R8 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 +;; + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%define arg5 r8 +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%define arg5 [rsp + 5*8] +%endif + +%define OUT arg1 +%define IN arg2 +%define IV arg3 +%define KEYS arg4 +%ifdef LINUX +%define LEN arg5 +%else +%define LEN2 arg5 +%define LEN r11 +%endif + +%define TMP0 rax +%define TMP1 r10 +%define PTR0 rsp + _buffer + +%define XDATA xmm0 + +section .text + +struc STACK +_buffer: resq 2 +_rsp_save: resq 1 +endstruc + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cfb_128_one(void *out, void *in, void *iv, void *keys) +;; arg 1: OUT : addr to put clear/cipher text out +;; arg 2: IN : addr to take cipher/clear text from +;; arg 3: IV : initialization vector +;; arg 4: KEYS: pointer to expanded keys structure (16 byte aligned) +;; arg 5: LEN: length of the text to encrypt/decrypt (valid range is 0 to 16) +;; +;; AES CFB128 one block encrypt/decrypt implementation. +;; The function doesn't update IV. The result of operation can be found in OUT. +;; +;; It is primarly designed to process partial block of +;; DOCSIS 3.1 AES Packet PDU Encryption (I.10) +;; +;; It process up to one block only (up to 16 bytes). +;; +;; It makes sure not to read more than LEN bytes from IN and +;; not to store more than LEN bytes to OUT. +MKGLOBAL(aes_cfb_128_one_avx,function,) +align 32 +aes_cfb_128_one_avx: +%ifndef LINUX + mov LEN, LEN2 +%endif + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax + + test LEN, 16 + jz copy_in_lt16 + vmovdqu XDATA, [IN] + vmovdqa [PTR0], XDATA + jmp copy_in_end +copy_in_lt16: + memcpy_avx_16 PTR0, IN, LEN, TMP0, TMP1 +copy_in_end: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu XDATA, [IV] ; IV (or next to last block) + vpxor XDATA, XDATA, [KEYS + 16*0] ; 0. ARK + vaesenc XDATA, XDATA, [KEYS + 16*1] ; 1. ENC + vaesenc XDATA, XDATA, [KEYS + 16*2] ; 2. ENC + vaesenc XDATA, XDATA, [KEYS + 16*3] ; 3. ENC + vaesenc XDATA, XDATA, [KEYS + 16*4] ; 4. 
ENC + vaesenc XDATA, XDATA, [KEYS + 16*5] ; 5. ENC + vaesenc XDATA, XDATA, [KEYS + 16*6] ; 6. ENC + vaesenc XDATA, XDATA, [KEYS + 16*7] ; 7. ENC + vaesenc XDATA, XDATA, [KEYS + 16*8] ; 8. ENC + vaesenc XDATA, XDATA, [KEYS + 16*9] ; 9. ENC + vaesenclast XDATA, XDATA, [KEYS + 16*10] ; 10. ENC + + vpxor XDATA, XDATA, [PTR0] ; plaintext/ciphertext XOR block cipher encryption + + test LEN, 16 + jz copy_out_lt16 + vmovdqu [OUT], XDATA + jmp copy_out_end +copy_out_lt16: + vmovdqa [PTR0], XDATA + memcpy_avx_16 OUT, PTR0, LEN, TMP0, TMP1 +copy_out_end: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm b/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm new file mode 100644 index 00000000..78422c71 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes_xcbc_mac_128_x8.asm @@ -0,0 +1,418 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
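The aes_cfb_128_one_avx routine that ends above boils down to a single block-cipher call: the IV is run through AES-128, the result is XORed into at most 16 bytes of input, and the IV itself is never written back. A minimal C model is sketched below, again using OpenSSL's AES_encrypt as a stand-in for the AES-NI rounds and honouring the routine's guarantee to read and write no more than len bytes; the function name and signature here are illustrative, not the library API.

#include <stddef.h>
#include <stdint.h>
#include <openssl/aes.h>

/* Functional model of aes_cfb_128_one_avx(): one CFB-128 block,
 * len in the range 0..16; the IV is left unmodified. */
static void cfb_128_one_model(uint8_t *out, const uint8_t *in,
                              const uint8_t iv[16], const AES_KEY *key,
                              size_t len)
{
    uint8_t ks[16];

    AES_encrypt(iv, ks, key);            /* keystream block = E_K(IV)      */
    for (size_t i = 0; i < len && i < 16; i++)
        out[i] = in[i] ^ ks[i];          /* same XOR encrypts and decrypts */
}

Because only the forward cipher is used, the same call serves both directions of the DOCSIS 3.1 partial-block case the routine documents; the caller decides whether in holds plaintext or ciphertext and supplies the matching IV.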
+;; + +;;; routine to do 128 bit AES XCBC + +;; clobbers all registers except for ARG1 and rbp + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +%macro VPXOR2 2 + vpxor %1, %1, %2 +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_XCBC_ARGS_x8 { +;; void* in[8]; +;; UINT128* keys[8]; +;; UINT128 ICV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_xcbc_mac_128_x8(AES_XCBC_ARGS_x8 *args, UINT64 len); +;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure +;; arg 2: LEN : len (in units of bytes) + +struc STACK +_gpr_save: resq 1 +_len: resq 1 +endstruc + +%define GPR_SAVE_AREA rsp + _gpr_save +%define LEN_AREA rsp + _len + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax +%define TMP rbx + +%define KEYS0 REG3 +%define KEYS1 REG4 +%define KEYS2 rbp +%define KEYS3 r8 +%define KEYS4 r9 +%define KEYS5 r10 +%define KEYS6 r11 +%define KEYS7 r12 + +%define IN0 r13 +%define IN2 r14 +%define IN4 r15 +%define IN6 LEN + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XDATA4 xmm4 +%define XDATA5 xmm5 +%define XDATA6 xmm6 +%define XDATA7 xmm7 + +%define XKEY0_3 xmm8 +%define XKEY1_4 xmm9 +%define XKEY2_5 xmm10 +%define XKEY3_6 xmm11 +%define XKEY4_7 xmm12 +%define XKEY5_8 xmm13 +%define XKEY6_9 xmm14 +%define XTMP xmm15 + +section .text +MKGLOBAL(aes_xcbc_mac_128_x8,function,internal) +aes_xcbc_mac_128_x8: + + sub rsp, STACK_size + mov [GPR_SAVE_AREA + 8*0], rbp + + mov IDX, 16 + mov [LEN_AREA], LEN + + mov IN0, [ARG + _aesxcbcarg_in + 8*0] + mov IN2, [ARG + _aesxcbcarg_in + 8*2] + mov IN4, [ARG + _aesxcbcarg_in + 8*4] + mov IN6, [ARG + _aesxcbcarg_in + 8*6] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov TMP, [ARG + _aesxcbcarg_in + 8*1] + VMOVDQ XDATA0, [IN0] ; load first block of plain text + VMOVDQ XDATA1, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*3] + VMOVDQ XDATA2, [IN2] ; load first block of plain text + VMOVDQ XDATA3, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*5] + VMOVDQ XDATA4, [IN4] ; load first block of plain text + VMOVDQ XDATA5, [TMP] ; load first block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*7] + VMOVDQ XDATA6, [IN6] ; load first block of plain text + VMOVDQ XDATA7, [TMP] ; load first block of plain text + + + VPXOR2 XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV + VPXOR2 XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV + VPXOR2 XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV + VPXOR2 XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV + VPXOR2 XDATA4, [ARG + _aesxcbcarg_ICV + 16*4] ; plaintext XOR ICV + VPXOR2 XDATA5, [ARG + _aesxcbcarg_ICV + 16*5] ; plaintext XOR ICV + VPXOR2 XDATA6, [ARG + _aesxcbcarg_ICV + 16*6] ; plaintext XOR ICV + VPXOR2 XDATA7, [ARG + _aesxcbcarg_ICV + 16*7] ; plaintext XOR ICV + + mov KEYS0, [ARG + _aesxcbcarg_keys + 8*0] + mov KEYS1, [ARG + _aesxcbcarg_keys + 8*1] + mov KEYS2, [ARG + _aesxcbcarg_keys + 8*2] + mov KEYS3, [ARG + _aesxcbcarg_keys + 8*3] + mov KEYS4, [ARG + _aesxcbcarg_keys + 8*4] + mov KEYS5, [ARG + _aesxcbcarg_keys + 8*5] + mov KEYS6, [ARG + _aesxcbcarg_keys + 8*6] + mov KEYS7, [ARG + _aesxcbcarg_keys + 8*7] + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. 
ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vmovdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vmovdqa XKEY1_4, [KEYS1 + 16*4] ; load round 4 key + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vmovdqa XKEY2_5, [KEYS2 + 16*5] ; load round 5 key + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vmovdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vmovdqa XKEY4_7, [KEYS4 + 16*7] ; load round 7 key + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vmovdqa XKEY5_8, [KEYS5 + 16*8] ; load round 8 key + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vmovdqa XKEY6_9, [KEYS6 + 16*9] ; load round 9 key + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. 
ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC + + cmp [LEN_AREA], IDX + je done + +main_loop: + mov TMP, [ARG + _aesxcbcarg_in + 8*1] + VPXOR2 XDATA0, [IN0 + IDX] ; load next block of plain text + VPXOR2 XDATA1, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*3] + VPXOR2 XDATA2, [IN2 + IDX] ; load next block of plain text + VPXOR2 XDATA3, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*5] + VPXOR2 XDATA4, [IN4 + IDX] ; load next block of plain text + VPXOR2 XDATA5, [TMP + IDX] ; load next block of plain text + mov TMP, [ARG + _aesxcbcarg_in + 8*7] + VPXOR2 XDATA6, [IN6 + IDX] ; load next block of plain text + VPXOR2 XDATA7, [TMP + IDX] ; load next block of plain text + + + VPXOR2 XDATA0, [KEYS0 + 16*0] ; 0. ARK + VPXOR2 XDATA1, [KEYS1 + 16*0] ; 0. ARK + VPXOR2 XDATA2, [KEYS2 + 16*0] ; 0. ARK + VPXOR2 XDATA3, [KEYS3 + 16*0] ; 0. ARK + VPXOR2 XDATA4, [KEYS4 + 16*0] ; 0. ARK + VPXOR2 XDATA5, [KEYS5 + 16*0] ; 0. ARK + VPXOR2 XDATA6, [KEYS6 + 16*0] ; 0. ARK + VPXOR2 XDATA7, [KEYS7 + 16*0] ; 0. ARK + + vaesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + vaesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + vaesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + vaesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + vaesenc XDATA4, [KEYS4 + 16*1] ; 1. ENC + vaesenc XDATA5, [KEYS5 + 16*1] ; 1. ENC + vaesenc XDATA6, [KEYS6 + 16*1] ; 1. ENC + vaesenc XDATA7, [KEYS7 + 16*1] ; 1. ENC + + vaesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + vaesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + vaesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + vaesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + vaesenc XDATA4, [KEYS4 + 16*2] ; 2. ENC + vaesenc XDATA5, [KEYS5 + 16*2] ; 2. ENC + vaesenc XDATA6, [KEYS6 + 16*2] ; 2. ENC + vaesenc XDATA7, [KEYS7 + 16*2] ; 2. ENC + + vaesenc XDATA0, XKEY0_3 ; 3. ENC + vaesenc XDATA1, [KEYS1 + 16*3] ; 3. ENC + vaesenc XDATA2, [KEYS2 + 16*3] ; 3. ENC + vaesenc XDATA3, [KEYS3 + 16*3] ; 3. ENC + vaesenc XDATA4, [KEYS4 + 16*3] ; 3. ENC + vaesenc XDATA5, [KEYS5 + 16*3] ; 3. ENC + vaesenc XDATA6, [KEYS6 + 16*3] ; 3. ENC + vaesenc XDATA7, [KEYS7 + 16*3] ; 3. ENC + + vaesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + vaesenc XDATA1, XKEY1_4 ; 4. ENC + vaesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + vaesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + vaesenc XDATA4, [KEYS4 + 16*4] ; 4. ENC + vaesenc XDATA5, [KEYS5 + 16*4] ; 4. ENC + vaesenc XDATA6, [KEYS6 + 16*4] ; 4. ENC + vaesenc XDATA7, [KEYS7 + 16*4] ; 4. ENC + + vaesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + vaesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + vaesenc XDATA2, XKEY2_5 ; 5. ENC + vaesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + vaesenc XDATA4, [KEYS4 + 16*5] ; 5. ENC + vaesenc XDATA5, [KEYS5 + 16*5] ; 5. ENC + vaesenc XDATA6, [KEYS6 + 16*5] ; 5. ENC + vaesenc XDATA7, [KEYS7 + 16*5] ; 5. ENC + + vaesenc XDATA0, [KEYS0 + 16*6] ; 6. ENC + vaesenc XDATA1, [KEYS1 + 16*6] ; 6. ENC + vaesenc XDATA2, [KEYS2 + 16*6] ; 6. ENC + vaesenc XDATA3, XKEY3_6 ; 6. ENC + vaesenc XDATA4, [KEYS4 + 16*6] ; 6. ENC + vaesenc XDATA5, [KEYS5 + 16*6] ; 6. ENC + vaesenc XDATA6, [KEYS6 + 16*6] ; 6. ENC + vaesenc XDATA7, [KEYS7 + 16*6] ; 6. 
ENC + + vaesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + vaesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + vaesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + vaesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + vaesenc XDATA4, XKEY4_7 ; 7. ENC + vaesenc XDATA5, [KEYS5 + 16*7] ; 7. ENC + vaesenc XDATA6, [KEYS6 + 16*7] ; 7. ENC + vaesenc XDATA7, [KEYS7 + 16*7] ; 7. ENC + + vaesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + vaesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + vaesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + vaesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + vaesenc XDATA4, [KEYS4 + 16*8] ; 8. ENC + vaesenc XDATA5, XKEY5_8 ; 8. ENC + vaesenc XDATA6, [KEYS6 + 16*8] ; 8. ENC + vaesenc XDATA7, [KEYS7 + 16*8] ; 8. ENC + + vaesenc XDATA0, [KEYS0 + 16*9] ; 9. ENC + vaesenc XDATA1, [KEYS1 + 16*9] ; 9. ENC + vaesenc XDATA2, [KEYS2 + 16*9] ; 9. ENC + vaesenc XDATA3, [KEYS3 + 16*9] ; 9. ENC + vaesenc XDATA4, [KEYS4 + 16*9] ; 9. ENC + vaesenc XDATA5, [KEYS5 + 16*9] ; 9. ENC + vaesenc XDATA6, XKEY6_9 ; 9. ENC + vaesenc XDATA7, [KEYS7 + 16*9] ; 9. ENC + + + vaesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + vaesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + vaesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + vaesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + vaesenclast XDATA4, [KEYS4 + 16*10] ; 10. ENC + vaesenclast XDATA5, [KEYS5 + 16*10] ; 10. ENC + vaesenclast XDATA6, [KEYS6 + 16*10] ; 10. ENC + vaesenclast XDATA7, [KEYS7 + 16*10] ; 10. ENC + + add IDX, 16 + cmp [LEN_AREA], IDX + jne main_loop + +done: + ;; update ICV + vmovdqa [ARG + _aesxcbcarg_ICV + 16*0], XDATA0 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*1], XDATA1 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*2], XDATA2 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*3], XDATA3 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*4], XDATA4 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*5], XDATA5 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*6], XDATA6 + vmovdqa [ARG + _aesxcbcarg_ICV + 16*7], XDATA7 + + ;; update IN + vmovd xmm0, [LEN_AREA] + vpshufd xmm0, xmm0, 0x44 + vpaddq xmm1, xmm0, [ARG + _aesxcbcarg_in + 16*0] + vpaddq xmm2, xmm0, [ARG + _aesxcbcarg_in + 16*1] + vpaddq xmm3, xmm0, [ARG + _aesxcbcarg_in + 16*2] + vpaddq xmm4, xmm0, [ARG + _aesxcbcarg_in + 16*3] + vmovdqa [ARG + _aesxcbcarg_in + 16*0], xmm1 + vmovdqa [ARG + _aesxcbcarg_in + 16*1], xmm2 + vmovdqa [ARG + _aesxcbcarg_in + 16*2], xmm3 + vmovdqa [ARG + _aesxcbcarg_in + 16*3], xmm4 + +;; XMMs are saved at a higher level + mov rbp, [GPR_SAVE_AREA + 8*0] + + add rsp, STACK_size + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm new file mode 100644 index 00000000..48ff1812 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/gcm128_avx_gen2.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. 
+; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%include "gcm_avx_gen2.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm new file mode 100644 index 00000000..65a652e0 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/gcm192_avx_gen2.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM192_MODE 1 +%include "gcm_avx_gen2.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm new file mode 100644 index 00000000..296d617e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/gcm256_avx_gen2.asm @@ -0,0 +1,30 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. 
+; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define GCM256_MODE 1 +%include "gcm_avx_gen2.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm b/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm new file mode 100644 index 00000000..8c635558 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/gcm_avx_gen2.asm @@ -0,0 +1,2127 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
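
; The three gcmNNN_avx_gen2.asm files in this patch are thin wrappers: each
; defines a single mode macro (GCM128_MODE, GCM192_MODE or GCM256_MODE) and
; then includes the shared gcm_avx_gen2.asm body, which uses that macro to
; pick the exported symbol names (FN_NAME) and the round count (NROUNDS is
; the number of vaesenc rounds before the final vaesenclast: 9, 11 or 13 for
; AES-128/192/256).  A sketch of the pattern, shown here only as comments:
;
;     %define GCM192_MODE 1            ; in effect, the whole wrapper file
;     %include "gcm_avx_gen2.asm"      ; shared body, specialised at assembly time
;
; and, inside the shared body:
;
;     %ifdef GCM192_MODE
;     %define NROUNDS 11               ; AES-192: 11 vaesenc rounds + vaesenclast
;     %endif
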
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; +; For the shift-based reductions used in this code, we used the method described in paper: +; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. +; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "os.asm" +%include "reg_sizes.asm" +%include "gcm_defines.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_avx_gen2.asm!" 
+%endif +%endif +%endif + +%ifdef GCM128_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2 +%define NROUNDS 9 +%endif + +%ifdef GCM192_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2 +%define NROUNDS 11 +%endif + +%ifdef GCM256_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2 +%define NROUNDS 13 +%endif + +default rel +; need to push 4 registers into stack to maintain +%define STACK_OFFSET 8*4 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +section .text +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. >>1 ) +; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba + vpshufd %%T2, %%GH, 01001110b + vpshufd %%T3, %%HK, 01001110b + vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0) + vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0) + + vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 + vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0 + vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + vpxor %%T2, %%T2, %%GH + vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0 + + vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs + vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs + vpxor %%GH, %%GH, %%T3 + vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK + + ;first phase of the reduction + vpslld %%T2, %%GH, 31 ; packed right shifting << 31 + vpslld %%T3, %%GH, 30 ; packed right shifting shift << 30 + vpslld %%T4, %%GH, 25 ; packed right shifting shift << 25 + + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW + + vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs + vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + + vpsrld %%T2,%%GH,1 ; packed left shifting >> 1 + vpsrld %%T3,%%GH,2 ; packed left shifting >> 2 + vpsrld %%T4,%%GH,7 ; packed left shifting >> 7 + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpxor %%T2, %%T2, %%T5 + vpxor %%GH, %%GH, %%T2 + vpxor %%GH, %%GH, %%T1 ; the result is in %%GH + + +%endmacro + + +%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + 
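
; The PRECOMPUTE body below fills the key structure with HashKey^2..HashKey^8
; and, for every power, a companion "HashKey_i_k" value: the XOR of the high
; and low 64-bit halves of that power.  GHASH_MUL above uses Karatsuba, so the
; middle product needs (a1 XOR a0) for both operands; precomputing it for the
; hash-key side avoids redoing that shuffle and XOR on every multiplication.
; A minimal sketch of the precomputation for one power (macro and operand
; names here are illustrative only):
%macro MAKE_KARATSUBA_K_SKETCH 2	; %1 = output xmm, %2 = xmm holding H^i
	vpshufd	%1, %2, 01001110b	; swap the two 64-bit halves of H^i
	vpxor	%1, %1, %2		; %1 = high64(H^i) XOR low64(H^i)
%endmacro
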
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa %%T5, %%HK + + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly + vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_2_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly + vmovdqu [%%GDATA + HashKey_3], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_3_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly + vmovdqu [%%GDATA + HashKey_4], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_4_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly + vmovdqu [%%GDATA + HashKey_5], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_5_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly + vmovdqu [%%GDATA + HashKey_6], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_6_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly + vmovdqu [%%GDATA + HashKey_7], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_7_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly + vmovdqu [%%GDATA + HashKey_8], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_8_k], %%T1 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. +; Returns 0 if data has length 0. +; Input: The input data (INPUT), that data's length (LENGTH). +; Output: The packed xmm register (OUTPUT). 
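
; The macro below never reads past the end of the caller's buffer: when at
; least eight bytes are present, the first eight are loaded directly with
; vpinsrq; any remaining tail (or the whole input when shorter than eight
; bytes) is gathered one byte at a time, walking backwards from
; INPUT+LENGTH.  A minimal sketch of that backward byte-gather; the register
; choices are assumptions, not the macro's actual temporaries:
%macro GATHER_TAIL_BYTES_SKETCH 0	; rsi = one past the last byte, rcx = count (1..7), rax = 0
%%gather:
	shl	rax, 8			; make room for the next (earlier) byte
	dec	rsi
	mov	al, [rsi]		; read one byte, moving backwards
	dec	rcx
	jg	%%gather		; done: first input byte ends up least significant,
					; exactly as a normal little-endian load would place it
%endmacro
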
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 6 +%define %%OUTPUT %1 ; %%OUTPUT is an xmm register +%define %%INPUT %2 +%define %%LENGTH %3 +%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers +%define %%COUNTER %5 +%define %%TMP1 %6 + + vpxor %%OUTPUT, %%OUTPUT + mov %%COUNTER, %%LENGTH + mov %%END_READ_LOCATION, %%INPUT + add %%END_READ_LOCATION, %%LENGTH + xor %%TMP1, %%TMP1 + + + cmp %%COUNTER, 8 + jl %%_byte_loop_2 + vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists + je %%_done + + sub %%COUNTER, 8 + +%%_byte_loop_1: ;Read in data 1 byte at a time while data is left + shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_1 + vpinsrq %%OUTPUT, %%TMP1, 1 + jmp %%_done + +%%_byte_loop_2: ;Read in data 1 byte at a time while data is left + cmp %%COUNTER, 0 + je %%_done + shl %%TMP1, 8 ;This loop handles when no bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_2 + vpinsrq %%OUTPUT, %%TMP1, 0 +%%_done: + +%endmacro ; READ_SMALL_DATA_INPUT + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 14 +%define %%A_IN %1 +%define %%A_LEN %2 +%define %%AAD_HASH %3 +%define %%HASH_KEY %4 +%define %%XTMP1 %5 ; xmm temp reg 5 +%define %%XTMP2 %6 +%define %%XTMP3 %7 +%define %%XTMP4 %8 +%define %%XTMP5 %9 ; xmm temp reg 5 +%define %%T1 %10 ; temp reg 1 +%define %%T2 %11 +%define %%T3 %12 +%define %%T4 %13 +%define %%T5 %14 ; temp reg 5 + + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + vpxor %%AAD_HASH, %%AAD_HASH + + cmp %%T2, 16 + jl %%_get_small_AAD_block + +%%_get_AAD_loop16: + + vmovdqu %%XTMP1, [%%T1] + ;byte-reflect the AAD data + vpshufb %%XTMP1, [SHUF_MASK] + vpxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + + sub %%T2, 16 + je %%_CALC_AAD_done + + add %%T1, 16 + cmp %%T2, 16 + jge %%_get_AAD_loop16 + +%%_get_small_AAD_block: + READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 + ;byte-reflect the AAD data + vpshufb %%XTMP1, [SHUF_MASK] + vpxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + +%%_CALC_AAD_done: + +%endmacro ; CALC_AAD_HASH + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. +; Requires the input data be at least 1 byte long. 
+; Input: +; GDATA_KEY - struct gcm_key_data * +; GDATA_CTX - struct gcm_context_data * +; PLAIN_CYPH_IN - input text +; PLAIN_CYPH_LEN - input text length +; DATA_OFFSET - the current data offset +; ENC_DEC - whether encoding or decoding +; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 8 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%DATA_OFFSET %6 +%define %%AAD_HASH %7 +%define %%ENC_DEC %8 + mov r13, [%%GDATA_CTX + PBlockLen] + cmp r13, 0 + je %%_partial_block_done ;Leave Macro if no partial blocks + + cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading + jl %%_fewer_than_16_bytes + VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register + jmp %%_data_read + +%%_fewer_than_16_bytes: + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 + +%%_data_read: ;Finished reading in data + + + vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + + lea r12, [SHIFT_MASK] + + cmp r13, rax + add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + vmovdqu xmm2, [r12] ; get the appropriate shuffle mask + vpshufb xmm9, xmm2 ;shift right r13 bytes + +%ifidn %%ENC_DEC, DEC + vmovdqa xmm3, xmm1 + vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_1: + + vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + vpand xmm3, xmm1 + vpshufb xmm3, [SHUF_MASK] + vpshufb xmm3, xmm2 + vpxor %%AAD_HASH, xmm3 + + + cmp r15,0 + jl %%_partial_incomplete_1 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_dec_done +%%_partial_incomplete_1: +%ifidn __OUTPUT_FORMAT__, win64 + mov rax, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], rax +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif +%%_dec_done: + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + +%else + vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_2: + + vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + vpshufb xmm9, [SHUF_MASK] + vpshufb xmm9, xmm2 + vpxor %%AAD_HASH, xmm9 + + cmp r15,0 + jl %%_partial_incomplete_2 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_encode_done 
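
	; At this point the encrypt path mirrors the decrypt path above: if the
	; stored partial block was filled (r15 >= 0), the accumulated hash has just
	; been folded in with GHASH_MUL and PBlockLen reset to zero; the label
	; below covers the case where the block is still short, so only the running
	; length is bumped.  Worked example of that case: with PBlockLen = 12 and
	; PLAIN_CYPH_LEN = 3, r15 = 12 + 3 - 16 = -1, no GHASH is performed and
	; PBlockLen simply becomes 15, waiting for a later update call.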
+%%_partial_incomplete_2: +%ifidn __OUTPUT_FORMAT__, win64 + mov rax, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], rax +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif +%%_encode_done: + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + + vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + vpshufb xmm9, xmm2 +%endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output encrypted Bytes + cmp r15,0 + jl %%_partial_fill + mov r12, r13 + mov r13, 16 + sub r13, r12 ; Set r13 to be the number of bytes to write out + jmp %%_count_set +%%_partial_fill: + mov r13, %%PLAIN_CYPH_LEN +%%_count_set: + vmovq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + vpsrldq xmm9, xmm9, 8 + vmovq rax, xmm9 + sub r13, 8 +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +; if a = number of total plaintext bytes +; b = floor(a/16) +; %%num_initial_blocks = b mod 8; +; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext +; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified. +; Updated AAD_HASH is returned in %%T3 + +%macro INITIAL_BLOCKS 24 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %8 +%define %%HASH_KEY %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 + +%assign i (8-%%num_initial_blocks) + vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg + ; start AES for %%num_initial_blocks blocks + vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa reg(i), %%CTR + vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + + vmovdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpxor reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS + vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep ; NROUNDS + + +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations +%assign i (i+1) +%endrep + + +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) + +%rep %%num_initial_blocks + vpxor reg(j), reg(i) + GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks +%assign i (i+1) 
+%assign j (j+1) +%endrep + ; %%XMM8 has the current Hash Value + vmovdqa %%T3, %%XMM8 + + cmp %%LENGTH, 128 + jl %%_initial_blocks_done ; no need for precomputed constants + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM1, %%CTR + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM2, %%CTR + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM3, %%CTR + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM4, %%CTR + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM5, %%CTR + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM6, %%CTR + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM7, %%CTR + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + + vpaddd %%CTR, [ONE] ; INCR Y0 + vmovdqa %%XMM8, %%CTR + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + vmovdqu %%T_key, [%%GDATA_KEY+16*0] + vpxor %%XMM1, %%T_key + vpxor %%XMM2, %%T_key + vpxor %%XMM3, %%T_key + vpxor %%XMM4, %%T_key + vpxor %%XMM5, %%T_key + vpxor %%XMM6, %%T_key + vpxor %%XMM7, %%T_key + vpxor %%XMM8, %%T_key + + +%assign i 1 +%rep NROUNDS + vmovdqu %%T_key, [%%GDATA_KEY+16*i] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key +%assign i (i+1) +%endrep + + + vmovdqu %%T_key, [%%GDATA_KEY+16*i] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + vpxor %%XMM1, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM1, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + vpxor %%XMM2, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM2, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + vpxor %%XMM3, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM3, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + vpxor %%XMM4, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM4, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + vpxor %%XMM5, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM5, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] + vpxor %%XMM6, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM6, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + vpxor %%XMM7, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM7, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + vpxor %%XMM8, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET 
+ 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM8, %%T1 + %endif + + add %%DATA_OFFSET, 128 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; r11 is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 22 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 + + vmovdqa %%T2, %%XMM1 + vmovdqu [rsp + TMP2], %%XMM2 + vmovdqu [rsp + TMP3], %%XMM3 + vmovdqu [rsp + TMP4], %%XMM4 + vmovdqu [rsp + TMP5], %%XMM5 + vmovdqu [rsp + TMP6], %%XMM6 + vmovdqu [rsp + TMP7], %%XMM7 + vmovdqu [rsp + TMP8], %%XMM8 + +%ifidn %%loop_idx, in_order + vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT + vpaddd %%XMM2, %%XMM1, [ONE] + vpaddd %%XMM3, %%XMM2, [ONE] + vpaddd %%XMM4, %%XMM3, [ONE] + vpaddd %%XMM5, %%XMM4, [ONE] + vpaddd %%XMM6, %%XMM5, [ONE] + vpaddd %%XMM7, %%XMM6, [ONE] + vpaddd %%XMM8, %%XMM7, [ONE] + vmovdqa %%CTR, %%XMM8 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap +%else + vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT + vpaddd %%XMM2, %%XMM1, [ONEf] + vpaddd %%XMM3, %%XMM2, [ONEf] + vpaddd %%XMM4, %%XMM3, [ONEf] + vpaddd %%XMM5, %%XMM4, [ONEf] + vpaddd %%XMM6, %%XMM5, [ONEf] + vpaddd %%XMM7, %%XMM6, [ONEf] + vpaddd %%XMM8, %%XMM7, [ONEf] + vmovdqa %%CTR, %%XMM8 +%endif + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*0] + vpxor %%XMM1, %%T1 + vpxor %%XMM2, %%T1 + vpxor %%XMM3, %%T1 + vpxor %%XMM4, %%T1 + vpxor %%XMM5, %%T1 + vpxor %%XMM6, %%T1 + vpxor %%XMM7, %%T1 + vpxor %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*1] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [%%GDATA + 16*2] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + 
vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 + + vpshufd %%T6, %%T2, 01001110b + vpxor %%T6, %%T2 + + vmovdqu %%T5, [%%GDATA + HashKey_8_k] + vpclmulqdq %%T6, %%T6, %%T5, 0x00 ; + + + vmovdqu %%T1, [%%GDATA + 16*3] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP2] + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_7_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*4] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu %%T1, [rsp + TMP3] + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_6_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*5] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [rsp + TMP4] + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_5_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*6] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP5] + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_4_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + + vmovdqu %%T1, [%%GDATA + 16*7] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP6] + vmovdqu %%T5, [%%GDATA + HashKey_3] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_3_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*8] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc 
%%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP7] + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_2_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + 16*9] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T1, [rsp + TMP8] + vmovdqu %%T5, [%%GDATA + HashKey] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpshufd %%T3, %%T1, 01001110b + vpxor %%T3, %%T1 + vmovdqu %%T5, [%%GDATA + HashKey_k] + vpclmulqdq %%T3, %%T3, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vpxor %%T6, %%T4 + vpxor %%T6, %%T7 + +%ifdef GCM128_MODE + vmovdqu %%T5, [%%GDATA + 16*10] +%endif +%ifdef GCM192_MODE + vmovdqu %%T5, [%%GDATA + 16*10] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] +%endif +%ifdef GCM256_MODE + vmovdqu %%T5, [%%GDATA + 16*10] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*13] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*14] +%endif + +%assign i 0 +%assign j 1 +%rep 8 + +%ifidn %%ENC_DEC, ENC +%ifdef NT_LD + VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] + vpxor %%T2, %%T2, %%T5 +%else + vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] +%endif ; NT_LD + vaesenclast reg(j), reg(j), %%T2 +%else + VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i] + vpxor %%T2, %%T2, %%T5 + vaesenclast %%T3, reg(j), %%T2 + vpxor reg(j), %%T2, %%T5 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3 +%endif ; %%ENC_DEC + +%assign i (i+1) +%assign j (j+1) +%endrep + + vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs + vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs + vpxor %%T7, %%T3 + vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7 + + + ;first phase of the reduction + + vpslld %%T2, %%T7, 31 ; packed right shifting << 31 + vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30 + vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25 + + vpxor %%T2, %%T2, %%T3 ; xor 
the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW + + vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs + vpxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + %ifidn %%ENC_DEC, ENC + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer + %endif + + ;second phase of the reduction + + vpsrld %%T2,%%T7,1 ; packed left shifting >> 1 + vpsrld %%T3,%%T7,2 ; packed left shifting >> 2 + vpsrld %%T4,%%T7,7 ; packed left shifting >> 7 + vpxor %%T2, %%T2,%%T3 ; xor the shifted versions + vpxor %%T2, %%T2,%%T4 + + vpxor %%T2, %%T2, %%T1 + vpxor %%T7, %%T7, %%T2 + vpxor %%T6, %%T6, %%T7 ; the result is in %%T6 + + + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] + vpshufb %%XMM3, [SHUF_MASK] + vpshufb %%XMM4, [SHUF_MASK] + vpshufb %%XMM5, [SHUF_MASK] + vpshufb %%XMM6, [SHUF_MASK] + vpshufb %%XMM7, [SHUF_MASK] + vpshufb %%XMM8, [SHUF_MASK] + + + vpxor %%XMM1, %%T6 + +%endmacro + + +; GHASH the last 4 ciphertext blocks. +; %%GDATA is GCM key data +%macro GHASH_LAST_8 16 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 +%define %%XMM8 %16 + ;; Karatsuba Method + + + vpshufd %%T2, %%XMM1, 01001110b + vpxor %%T2, %%XMM1 + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 + vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 + + vmovdqu %%T3, [%%GDATA + HashKey_8_k] + vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 + + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM2, 01001110b + vpxor %%T2, %%XMM2 + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_7_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM3, 01001110b + vpxor %%T2, %%XMM3 + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_6_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + + vpshufd %%T2, %%XMM4, 01001110b + vpxor %%T2, %%XMM4 + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_5_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM5, 01001110b + vpxor %%T2, %%XMM5 + vmovdqu %%T5, [%%GDATA + 
HashKey_4] + vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_4_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM6, 01001110b + vpxor %%T2, %%XMM6 + vmovdqu %%T5, [%%GDATA + HashKey_3] + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_3_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM7, 01001110b + vpxor %%T2, %%XMM7 + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_2_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vpshufd %%T2, %%XMM8, 01001110b + vpxor %%T2, %%XMM8 + vmovdqu %%T5, [%%GDATA + HashKey] + vpclmulqdq %%T4, %%XMM8, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM8, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vmovdqu %%T3, [%%GDATA + HashKey_k] + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + vpxor %%XMM1, %%XMM1, %%T6 + vpxor %%T2, %%XMM1, %%T7 + + + + + vpslldq %%T4, %%T2, 8 + vpsrldq %%T2, %%T2, 8 + + vpxor %%T7, %%T4 + vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + ;first phase of the reduction + + vpslld %%T2, %%T7, 31 ; packed right shifting << 31 + vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30 + vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25 + + vpxor %%T2, %%T2, %%T3 ; xor the shifted versions + vpxor %%T2, %%T2, %%T4 + + vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW + + vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs + vpxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + + vpsrld %%T2,%%T7,1 ; packed left shifting >> 1 + vpsrld %%T3,%%T7,2 ; packed left shifting >> 2 + vpsrld %%T4,%%T7,7 ; packed left shifting >> 7 + vpxor %%T2, %%T2,%%T3 ; xor the shifted versions + vpxor %%T2, %%T2,%%T4 + + vpxor %%T2, %%T2, %%T1 + vpxor %%T7, %%T7, %%T2 + vpxor %%T6, %%T6, %%T7 ; the result is in %%T6 + + +%endmacro + + +; Encryption of a single block +; %%GDATA is GCM key data +%macro ENCRYPT_SINGLE_BLOCK 2 +%define %%GDATA %1 +%define %%XMM0 %2 + + vpxor %%XMM0, [%%GDATA+16*0] +%assign i 1 +%rep NROUNDS + vaesenc %%XMM0, [%%GDATA+16*i] +%assign i (i+1) +%endrep ; NROUNDS + vaesenclast %%XMM0, [%%GDATA+16*i] +%endmacro + + +;; Start of Stack Setup + +%macro FUNC_SAVE 0 + ;; Required for Update/GMC_ENC + ;the number of pushes must equal STACK_OFFSET + push r12 + push r13 + push r14 + push r15 + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 + vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 + vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 + vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 + vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 + vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 + vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 + vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 + vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 + vmovdqu [rsp + 
LOCAL_STORAGE + 9*16],xmm15 +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16] + vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16] + vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16] + vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16] + vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16] + vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16] + vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16] + vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16] + vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16] + vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16] +%endif + +;; Required for Update/GMC_ENC + mov rsp, r14 + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. +; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX), +; IV, Additional Authentication data (A_IN), Additional +; Data length (A_LEN) +; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA. +; Clobbers rax, r10-r13, and xmm0-xmm6 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_INIT 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%IV %3 +%define %%A_IN %4 +%define %%A_LEN %5 +%define %%AAD_HASH xmm0 +%define %%SUBHASH xmm1 + + + vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey] + + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax + vpxor xmm2, xmm3 + mov r10, %%A_LEN + + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash + mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length + xor r10, r10 + mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 + mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 + mov r10, %%IV + vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 + vpinsrq xmm2, [r10], 0 + vpinsrd xmm2, [r10+8], 2 + vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv + + vpshufb xmm2, [SHUF_MASK] + + vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct +; has been initialized by GCM_INIT +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX), +; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN), +; and whether encoding or decoding (ENC_DEC) +; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and xmm0-xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%ENC_DEC %6 +%define %%DATA_OFFSET r11 + +; Macro flow: +; calculate the number of 16byte blocks in the message +; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. 
%%_eight_cipher_left' +; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes' + cmp %%PLAIN_CYPH_LEN, 0 + je %%_multiple_of_16_bytes + + xor %%DATA_OFFSET, %%DATA_OFFSET +%ifidn __OUTPUT_FORMAT__, win64 + mov rax, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + InLen], rax ; Update length of data processed +%else + add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ; Update length of data processed +%endif + vmovdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey + vmovdqu xmm8, [%%GDATA_CTX + AadHash] + + + PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC + + + mov r13, %%PLAIN_CYPH_LEN + sub r13, %%DATA_OFFSET + mov r10, r13 ; save the amount of data left to process in r10 + and r13, -16 ; r13 = r13 - (r13 mod 16) + + mov r12, r13 + shr r12, 4 + and r12, 7 + + jz %%_initial_num_blocks_is_0 + + cmp r12, 7 + je %%_initial_num_blocks_is_7 + cmp r12, 6 + je %%_initial_num_blocks_is_6 + cmp r12, 5 + je %%_initial_num_blocks_is_5 + cmp r12, 4 + je %%_initial_num_blocks_is_4 + cmp r12, 3 + je %%_initial_num_blocks_is_3 + cmp r12, 2 + je %%_initial_num_blocks_is_2 + + jmp %%_initial_num_blocks_is_1 + +%%_initial_num_blocks_is_7: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*7 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_6: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*6 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_5: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*5 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_4: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*4 + jmp %%_initial_blocks_encrypted + + +%%_initial_num_blocks_is_3: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*3 + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_2: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*2 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_1: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_0: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + + +%%_initial_blocks_encrypted: + 
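+        ;; Main by-8 loop: the whole-block byte count in r13 is consumed 128 bytes
+        ;; (eight blocks) per GHASH_8_ENCRYPT_8_PARALLEL call below.  r15b shadows the
+        ;; low byte of the counter block in xmm9; while adding 8 cannot wrap that byte
+        ;; the cheaper out_order variant is used, otherwise the in_order variant (with
+        ;; the extra byte swaps of xmm9 around the call) is taken so the carry can
+        ;; propagate through the full counter.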
cmp r13, 0 + je %%_zero_cipher_left + + sub r13, 128 + je %%_eight_cipher_left + + + + + vmovd r15d, xmm9 + and r15d, 255 + vpshufb xmm9, [SHUF_MASK] + + +%%_encrypt_by_8_new: + cmp r15d, 255-8 + jg %%_encrypt_by_8 + + + + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + vpshufb xmm9, [SHUF_MASK] + jmp %%_eight_cipher_left + +%%_encrypt_by_8: + vpshufb xmm9, [SHUF_MASK] + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC + vpshufb xmm9, [SHUF_MASK] + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + vpshufb xmm9, [SHUF_MASK] + + + + +%%_eight_cipher_left: + GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 + + +%%_zero_cipher_left: + vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; ctx_data.aad hash = xmm14 + vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; ctx_data.current_counter = xmm9 + + mov r13, r10 + and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16) + + je %%_multiple_of_16_bytes + + mov [%%GDATA_CTX + PBlockLen], r13 ; ctx_data.partial_block_length = r13 + ; handle the last <16 Byte block separately + + vpaddd xmm9, [ONE] ; INCR CNT to get Yn + vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9 + vpshufb xmm9, [SHUF_MASK] + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Yn) + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; ctx_data.partial_block_enc_key = xmm9 + + cmp %%PLAIN_CYPH_LEN, 16 + jge %%_large_enough_update + + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax + lea r12, [SHIFT_MASK + 16] + sub r12, r13 + jmp %%_data_read + +%%_large_enough_update: + sub %%DATA_OFFSET, 16 + add %%DATA_OFFSET, r13 + + vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block + + sub %%DATA_OFFSET, r13 + add %%DATA_OFFSET, 16 + + + lea r12, [SHIFT_MASK + 16] + sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) + + vmovdqu xmm2, [r12] ; get the appropriate shuffle mask + vpshufb xmm1, xmm2 ; shift right 16-r13 bytes +%%_data_read: +%ifidn %%ENC_DEC, DEC + vmovdqa xmm2, xmm1 + vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + vpand xmm2, xmm1 + vpshufb xmm2, [SHUF_MASK] + vpxor xmm14, xmm2 + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + +%else + vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + vpshufb xmm9, [SHUF_MASK] + vpxor xmm14, xmm9 + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + + vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output r13 Bytes + vmovq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + vpsrldq xmm9, xmm9, 8 + vmovq rax, xmm9 + sub r13, 8 + +%%_less_than_8_bytes_left: + mov BYTE 
[%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_multiple_of_16_bytes: + + + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. +; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX) and +; whether encoding or decoding (ENC_DEC). +; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%PLAIN_CYPH_LEN rax + + mov r12, [%%GDATA_CTX + PBlockLen] + vmovdqu xmm14, [%%GDATA_CTX + AadHash] + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + vmovd xmm15, r12d ; len(A) in xmm15 + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) + vmovq xmm1, %%PLAIN_CYPH_LEN + vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C) + + vpxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation + vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap + + vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) + + vpxor xmm9, xmm14 + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + +%%_T_8: + vmovq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + vmovq rax, xmm9 + mov [r10], rax + vpsrldq xmm9, xmm9, 8 + vmovd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done + +%%_T_16: + vmovdqu [r10], xmm9 + +%%_return_T_done: +%endmacro ; GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_avx_gen2 +; (struct gcm_key_data *key_data); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(precomp,_),function,) +FN_NAME(precomp,_): + + push r12 + push r13 + push r14 + push r15 + + mov r14, rsp + + + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 ; align rsp to 64 bytes + +%ifidn __OUTPUT_FORMAT__, win64 + ; only xmm6 needs to be maintained + vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 +%endif + + vpxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey + + vpshufb xmm6, [SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, 1 + vpsrlq xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [TWOONE] + vpand xmm2, [POLY] + vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE 
arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] +%endif + mov rsp, r14 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(init,_),function,) +FN_NAME(init,_): + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + push r14 + push r15 + mov r14, rsp + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 1*16 + movdqu [rsp + 0*16], xmm6 +%endif + + GCM_INIT arg1, arg2, arg3, arg4, arg5 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm6 , [rsp + 0*16] + mov rsp, r14 + pop r15 + pop r14 +%endif + pop r13 + pop r12 + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_update_),function,) +FN_NAME(enc,_update_): + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_update_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_update_),function,) +FN_NAME(dec,_update_): + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_finalize_),function,) +FN_NAME(enc,_finalize_): + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + vmovdqu [rsp + 0*16],xmm6 + vmovdqu [rsp + 1*16],xmm9 + vmovdqu [rsp + 2*16],xmm11 + vmovdqu [rsp + 3*16],xmm14 + vmovdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15 , [rsp + 4*16] + vmovdqu xmm14 , [rsp + 3*16] + vmovdqu xmm11 , [rsp + 2*16] + vmovdqu xmm9 , [rsp + 1*16] + vmovdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_finalize_),function,) +FN_NAME(dec,_finalize_): + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be 
maintained for Windows + sub rsp, 5*16 + vmovdqu [rsp + 0*16],xmm6 + vmovdqu [rsp + 1*16],xmm9 + vmovdqu [rsp + 2*16],xmm11 + vmovdqu [rsp + 3*16],xmm14 + vmovdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15 , [rsp + 4*16] + vmovdqu xmm14 , [rsp + 3*16] + vmovdqu xmm11 , [rsp + 2*16] + vmovdqu xmm9 , [rsp + 1*16] + vmovdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 +ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_),function,) +FN_NAME(enc,_): + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC + + FUNC_RESTORE + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_avx_gen2( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_),function,) +FN_NAME(dec,_): + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC + + FUNC_RESTORE + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm new file mode 100644 index 00000000..22dfe3cc --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_flush_avx.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define AES_CBC_ENC_X8 aes_cbc_enc_192_x8 +%define FLUSH_JOB_AES_ENC flush_job_aes192_enc_avx +%include "mb_mgr_aes_flush_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm new file mode 100644 index 00000000..ed1ce53e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes192_submit_avx.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define AES_CBC_ENC_X8 aes_cbc_enc_192_x8 +%define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_avx +%include "mb_mgr_aes_submit_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm new file mode 100644 index 00000000..96c5c351 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_flush_avx.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define AES_CBC_ENC_X8 aes_cbc_enc_256_x8 +%define FLUSH_JOB_AES_ENC flush_job_aes256_enc_avx +%include "mb_mgr_aes_flush_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm new file mode 100644 index 00000000..ccad4d0d --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes256_submit_avx.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
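As an aside on the GCM entry points declared earlier in this diff (aes_gcm_precomp_128_avx_gen2, aes_gcm_init_128_avx_gen2, aes_gcm_enc_128_update_avx_gen2, aes_gcm_enc_128_finalize_avx_gen2): the C sketch below only illustrates the intended call order and is not part of this tree. It assumes the AES-128 round keys have already been expanded into gcm_key_data by a separate key-expansion routine, which this diff does not contain, and the helper name gcm128_encrypt_two_chunks is made up for the example.

#include <stdint.h>

struct gcm_key_data;        /* opaque here; defined by the library headers */
struct gcm_context_data;    /* opaque here; per-message state (AadHash, counters, ...) */

/* Prototypes transcribed from the comment blocks above. */
void aes_gcm_precomp_128_avx_gen2(struct gcm_key_data *key_data);
void aes_gcm_init_128_avx_gen2(const struct gcm_key_data *key_data,
                               struct gcm_context_data *context_data,
                               uint8_t *iv, const uint8_t *aad, uint64_t aad_len);
void aes_gcm_enc_128_update_avx_gen2(const struct gcm_key_data *key_data,
                                     struct gcm_context_data *context_data,
                                     uint8_t *out, const uint8_t *in,
                                     uint64_t plaintext_len);
void aes_gcm_enc_128_finalize_avx_gen2(const struct gcm_key_data *key_data,
                                       struct gcm_context_data *context_data,
                                       uint8_t *auth_tag, uint64_t auth_tag_len);

/* Hypothetical helper: encrypt one message in two chunks and emit a 16-byte tag.
 * iv points at 12 bytes; GCM_INIT itself pads them with the 0x00000001 counter. */
void gcm128_encrypt_two_chunks(struct gcm_key_data *key,
                               struct gcm_context_data *ctx,
                               uint8_t *iv,
                               const uint8_t *aad, uint64_t aad_len,
                               const uint8_t *pt, uint64_t len,
                               uint8_t *ct, uint8_t tag[16])
{
        uint64_t half = len / 2;

        aes_gcm_precomp_128_avx_gen2(key);                    /* once per key: GHASH tables */
        aes_gcm_init_128_avx_gen2(key, ctx, iv, aad, aad_len);
        aes_gcm_enc_128_update_avx_gen2(key, ctx, ct, pt, half);
        aes_gcm_enc_128_update_avx_gen2(key, ctx, ct + half, pt + half, len - half);
        aes_gcm_enc_128_finalize_avx_gen2(key, ctx, tag, 16); /* 8, 12 or 16 byte tags */
}

The one-shot aes_gcm_enc_128_avx_gen2 / aes_gcm_dec_128_avx_gen2 wrappers earlier in the same file chain GCM_INIT, GCM_ENC_DEC and GCM_COMPLETE internally, so the explicit init/update/finalize split is only needed for streamed input.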
+;; + +%define AES_CBC_ENC_X8 aes_cbc_enc_256_x8 +%define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_avx +%include "mb_mgr_aes_submit_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm new file mode 100644 index 00000000..5ddceb84 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_flush_avx.asm @@ -0,0 +1,232 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%ifndef AES_CBC_ENC_X8 +%define AES_CBC_ENC_X8 aes_cbc_enc_128_x8 +%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_avx +%endif + +; void AES_CBC_ENC_X8(AES_ARGS_x8 *args, UINT64 len_in_bytes); +extern AES_CBC_ENC_X8 + +section .data +default rel +align 16 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 + ;ddq 0x000000000000FFFF0000000000000000 + dq 0x0000000000000000, 0x000000000000FFFF + ;ddq 0x00000000FFFF00000000000000000000 + dq 0x0000000000000000, 0x00000000FFFF0000 + ;ddq 0x0000FFFF000000000000000000000000 + dq 0x0000000000000000, 0x0000FFFF00000000 + ;ddq 0xFFFF0000000000000000000000000000 + dq 0x0000000000000000, 0xFFFF000000000000 +dupw: + ;ddq 0x01000100010001000100010001000100 + dq 0x0100010001000100, 0x0100010001000100 +one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 + +section .text + +%define APPEND(a,b) a %+ b + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +%define unused_lanes rbx +%define tmp1 rbx + +%define good_lane rdx +%define iv rdx + +%define tmp2 rax + +; idx needs to be in rbp +%define tmp rbp +%define idx rbp + +%define tmp3 r8 +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal) +FLUSH_JOB_AES_ENC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + ; check for empty + mov unused_lanes, [state + _aes_unused_lanes] + bt unused_lanes, 32+3 + jc return_null + + ; find a lane with a non-null job + xor good_lane, good_lane + cmp qword [state + _aes_job_in_lane + 1*8], 0 + cmovne good_lane, [rel one] + cmp qword [state + _aes_job_in_lane + 2*8], 0 + cmovne good_lane, [rel two] + cmp qword [state + _aes_job_in_lane + 3*8], 0 + cmovne good_lane, [rel three] + cmp qword [state + _aes_job_in_lane + 4*8], 0 + cmovne good_lane, [rel four] + cmp qword [state + _aes_job_in_lane + 5*8], 0 + cmovne good_lane, [rel five] + cmp qword [state + _aes_job_in_lane + 6*8], 0 + cmovne good_lane, [rel six] + cmp qword [state + _aes_job_in_lane + 7*8], 0 + cmovne good_lane, [rel seven] + + ; copy good_lane to empty lanes + mov tmp1, [state + _aes_args_in + good_lane*8] + mov tmp2, [state + _aes_args_out + good_lane*8] + mov tmp3, [state + _aes_args_keys + good_lane*8] + shl good_lane, 4 ; multiply by 16 + vmovdqa xmm2, [state + _aes_args_IV + good_lane] + vmovdqa xmm0, [state + _aes_lens] + +%assign I 0 +%rep 8 + cmp qword [state + _aes_job_in_lane + I*8], 0 + jne APPEND(skip_,I) + mov 
[state + _aes_args_in + I*8], tmp1 + mov [state + _aes_args_out + I*8], tmp2 + mov [state + _aes_args_keys + I*8], tmp3 + vmovdqa [state + _aes_args_IV + I*16], xmm2 + vpor xmm0, xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _aes_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_CBC_ENC_X8 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + mov job_rax, [state + _aes_job_in_lane + idx*8] +; Don't write back IV +; mov iv, [job_rax + _iv] + mov unused_lanes, [state + _aes_unused_lanes] + mov qword [state + _aes_job_in_lane + idx*8], 0 + or dword [job_rax + _status], STS_COMPLETED_AES + shl unused_lanes, 4 + or unused_lanes, idx +; shl idx, 4 ; multiply by 16 + mov [state + _aes_unused_lanes], unused_lanes +; vmovdqa xmm0, [state + _aes_args_IV + idx] +; vmovdqu [iv], xmm0 + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm new file mode 100644 index 00000000..5acaebb3 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_submit_avx.asm @@ -0,0 +1,188 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
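The length scheduling in the flush path above (and in the submit path that follows) is compact enough to miss: all eight lane lengths sit in one XMM register, lanes without a job are OR-ed with 0xFFFF through len_masks so they can never win the minimum search, vphminposuw returns the smallest length together with its lane index, and that minimum is broadcast with dupw and subtracted from every lane before the x8 routine runs for exactly that many bytes. A scalar C model of the same bookkeeping, for illustration only (not code from this tree):

#include <stdint.h>

#define NUM_LANES 8

/* Lanes with no job are forced to 0xFFFF (the len_masks OR) so they lose here. */
unsigned find_min_len_lane(const uint16_t lens[NUM_LANES], uint16_t *min_len)
{
        unsigned idx = 0;
        unsigned i;

        for (i = 1; i < NUM_LANES; i++)
                if (lens[i] < lens[idx])
                        idx = i;

        *min_len = lens[idx];   /* vphminposuw: min value ... */
        return idx;             /* ... and min index */
}

/* The x8 routine advances every lane by min_len bytes, so the stored lengths
 * are reduced by the broadcast minimum (vpshufb dupw + vpsubw) beforehand. */
void consume_min_len(uint16_t lens[NUM_LANES], uint16_t min_len)
{
        unsigned i;

        for (i = 0; i < NUM_LANES; i++)
                lens[i] -= min_len;
}

The lane that held the minimum ends up at length zero; that is the job the code then retires under len_is_0.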
+;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%ifndef AES_CBC_ENC_X8 +%define AES_CBC_ENC_X8 aes_cbc_enc_128_x8 +%define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_avx +%endif + +; void AES_CBC_ENC_X8(AES_ARGS_x8 *args, UINT64 len_in_bytes); +extern AES_CBC_ENC_X8 + +section .data +default rel + +align 16 +dupw: + ;ddq 0x01000100010001000100010001000100 + dq 0x0100010001000100, 0x0100010001000100 + +section .text + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +; idx needs to be in rbp +%define len rbp +%define idx rbp +%define tmp rbp + +%define lane r8 + +%define iv r9 + +%define unused_lanes rbx +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal) +SUBMIT_JOB_AES_ENC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _aes_unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + mov len, [job + _msg_len_to_cipher_in_bytes] + and len, -16 ; DOCSIS may pass size unaligned to block size + mov iv, [job + _iv] + mov [state + _aes_unused_lanes], unused_lanes + + mov [state + _aes_job_in_lane + lane*8], job + mov [state + _aes_lens + 2*lane], WORD(len) + + mov tmp, [job + _src] + add tmp, [job + _cipher_start_src_offset_in_bytes] + vmovdqu xmm0, [iv] + mov [state + _aes_args_in + lane*8], tmp + mov tmp, [job + _aes_enc_key_expanded] + mov [state + _aes_args_keys + lane*8], tmp + mov tmp, [job + _dst] + mov [state + _aes_args_out + lane*8], tmp + shl lane, 4 ; multiply by 16 + vmovdqa [state + _aes_args_IV + lane], xmm0 + + cmp unused_lanes, 0xf + jne return_null + + ; Find min length + vmovdqa xmm0, [state + _aes_lens] + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) + cmp len2, 0 + je len_is_0 + + vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _aes_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_CBC_ENC_X8 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + mov job_rax, [state + _aes_job_in_lane + idx*8] +; Don't write back IV +; mov iv, [job_rax + _iv] + mov unused_lanes, [state + _aes_unused_lanes] + mov qword [state + _aes_job_in_lane + idx*8], 0 + or dword [job_rax + _status], STS_COMPLETED_AES + shl unused_lanes, 4 + or unused_lanes, idx +; shl idx, 4 ; multiply by 16 + mov [state + _aes_unused_lanes], unused_lanes +; vmovdqa xmm0, [state + _aes_args_IV + idx] +; vmovdqu [iv], xmm0 + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov 
rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm new file mode 100644 index 00000000..baf19297 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_flush_avx.asm @@ -0,0 +1,248 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
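One more piece of shared bookkeeping in these submit/flush routines: _aes_unused_lanes is a stack of free 4-bit lane indices packed into a single 64-bit word; mb_mgr_avx.c further down in this diff initialises it to 0xF76543210, i.e. lanes 0..7 free with 0xF left on top as a sentinel. A small C model of the push/pop and the two guard tests, for illustration only (names are made up):

#include <stdint.h>

/* Pop a free lane:  mov lane, unused_lanes ; and lane, 0xF ; shr unused_lanes, 4 */
unsigned lane_pop(uint64_t *unused_lanes)
{
        unsigned lane = (unsigned)(*unused_lanes & 0xF);

        *unused_lanes >>= 4;
        return lane;
}

/* Return lane idx to the pool:  shl unused_lanes, 4 ; or unused_lanes, idx */
void lane_push(uint64_t *unused_lanes, unsigned idx)
{
        *unused_lanes = (*unused_lanes << 4) | idx;
}

/* flush's "check for empty" (bt unused_lanes, 32+3): the 0xF sentinel sits in
 * nibble 8 only while all eight lane indices are still stacked below it. */
int all_lanes_free(uint64_t unused_lanes)
{
        return (int)((unused_lanes >> 35) & 1);
}

/* submit's "cmp unused_lanes, 0xf": after the eighth pop only the sentinel is
 * left, which is the moment the x8 routine is actually kicked off. */
int all_lanes_busy(uint64_t unused_lanes)
{
        return unused_lanes == 0xF;
}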
+;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%ifndef AES_XCBC_X8 +%define AES_XCBC_X8 aes_xcbc_mac_128_x8 +%define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx +%endif + +; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); +extern AES_XCBC_X8 + +section .data +default rel + +align 16 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 + ;ddq 0x000000000000FFFF0000000000000000 + dq 0x0000000000000000, 0x000000000000FFFF + ;ddq 0x00000000FFFF00000000000000000000 + dq 0x0000000000000000, 0x00000000FFFF0000 + ;ddq 0x0000FFFF000000000000000000000000 + dq 0x0000000000000000, 0x0000FFFF00000000 + ;ddq 0xFFFF0000000000000000000000000000 + dq 0x0000000000000000, 0xFFFF000000000000 +dupw: + ;ddq 0x01000100010001000100010001000100 + dq 0x0100010001000100, 0x0100010001000100 +one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 + +section .text + +%define APPEND(a,b) a %+ b + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +%define unused_lanes rbx +%define tmp1 rbx + +%define icv rdx + +%define tmp2 rax + +; idx needs to be in rbp +%define tmp r10 +%define idx rbp + +%define tmp3 r8 +%define lane_data r9 +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* FLUSH_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(FLUSH_JOB_AES_XCBC,function,internal) +FLUSH_JOB_AES_XCBC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + ; check for empty + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + bt unused_lanes, 32+3 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _aes_xcbc_ldata + 1 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel one] + cmp qword [state + _aes_xcbc_ldata + 2 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel two] + cmp qword [state + _aes_xcbc_ldata + 3 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel three] + cmp qword [state + _aes_xcbc_ldata + 4 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel four] + cmp qword [state + _aes_xcbc_ldata + 5 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel five] + cmp qword [state + _aes_xcbc_ldata + 6 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel six] + cmp qword [state + _aes_xcbc_ldata + 7 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel seven] + +copy_lane_data: + ; copy idx to empty lanes + mov tmp1, [state + _aes_xcbc_args_in + idx*8] + mov tmp3, [state + _aes_xcbc_args_keys + idx*8] + shl idx, 4 ; multiply by 16 + 
vmovdqa xmm2, [state + _aes_xcbc_args_ICV + idx] + vmovdqa xmm0, [state + _aes_xcbc_lens] + +%assign I 0 +%rep 8 + cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _aes_xcbc_args_in + I*8], tmp1 + mov [state + _aes_xcbc_args_keys + I*8], tmp3 + vmovdqa [state + _aes_xcbc_args_ICV + I*16], xmm2 + vpor xmm0, xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + vmovdqa [state + _aes_xcbc_lens], xmm0 + + ; Find min length + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _aes_xcbc_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_XCBC_X8 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _XCBC_LANE_DATA_size + lea lane_data, [state + _aes_xcbc_ldata + lane_data] + cmp dword [lane_data + _xcbc_final_done], 0 + jne end_loop + + mov dword [lane_data + _xcbc_final_done], 1 + mov word [state + _aes_xcbc_lens + 2*idx], 16 + lea tmp, [lane_data + _xcbc_final_block] + mov [state + _aes_xcbc_args_in + 8*idx], tmp + jmp copy_lane_data + +end_loop: + mov job_rax, [lane_data + _xcbc_job_in_lane] + mov icv, [job_rax + _auth_tag_output] + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + mov qword [lane_data + _xcbc_job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 4 + or unused_lanes, idx + shl idx, 4 ; multiply by 16 + mov [state + _aes_xcbc_unused_lanes], unused_lanes + + ; copy 12 bytes + vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx] + vmovq [icv], xmm0 + vpextrd [icv + 8], xmm0, 2 + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm new file mode 100644 index 00000000..cef293c5 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_aes_xcbc_submit_avx.asm @@ -0,0 +1,257 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%include "memcpy.asm" + +%ifndef AES_XCBC_X8 +%define AES_XCBC_X8 aes_xcbc_mac_128_x8 +%define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx +%endif + +; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); +extern AES_XCBC_X8 + + +section .data +default rel + +align 16 +dupw: ;ddq 0x01000100010001000100010001000100 + dq 0x0100010001000100, 0x0100010001000100 +x80: ;ddq 0x00000000000000000000000000000080 + dq 0x0000000000000080, 0x0000000000000000 + +section .text + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +; idx needs to be in rbp +%define len r11 +%define idx rbp +%define tmp2 rbp +%define tmp r14 + +%define lane r8 +%define icv r9 +%define p2 r9 + +%define last_len r10 + +%define lane_data r12 +%define p r13 + +%define unused_lanes rbx +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal) +SUBMIT_JOB_AES_XCBC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _XCBC_LANE_DATA_size + lea lane_data, [state + _aes_xcbc_ldata + lane_data] + mov len, [job + _msg_len_to_hash_in_bytes] + mov [state + _aes_xcbc_unused_lanes], unused_lanes + mov [lane_data + _xcbc_job_in_lane], job + mov dword [lane_data + _xcbc_final_done], 0 + mov tmp, [job + _k1_expanded] + mov [state + _aes_xcbc_args_keys + lane*8], tmp + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + + mov last_len, len + + cmp len, 16 + jle small_buffer + + mov [state + _aes_xcbc_args_in + lane*8], p + add p, len ; set point to end of data + + and last_len, 15 ; Check lsbs of msg len + jnz slow_copy ; if not 16B mult, do slow copy + +fast_copy: + vmovdqu xmm0, [p - 16] ; load last block M[n] + mov tmp, [job + _k2] ; load K2 address + vmovdqu xmm1, [tmp] ; load K2 + vpxor xmm0, xmm0, xmm1 ; M[n] XOR K2 + 
vmovdqa [lane_data + _xcbc_final_block], xmm0 + sub len, 16 ; take last block off length +end_fast_copy: + + mov [state + _aes_xcbc_lens + 2*lane], WORD(len) + + vpxor xmm0, xmm0, xmm0 + shl lane, 4 ; multiply by 16 + vmovdqa [state + _aes_xcbc_args_ICV + lane], xmm0 + + cmp unused_lanes, 0xf + jne return_null + +start_loop: + ; Find min length + vmovdqa xmm0, [state + _aes_xcbc_lens] + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...7) + cmp len2, 0 + je len_is_0 + + vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _aes_xcbc_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_XCBC_X8 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _XCBC_LANE_DATA_size + lea lane_data, [state + _aes_xcbc_ldata + lane_data] + cmp dword [lane_data + _xcbc_final_done], 0 + jne end_loop + + mov dword [lane_data + _xcbc_final_done], 1 + mov word [state + _aes_xcbc_lens + 2*idx], 16 + lea tmp, [lane_data + _xcbc_final_block] + mov [state + _aes_xcbc_args_in + 8*idx], tmp + jmp start_loop + +end_loop: + ; process completed job "idx" + mov job_rax, [lane_data + _xcbc_job_in_lane] + mov icv, [job_rax + _auth_tag_output] + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + mov qword [lane_data + _xcbc_job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 4 + or unused_lanes, idx + shl idx, 4 ; multiply by 16 + mov [state + _aes_xcbc_unused_lanes], unused_lanes + + ; copy 12 bytes + vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx] + vmovq [icv], xmm0 + vpextrd [icv + 8], xmm0, 2 + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +small_buffer: + ; For buffers <= 16 Bytes + ; The input data is set to final block + lea tmp, [lane_data + _xcbc_final_block] ; final block + mov [state + _aes_xcbc_args_in + lane*8], tmp + add p, len ; set point to end of data + cmp len, 16 + je fast_copy + +slow_copy: + and len, ~15 ; take final block off len + sub p, last_len ; adjust data pointer + lea p2, [lane_data + _xcbc_final_block + 16] ; upper part of final + sub p2, last_len ; adjust data pointer backwards + memcpy_avx_16_1 p2, p, last_len, tmp, tmp2 + vmovdqa xmm0, [rel x80] ; fill reg with padding + vmovdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding + vmovdqu xmm0, [p2] ; load final block to process + mov tmp, [job + _k3] ; load K3 address + vmovdqu xmm1, [tmp] ; load K3 + vpxor xmm0, xmm0, xmm1 ; M[n] XOR K3 + vmovdqu [lane_data + _xcbc_final_block], xmm0 ; write final block + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c b/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c new file mode 100644 index 00000000..1227311d --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_avx.c @@ -0,0 +1,490 @@ +/******************************************************************************* + Copyright (c) 2012-2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, 
are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "intel-ipsec-mb.h" +#include "save_xmms.h" +#include "asm.h" +#include "des.h" + +JOB_AES_HMAC *submit_job_aes128_enc_avx(MB_MGR_AES_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes128_enc_avx(MB_MGR_AES_OOO *state); + +JOB_AES_HMAC *submit_job_aes192_enc_avx(MB_MGR_AES_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes192_enc_avx(MB_MGR_AES_OOO *state); + +JOB_AES_HMAC *submit_job_aes256_enc_avx(MB_MGR_AES_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes256_enc_avx(MB_MGR_AES_OOO *state); + +JOB_AES_HMAC *submit_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes_xcbc_avx(MB_MGR_AES_XCBC_OOO *state); + + +#define SAVE_XMMS save_xmms_avx +#define RESTORE_XMMS restore_xmms_avx +#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_avx +#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx +#define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_avx +#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_avx +#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx +#define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_avx +#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_avx +#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx +#define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_avx + + +#define SUBMIT_JOB_AES128_CNTR submit_job_aes128_cntr_avx +#define SUBMIT_JOB_AES192_CNTR submit_job_aes192_cntr_avx +#define SUBMIT_JOB_AES256_CNTR submit_job_aes256_cntr_avx + +#define AES_CBC_DEC_128 aes_cbc_dec_128_avx +#define AES_CBC_DEC_192 aes_cbc_dec_192_avx +#define AES_CBC_DEC_256 aes_cbc_dec_256_avx + +#define AES_CNTR_128 aes_cntr_128_avx +#define AES_CNTR_192 aes_cntr_192_avx +#define AES_CNTR_256 aes_cntr_256_avx + +#ifndef NO_GCM +#define AES_GCM_DEC_128 aes_gcm_dec_128_avx_gen2 +#define AES_GCM_ENC_128 aes_gcm_enc_128_avx_gen2 +#define AES_GCM_DEC_192 aes_gcm_dec_192_avx_gen2 +#define AES_GCM_ENC_192 aes_gcm_enc_192_avx_gen2 +#define AES_GCM_DEC_256 aes_gcm_dec_256_avx_gen2 +#define AES_GCM_ENC_256 
aes_gcm_enc_256_avx_gen2 +#endif + +#define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx +#define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx + +#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_avx +#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_avx +#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_avx +#define QUEUE_SIZE queue_size_avx + +#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_AVX +#define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_AVX +#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_AVX +#define FLUSH_JOB_AES_DEC FLUSH_JOB_AES_DEC_AVX + + + +JOB_AES_HMAC *submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_224_avx(MB_MGR_HMAC_SHA_256_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_224_avx(MB_MGR_HMAC_SHA_256_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_256_avx(MB_MGR_HMAC_SHA_256_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_256_avx(MB_MGR_HMAC_SHA_256_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_384_avx(MB_MGR_HMAC_SHA_512_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_384_avx(MB_MGR_HMAC_SHA_512_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_512_avx(MB_MGR_HMAC_SHA_512_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_512_avx(MB_MGR_HMAC_SHA_512_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state); + +#define SUBMIT_JOB_HMAC submit_job_hmac_avx +#define FLUSH_JOB_HMAC flush_job_hmac_avx +#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_avx +#define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_avx +#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_avx +#define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_avx +#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_avx +#define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_avx +#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_avx +#define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_avx +#define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_avx +#define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_avx + +/* ====================================================================== */ + +#define SUBMIT_JOB submit_job_avx +#define FLUSH_JOB flush_job_avx +#define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx + + +/* ====================================================================== */ + + +#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AVX +#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AVX + +/* ====================================================================== */ + +#define AES_CFB_128_ONE aes_cfb_128_one_avx + +void aes128_cbc_mac_x8(AES_ARGS_x8 *args, uint64_t len); + +#define AES128_CBC_MAC aes128_cbc_mac_x8 + +#define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_arch +#define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_arch +#define AES_CCM_MAX_JOBS 8 + +#define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_arch +#define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_arch +#define AES_CMAC_MAX_JOBS 8 + +/* ====================================================================== */ + +void +init_mb_mgr_avx(MB_MGR *state) +{ + unsigned int j; + uint8_t *p; + + /* Init AES out-of-order fields */ + state->aes128_ooo.lens[0] = 0; + state->aes128_ooo.lens[1] = 0; + state->aes128_ooo.lens[2] = 0; + state->aes128_ooo.lens[3] = 0; + state->aes128_ooo.lens[4] = 0; + state->aes128_ooo.lens[5] = 0; + 
state->aes128_ooo.lens[6] = 0; + state->aes128_ooo.lens[7] = 0; + state->aes128_ooo.unused_lanes = 0xF76543210; + state->aes128_ooo.job_in_lane[0] = NULL; + state->aes128_ooo.job_in_lane[1] = NULL; + state->aes128_ooo.job_in_lane[2] = NULL; + state->aes128_ooo.job_in_lane[3] = NULL; + state->aes128_ooo.job_in_lane[4] = NULL; + state->aes128_ooo.job_in_lane[5] = NULL; + state->aes128_ooo.job_in_lane[6] = NULL; + state->aes128_ooo.job_in_lane[7] = NULL; + + state->aes192_ooo.lens[0] = 0; + state->aes192_ooo.lens[1] = 0; + state->aes192_ooo.lens[2] = 0; + state->aes192_ooo.lens[3] = 0; + state->aes192_ooo.lens[4] = 0; + state->aes192_ooo.lens[5] = 0; + state->aes192_ooo.lens[6] = 0; + state->aes192_ooo.lens[7] = 0; + state->aes192_ooo.unused_lanes = 0xF76543210; + state->aes192_ooo.job_in_lane[0] = NULL; + state->aes192_ooo.job_in_lane[1] = NULL; + state->aes192_ooo.job_in_lane[2] = NULL; + state->aes192_ooo.job_in_lane[3] = NULL; + state->aes192_ooo.job_in_lane[4] = NULL; + state->aes192_ooo.job_in_lane[5] = NULL; + state->aes192_ooo.job_in_lane[6] = NULL; + state->aes192_ooo.job_in_lane[7] = NULL; + + + state->aes256_ooo.lens[0] = 0; + state->aes256_ooo.lens[1] = 0; + state->aes256_ooo.lens[2] = 0; + state->aes256_ooo.lens[3] = 0; + state->aes256_ooo.lens[4] = 0; + state->aes256_ooo.lens[5] = 0; + state->aes256_ooo.lens[6] = 0; + state->aes256_ooo.lens[7] = 0; + state->aes256_ooo.unused_lanes = 0xF76543210; + state->aes256_ooo.job_in_lane[0] = NULL; + state->aes256_ooo.job_in_lane[1] = NULL; + state->aes256_ooo.job_in_lane[2] = NULL; + state->aes256_ooo.job_in_lane[3] = NULL; + state->aes256_ooo.job_in_lane[4] = NULL; + state->aes256_ooo.job_in_lane[5] = NULL; + state->aes256_ooo.job_in_lane[6] = NULL; + state->aes256_ooo.job_in_lane[7] = NULL; + + /* DOCSIS SEC BPI uses same settings as AES128 CBC */ + state->docsis_sec_ooo.lens[0] = 0; + state->docsis_sec_ooo.lens[1] = 0; + state->docsis_sec_ooo.lens[2] = 0; + state->docsis_sec_ooo.lens[3] = 0; + state->docsis_sec_ooo.lens[4] = 0; + state->docsis_sec_ooo.lens[5] = 0; + state->docsis_sec_ooo.lens[6] = 0; + state->docsis_sec_ooo.lens[7] = 0; + state->docsis_sec_ooo.unused_lanes = 0xF76543210; + state->docsis_sec_ooo.job_in_lane[0] = NULL; + state->docsis_sec_ooo.job_in_lane[1] = NULL; + state->docsis_sec_ooo.job_in_lane[2] = NULL; + state->docsis_sec_ooo.job_in_lane[3] = NULL; + state->docsis_sec_ooo.job_in_lane[4] = NULL; + state->docsis_sec_ooo.job_in_lane[5] = NULL; + state->docsis_sec_ooo.job_in_lane[6] = NULL; + state->docsis_sec_ooo.job_in_lane[7] = NULL; + + /* Init HMAC/SHA1 out-of-order fields */ + state->hmac_sha_1_ooo.lens[0] = 0; + state->hmac_sha_1_ooo.lens[1] = 0; + state->hmac_sha_1_ooo.lens[2] = 0; + state->hmac_sha_1_ooo.lens[3] = 0; + state->hmac_sha_1_ooo.lens[4] = 0xFFFF; + state->hmac_sha_1_ooo.lens[5] = 0xFFFF; + state->hmac_sha_1_ooo.lens[6] = 0xFFFF; + state->hmac_sha_1_ooo.lens[7] = 0xFFFF; + state->hmac_sha_1_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < AVX_NUM_SHA1_LANES; j++) { + state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL; + state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65, + 0x00, + 64+7); + p = state->hmac_sha_1_ooo.ldata[j].outer_block; + memset(p + 5*4 + 1, + 0x00, + 64 - 5*4 - 1 - 2); + p[5*4] = 0x80; + p[64-2] = 0x02; + p[64-1] = 0xA0; + } + /* Init HMAC/SHA224 out-of-order fields */ + state->hmac_sha_224_ooo.lens[0] = 0; + state->hmac_sha_224_ooo.lens[1] = 0; + state->hmac_sha_224_ooo.lens[2] = 0; + state->hmac_sha_224_ooo.lens[3] 
= 0; + state->hmac_sha_224_ooo.lens[4] = 0xFFFF; + state->hmac_sha_224_ooo.lens[5] = 0xFFFF; + state->hmac_sha_224_ooo.lens[6] = 0xFFFF; + state->hmac_sha_224_ooo.lens[7] = 0xFFFF; + state->hmac_sha_224_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < AVX_NUM_SHA256_LANES; j++) { + state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL; + state->hmac_sha_224_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_sha_224_ooo.ldata[j].extra_block + 65, + 0x00, + 64+7); + p = state->hmac_sha_224_ooo.ldata[j].outer_block; + memset(p + 8*4 + 1, + 0x00, + 64 - 8*4 - 1 - 2); + p[7 * 4] = 0x80; /* digest 7 words long */ + p[64 - 2] = 0x02; /* length in little endian = 0x02E0 */ + p[64 - 1] = 0xE0; + } + + /* Init HMAC/SHA256 out-of-order fields */ + state->hmac_sha_256_ooo.lens[0] = 0; + state->hmac_sha_256_ooo.lens[1] = 0; + state->hmac_sha_256_ooo.lens[2] = 0; + state->hmac_sha_256_ooo.lens[3] = 0; + state->hmac_sha_256_ooo.lens[4] = 0xFFFF; + state->hmac_sha_256_ooo.lens[5] = 0xFFFF; + state->hmac_sha_256_ooo.lens[6] = 0xFFFF; + state->hmac_sha_256_ooo.lens[7] = 0xFFFF; + state->hmac_sha_256_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < AVX_NUM_SHA256_LANES; j++) { + state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL; + state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65, + 0x00, + 64+7); + p = state->hmac_sha_256_ooo.ldata[j].outer_block; + memset(p + 8*4 + 1, + 0x00, + 64 - 8*4 - 1 - 2); + p[8 * 4] = 0x80; /* 8 digest words */ + p[64 - 2] = 0x03; /* length */ + p[64 - 1] = 0x00; + } + + + /* Init HMAC/SHA384 out-of-order fields */ + state->hmac_sha_384_ooo.lens[0] = 0; + state->hmac_sha_384_ooo.lens[1] = 0; + state->hmac_sha_384_ooo.lens[2] = 0xFFFF; + state->hmac_sha_384_ooo.lens[3] = 0xFFFF; + state->hmac_sha_384_ooo.lens[4] = 0xFFFF; + state->hmac_sha_384_ooo.lens[5] = 0xFFFF; + state->hmac_sha_384_ooo.lens[6] = 0xFFFF; + state->hmac_sha_384_ooo.lens[7] = 0xFFFF; + state->hmac_sha_384_ooo.unused_lanes = 0xFF0100; + for (j = 0; j < AVX_NUM_SHA512_LANES; j++) { + MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo; + + ctx->ldata[j].job_in_lane = NULL; + ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80; + memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1), + 0x00, SHA_384_BLOCK_SIZE + 7); + + p = ctx->ldata[j].outer_block; + memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00, + /* special end point because this length is constant */ + SHA_384_BLOCK_SIZE - + SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2); + /* mark the end */ + p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80; + /* hmac outer block length always of fixed size, + * it is OKey length, a whole message block length, 1024 bits, + * with padding plus the length of the inner digest, + * which is 384 bits, 1408 bits == 0x0580. + * The input message block needs to be converted to big endian + * within the sha implementation before use. 
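+	 * Spelled out, since the numbers matter here: SHA-384 hashes
+	 * 128-byte (1024-bit) blocks, so the outer hash input is the
+	 * 128-byte opad block plus the 48-byte (384-bit) inner digest,
+	 * i.e. 1024 + 384 = 1408 bits = 0x0580 - which is why 0x05 and
+	 * 0x80 are stored as the big-endian length bytes just below.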
+ */ + p[SHA_384_BLOCK_SIZE - 2] = 0x05; + p[SHA_384_BLOCK_SIZE - 1] = 0x80; + } + + /* Init HMAC/SHA512 out-of-order fields */ + state->hmac_sha_512_ooo.lens[0] = 0; + state->hmac_sha_512_ooo.lens[1] = 0; + state->hmac_sha_512_ooo.lens[2] = 0xFFFF; + state->hmac_sha_512_ooo.lens[3] = 0xFFFF; + state->hmac_sha_512_ooo.lens[4] = 0xFFFF; + state->hmac_sha_512_ooo.lens[5] = 0xFFFF; + state->hmac_sha_512_ooo.lens[6] = 0xFFFF; + state->hmac_sha_512_ooo.lens[7] = 0xFFFF; + state->hmac_sha_512_ooo.unused_lanes = 0xFF0100; + for (j = 0; j < AVX_NUM_SHA512_LANES; j++) { + MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo; + + ctx->ldata[j].job_in_lane = NULL; + ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80; + memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1), + 0x00, SHA_512_BLOCK_SIZE + 7); + p = ctx->ldata[j].outer_block; + memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00, + /* special end point because this length is constant */ + SHA_512_BLOCK_SIZE - + SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2); + /* mark the end */ + p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80; + /* + * hmac outer block length always of fixed size, + * it is OKey length, a whole message block length, 1024 bits, + * with padding plus the length of the inner digest, + * which is 512 bits, 1536 bits == 0x600. + * The input message block needs to be converted to big endian + * within the sha implementation before use. + */ + p[SHA_512_BLOCK_SIZE - 2] = 0x06; + p[SHA_512_BLOCK_SIZE - 1] = 0x00; + } + + + /* Init HMAC/MD5 out-of-order fields */ + state->hmac_md5_ooo.lens[0] = 0; + state->hmac_md5_ooo.lens[1] = 0; + state->hmac_md5_ooo.lens[2] = 0; + state->hmac_md5_ooo.lens[3] = 0; + state->hmac_md5_ooo.lens[4] = 0; + state->hmac_md5_ooo.lens[5] = 0; + state->hmac_md5_ooo.lens[6] = 0; + state->hmac_md5_ooo.lens[7] = 0; + state->hmac_md5_ooo.lens[8] = 0xFFFF; + state->hmac_md5_ooo.lens[9] = 0xFFFF; + state->hmac_md5_ooo.lens[10] = 0xFFFF; + state->hmac_md5_ooo.lens[11] = 0xFFFF; + state->hmac_md5_ooo.lens[12] = 0xFFFF; + state->hmac_md5_ooo.lens[13] = 0xFFFF; + state->hmac_md5_ooo.lens[14] = 0xFFFF; + state->hmac_md5_ooo.lens[15] = 0xFFFF; + state->hmac_md5_ooo.unused_lanes = 0xF76543210; + for (j = 0; j < AVX_NUM_MD5_LANES; j++) { + state->hmac_md5_ooo.ldata[j].job_in_lane = NULL; + state->hmac_md5_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_md5_ooo.ldata[j].extra_block + 65, + 0x00, + 64 + 7); + p = state->hmac_md5_ooo.ldata[j].outer_block; + memset(p + 5*4 + 1, + 0x00, + 64 - 5*4 - 1 - 2); + p[4 * 4] = 0x80; + p[64 - 7] = 0x02; + p[64 - 8] = 0x80; + } + + /* Init AES/XCBC OOO fields */ + state->aes_xcbc_ooo.lens[0] = 0; + state->aes_xcbc_ooo.lens[1] = 0; + state->aes_xcbc_ooo.lens[2] = 0; + state->aes_xcbc_ooo.lens[3] = 0; + state->aes_xcbc_ooo.lens[4] = 0; + state->aes_xcbc_ooo.lens[5] = 0; + state->aes_xcbc_ooo.lens[6] = 0; + state->aes_xcbc_ooo.lens[7] = 0; + state->aes_xcbc_ooo.unused_lanes = 0xF76543210; + for (j = 0; j < 8; j++) { + state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL; + state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80; + memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15); + } + + /* Init AES-CCM auth out-of-order fields */ + for (j = 0; j < 8; j++) { + state->aes_ccm_ooo.init_done[j] = 0; + state->aes_ccm_ooo.lens[j] = 0; + state->aes_ccm_ooo.job_in_lane[j] = NULL; + } + state->aes_ccm_ooo.unused_lanes = 0xF76543210; + + /* Init AES-CMAC auth out-of-order fields */ + for (j = 0; j < 8; j++) { + state->aes_cmac_ooo.init_done[j] = 0; + state->aes_cmac_ooo.lens[j] = 0; + 
state->aes_cmac_ooo.job_in_lane[j] = NULL; + } + state->aes_cmac_ooo.unused_lanes = 0xF76543210; + + /* Init "in order" components */ + state->next_job = 0; + state->earliest_job = -1; + + /* set AVX handlers */ + state->get_next_job = get_next_job_avx; + state->submit_job = submit_job_avx; + state->submit_job_nocheck = submit_job_nocheck_avx; + state->get_completed_job = get_completed_job_avx; + state->flush_job = flush_job_avx; + state->queue_size = queue_size_avx; + state->keyexp_128 = aes_keyexp_128_avx; + state->keyexp_192 = aes_keyexp_192_avx; + state->keyexp_256 = aes_keyexp_256_avx; + state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_avx; + state->xcbc_keyexp = aes_xcbc_expand_key_avx; + state->des_key_sched = des_key_schedule; + state->sha1_one_block = sha1_one_block_avx; + state->sha224_one_block = sha224_one_block_avx; + state->sha256_one_block = sha256_one_block_avx; + state->sha384_one_block = sha384_one_block_avx; + state->sha512_one_block = sha512_one_block_avx; + state->md5_one_block = md5_one_block_avx; +} + +#include "mb_mgr_code.h" diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm new file mode 100644 index 00000000..18a2f453 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_flush_avx.asm @@ -0,0 +1,250 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +extern sha1_mult_avx + +section .data +default rel + +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +x80: ;ddq 0x00000000000000000000000000000080 + dq 0x0000000000000080, 0x0000000000000000 +x00: ;ddq 0x00000000000000000000000000000000 + dq 0x0000000000000000, 0x0000000000000000 + +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%endif + +; This routine clobbers rbx, rbp +struc STACK +_gpr_save: resq 2 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* flush_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(flush_job_hmac_avx,function,internal) +flush_job_hmac_avx: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel one] + cmp qword [state + _ldata + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel two] + cmp qword [state + _ldata + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel three] + +copy_lane_data: + ; copy valid lane (idx) to empty lanes + vmovdqa xmm0, [state + _lens] + mov tmp, [state + _args_data_ptr + PTR_SZ*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args_data_ptr + PTR_SZ*I], tmp + vpor xmm0, xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + vmovdqa [state + _lens], xmm0 + + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshuflw xmm1, xmm1, 0 + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mult_avx + ; state is intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens + 2*idx], 1 + lea tmp, 
[lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + + ;; idx determines which column + ;; read off from consecutive rows + vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 + vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 + vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 + vpshufb xmm0, xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + vmovdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*4], DWORD(tmp) + + mov tmp, [job + _auth_key_xor_opad] + vmovdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp2) + bswap DWORD(tmp4) + bswap DWORD(tmp3) + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov [p + 2*4], DWORD(tmp3) + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm new file mode 100644 index 00000000..b715d8a5 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_flush_avx.asm @@ -0,0 +1,277 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +extern md5_x4x2_avx + +section .data +default rel +align 16 +dupw: ;ddq 0x01000100010001000100010001000100 + dq 0x0100010001000100, 0x0100010001000100 +x80: ;ddq 0x00000000000000000000000000000080 + dq 0x0000000000000080, 0x0000000000000000 +x00: ;ddq 0x00000000000000000000000000000000 + dq 0x0000000000000000, 0x0000000000000000 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 + ;ddq 0x000000000000FFFF0000000000000000 + dq 0x0000000000000000, 0x000000000000FFFF + ;ddq 0x00000000FFFF00000000000000000000 + dq 0x0000000000000000, 0x00000000FFFF0000 + ;ddq 0x0000FFFF000000000000000000000000 + dq 0x0000000000000000, 0x0000FFFF00000000 + ;ddq 0xFFFF0000000000000000000000000000 + dq 0x0000000000000000, 0xFFFF000000000000 +one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbp +%define idx rbp + +; unused_lanes must be in rax-rdx +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%endif + +; This routine and/or the called routine clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* flush_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(flush_job_hmac_md5_avx,function,internal) +flush_job_hmac_md5_avx: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save 
+ 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_md5] + bt unused_lanes, 32+3 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel one] + cmp qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel two] + cmp qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel three] + cmp qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel four] + cmp qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel five] + cmp qword [state + _ldata_md5 + 6 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel six] + cmp qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel seven] + +copy_lane_data: + ; copy good lane (idx) to empty lanes + vmovdqa xmm0, [state + _lens_md5] + mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args_data_ptr_md5 + PTR_SZ*I], tmp + vpor xmm0, xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + vmovdqa [state + _lens_md5], xmm0 + + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshufb xmm1, [rel dupw] ; duplicate words across all lanes + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens_md5], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_x4x2_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_md5 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_md5 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + + vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 + vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 + vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 + vmovdqa [lane_data + _outer_block], xmm0 + + mov tmp, [job + _auth_key_xor_opad] + vmovdqu xmm0, [tmp] + vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 + vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) 
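+	;; the next few instructions repoint this lane's data pointer at the
+	;; padded tail buffered at [lane_data + _extra_block + start_offset],
+	;; clear _extra_blocks and loop again so that the tail gets hashed too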
+ lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_md5] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes_md5], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] +; bswap DWORD(tmp2) +; bswap DWORD(tmp4) +; bswap DWORD(tmp3) + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov [p + 2*4], DWORD(tmp3) + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm new file mode 100644 index 00000000..a81f535c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_md5_submit_avx.asm @@ -0,0 +1,309 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "memcpy.asm" +%include "reg_sizes.asm" + +extern md5_x4x2_avx + +section .data +default rel +align 16 +dupw: ;ddq 0x01000100010001000100010001000100 + dq 0x0100010001000100, 0x0100010001000100 + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbp +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine and/or the called routine clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* submit_job_hmac_md5_avx(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(submit_job_hmac_md5_avx,function,internal) +submit_job_hmac_md5_avx: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_md5] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_md5 + lane_data] + mov [state + _unused_lanes_md5], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + mov [state + _lens_md5 + 2*lane], WORD(tmp) + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p + + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + vmovdqu xmm0, [p - 64 + 0*16] + vmovdqu xmm1, [p - 64 + 1*16] + vmovdqu xmm2, [p - 64 + 2*16] + vmovdqu xmm3, [p - 64 + 3*16] + vmovdqa [lane_data + _extra_block + 0*16], xmm0 + vmovdqa [lane_data + _extra_block + 1*16], xmm1 + vmovdqa [lane_data + _extra_block + 2*16], xmm2 + vmovdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] +; bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + vmovdqu xmm0, [tmp] + vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 
+ vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 + + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + mov [state + _lens_md5 + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xf + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens_md5] + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshufb xmm1, xmm1, [rel dupw] ; duplicate words across all lanes + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens_md5], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_x4x2_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_md5 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_md5 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + + vmovd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 + vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 + vpinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 +; vpshufb xmm0, [byteswap wrt rip] + vmovdqa [lane_data + _outer_block], xmm0 + + mov tmp, [job + _auth_key_xor_opad] + vmovdqu xmm0, [tmp] + vmovd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 + vpextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 + +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + ;; p2 clobbers unused_lanes, undo before exiting + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_md5] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes_md5] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes_md5], unused_lanes + + mov 
p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] + mov [p + 0*4], DWORD(tmp) + mov [p + 1*4], DWORD(tmp2) + mov [p + 2*4], DWORD(tmp3) + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm new file mode 100644 index 00000000..525c5439 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_flush_avx.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define FUNC flush_job_hmac_sha_224_avx +%define SHA224 + +%include "mb_mgr_hmac_sha_256_flush_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm new file mode 100644 index 00000000..d6ca82bf --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_224_submit_avx.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define FUNC submit_job_hmac_sha_224_avx +%define SHA224 + +%include "mb_mgr_hmac_sha_256_submit_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm new file mode 100644 index 00000000..57b9d4de --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_flush_avx.asm @@ -0,0 +1,274 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +extern sha_256_mult_avx + +section .data +default rel +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%ifndef FUNC +%define FUNC flush_job_hmac_sha_256_avx +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r13-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%define tmp5 r9 + +%define tmp6 r10 + +%endif + +; This routine clobbers rbx, rbp; called routine also clobbers r12 +struc STACK +_gpr_save: resq 3 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha256] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel one] + cmp qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel two] + cmp qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel three] + +copy_lane_data: + ; copy idx to empty lanes + vmovdqa xmm0, [state + _lens_sha256] + mov tmp, [state + _args_data_ptr_sha256 + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args_data_ptr_sha256 + 8*I], tmp + vpor xmm0, xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + vmovdqa [state + _lens_sha256], xmm0 + + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshuflw xmm1, xmm1, 0 + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens_sha256], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha_256_mult_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_sha256 + 
2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + + vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 + vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 + vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 + vpshufb xmm0, xmm0, [rel byteswap] + vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] + vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 + vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 +%ifndef SHA224 + vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 +%endif + vpshufb xmm1, xmm1, [rel byteswap] + + vmovdqa [lane_data + _outer_block], xmm0 + vmovdqa [lane_data + _outer_block + 4*4], xmm1 +%ifdef SHA224 + mov dword [lane_data + _outer_block + 7*4], 0x80 +%endif + + mov tmp, [job + _auth_key_xor_opad] + vmovdqu xmm0, [tmp] + vmovdqu xmm1, [tmp + 4*4] + vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 + vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 + vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 + vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 + vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 + vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_sha256] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha256], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 14 bytes for SHA224 and 16 bytes for SHA256 + mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] + + bswap DWORD(tmp2) + bswap DWORD(tmp4) + bswap DWORD(tmp6) + bswap DWORD(tmp5) + + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov [p + 2*4], DWORD(tmp6) + +%ifdef SHA224 + mov [p + 3*4], WORD(tmp5) +%else + mov [p + 3*4], DWORD(tmp5) +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git 
a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm new file mode 100644 index 00000000..e83c23c6 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_256_submit_avx.asm @@ -0,0 +1,340 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +%include "memcpy.asm" + +extern sha_256_mult_avx + +section .data +default rel +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +%ifndef FUNC +%define FUNC submit_job_hmac_sha_256_avx +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r13-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine clobbers rbx, rbp, rsi, rdi; called routine also clobbers r12 +struc STACK +_gpr_save: resq 5 +_rsp_save: resq 1 +endstruc + +; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 +%ifndef LINUX + mov [rsp + _gpr_save + 8*3], rsi + mov [rsp + _gpr_save + 8*4], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha256] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov [state + _unused_lanes_sha256], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + mov [state + _lens_sha256 + 2*lane], WORD(tmp) + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_sha256 + 8*lane], p + + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + vmovdqu xmm0, [p - 64 + 0*16] + vmovdqu xmm1, [p - 64 + 1*16] + vmovdqu xmm2, [p - 64 + 2*16] + vmovdqu xmm3, [p - 64 + 3*16] + vmovdqa [lane_data + _extra_block + 0*16], xmm0 + vmovdqa [lane_data + _extra_block + 1*16], xmm1 + vmovdqa [lane_data + _extra_block + 2*16], xmm2 + vmovdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + vmovdqu xmm0, [tmp] + vmovdqu xmm1, [tmp + 4*4] + vmovd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 + vpextrd [state + _args_digest_sha256 + 4*lane + 
3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 + vmovd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1 + vpextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 + vpextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 + vpextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 + + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens_sha256] + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshuflw xmm1, xmm1, 0 + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens_sha256], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha_256_mult_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_sha256 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + + vmovd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 + vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 + vpinsrd xmm0, xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 + vpshufb xmm0, xmm0, [rel byteswap] + vmovd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] + vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 + vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 +%ifndef SHA224 + vpinsrd xmm1, xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 +%endif + vpshufb xmm1, xmm1, [rel byteswap] + vmovdqa [lane_data + _outer_block], xmm0 + vmovdqa [lane_data + _outer_block + 4*4], xmm1 +%ifdef SHA224 + mov dword [lane_data + _outer_block + 7*4], 0x80 +%endif + + mov tmp, [job + _auth_key_xor_opad] + vmovdqu xmm0, [tmp] + vmovdqu xmm1, [tmp + 4*4] + vmovd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 + vpextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 + vmovd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 + vpextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 + vpextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 + vpextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 + + jmp start_loop + + align 16 +proc_extra_blocks: 
+ mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 + +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + ;; p2 clobbers unused_lanes, undo before exit + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_sha256] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes_sha256] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha256], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 14 bytes for SHA224 and 16 bytes for SHA256 + mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] + + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) + bswap DWORD(tmp4) + + mov [p + 0*4], DWORD(tmp) + mov [p + 1*4], DWORD(tmp2) + mov [p + 2*4], DWORD(tmp3) + +%ifdef SHA224 + mov [p + 3*4], WORD(tmp4) +%else + mov [p + 3*4], DWORD(tmp4) +%endif +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*3] + mov rdi, [rsp + _gpr_save + 8*4] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm new file mode 100644 index 00000000..4fd4019f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_flush_avx.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
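A minimal C sketch of the length arithmetic used by the SHA-256 HMAC submit path above; the variable and helper names are mine, not the library's. The "+ 9" covers the mandatory 0x80 pad byte plus the 8-byte bit count, and the bit count itself also includes the already-compressed 64-byte ipad block.

#include <stdint.h>
#include <stdio.h>

static void sha256_submit_lengths(uint64_t len)
{
    uint64_t full_blocks  = len >> 6;                    /* whole 64-byte blocks hashed first        */
    uint64_t last_len     = len & 63;                    /* tail bytes in a partial block            */
    uint64_t extra_blocks = (last_len + 9 + 63) >> 6;    /* lea [last_len + 9 + 63] ; shr 6          */
    uint64_t size_offset  = extra_blocks * 64            /* shl 6 ; sub last_len ; add 64-8          */
                            - last_len + (64 - 8);       /* -> where the bit count lands             */
    uint64_t start_offset = 64 - last_len;               /* tail data is staged to end at offset 64  */

    /* bit count covers the ipad block plus the message, stored big-endian
     * (lea tmp, [8*64 + 8*len] ; bswap tmp); bswap64 is a GCC/Clang builtin */
    uint64_t bit_len_be   = __builtin_bswap64(8 * 64 + 8 * len);

    printf("blocks=%llu extra=%llu size@%llu start@%llu len=%016llx\n",
           (unsigned long long)full_blocks, (unsigned long long)extra_blocks,
           (unsigned long long)size_offset, (unsigned long long)start_offset,
           (unsigned long long)bit_len_be);
}

int main(void) { sha256_submit_lengths(130); return 0; }

For example, a 130-byte message gives 2 full blocks and 1 extra block, with the 2-byte tail staged at offset 62 of the lane's extra-block buffer and the bit count at offset 118.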
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define FUNC flush_job_hmac_sha_384_avx +%define SHA_X_DIGEST_SIZE 384 + +%include "mb_mgr_hmac_sha_512_flush_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm new file mode 100644 index 00000000..2c1b45d8 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_384_submit_avx.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define FUNC submit_job_hmac_sha_384_avx +%define SHA_X_DIGEST_SIZE 384 + +%include "mb_mgr_hmac_sha_512_submit_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm new file mode 100644 index 00000000..ac9ae824 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_flush_avx.asm @@ -0,0 +1,252 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
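The two SHA-384 wrappers above consist only of %define FUNC / %define SHA_X_DIGEST_SIZE 384 followed by an %include of the SHA-512 body, which then gates the fourth tag word on that macro. A hedged C analogue of the same compile-time specialization (the helper name and prototype are mine):

#include <stdint.h>
#include <string.h>

#ifndef SHA_X_DIGEST_SIZE
#define SHA_X_DIGEST_SIZE 512        /* the SHA-384 wrapper would define 384 instead */
#endif

/* Copy the truncated tag the way the shared body does: three or four
 * big-endian 64-bit digest words, with the fourth word compiled out for
 * SHA-384 just like the "%if (SHA_X_DIGEST_SIZE != 384)" guards. */
static void copy_tag(uint8_t *dst, const uint64_t digest[4])
{
    uint64_t w;
    w = __builtin_bswap64(digest[0]); memcpy(dst + 0 * 8, &w, 8);
    w = __builtin_bswap64(digest[1]); memcpy(dst + 1 * 8, &w, 8);
    w = __builtin_bswap64(digest[2]); memcpy(dst + 2 * 8, &w, 8);
#if (SHA_X_DIGEST_SIZE != 384)
    w = __builtin_bswap64(digest[3]); memcpy(dst + 3 * 8, &w, 8);
#endif
}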
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +extern sha512_x2_avx + +section .data +default rel +align 16 +byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 +one: dq 1 + +section .text + +%ifndef FUNC +%define FUNC flush_job_hmac_sha_512_avx +%define SHA_X_DIGEST_SIZE 512 +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%define tmp5 r9 + +%define tmp6 r10 + +%endif + +; This routine clobbers rbx, rbp +struc STACK +_gpr_save: resq 2 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha512] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 + cmovne idx, [rel one] +copy_lane_data: + ; copy good lane (idx) to empty lanes + vmovdqa xmm0, [state + _lens_sha512] + mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] + +%assign I 0 +%rep 2 + cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 + jne APPEND(skip_,I) + mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp + vpor xmm0, xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + vmovdqa [state + _lens_sha512], xmm0 + + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshuflw xmm1, xmm1, 0xA0 + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens_sha512], xmm0 + + ; "state" and "args" are the 
same address, arg1 + ; len is arg2 + call sha512_x2_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done_sha512], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done_sha512], 1 + mov DWORD(size_offset), [lane_data + _size_offset_sha512] + mov qword [lane_data + _extra_block_sha512 + size_offset], 0 + mov word [state + _lens_sha512 + 2*idx], 1 + lea tmp, [lane_data + _outer_block_sha512] + mov job, [lane_data + _job_in_lane_sha512] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + + ; move digest into data location + %assign I 0 + %rep (SHA_X_DIGEST_SIZE / (8*16)) + vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE] + vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 + vpshufb xmm0, [rel byteswap] + vmovdqa [lane_data + _outer_block_sha512 + I * 16], xmm0 + %assign I (I+1) + %endrep + + ; move the opad key into digest + mov tmp, [job + _auth_key_xor_opad] + + %assign I 0 + %rep 4 + vmovdqu xmm0, [tmp + I * 16] + vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0 + vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 + %assign I (I+1) + %endrep + + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset_sha512] + mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks_sha512], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane_sha512] + mov qword [lane_data + _job_in_lane_sha512], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_sha512] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha512], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; below is the code for both SHA512 & SHA384. 
copy SHA512=32 bytes and SHA384=24 bytes + mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] +%if (SHA_X_DIGEST_SIZE != 384) + mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] +%endif + bswap QWORD(tmp2) + bswap QWORD(tmp4) + bswap QWORD(tmp6) +%if (SHA_X_DIGEST_SIZE != 384) + bswap QWORD(tmp5) +%endif + mov [p + 0*8], QWORD(tmp2) + mov [p + 1*8], QWORD(tmp4) + mov [p + 2*8], QWORD(tmp6) +%if (SHA_X_DIGEST_SIZE != 384) + mov [p + 3*8], QWORD(tmp5) +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm new file mode 100644 index 00000000..57ba40b6 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_sha_512_submit_avx.asm @@ -0,0 +1,327 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
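In the SHA-512 flush path above, idle lanes are given the good lane's data pointer and a length of 0xFFFF (via len_masks), so the minimum search can never pick them; the minimum is then subtracted from every lane before the two-lane kernel runs. A scalar sketch of that scheduling, with a hypothetical helper name and lengths in block units:

#include <stdint.h>

#define SHA512_LANES 2

/* Scalar sketch of the flush-time min-length scheduling above
 * (the asm does this with vphminposuw + vpsubw on _lens_sha512). */
static int pick_min_lane(uint16_t lens[SHA512_LANES], uint16_t *min_blocks)
{
    int idx = 0;
    for (int i = 1; i < SHA512_LANES; i++)      /* idle lanes were forced to 0xFFFF */
        if (lens[i] < lens[idx])                /* so they can never be selected    */
            idx = i;

    *min_blocks = lens[idx];
    for (int i = 0; i < SHA512_LANES; i++)      /* credit the blocks the multi-lane */
        lens[i] -= *min_blocks;                 /* kernel is about to consume       */
    return idx;
}

The caller then runs the multi-lane kernel for *min_blocks blocks and post-processes lane idx, whose remaining length is now zero.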
+;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +%include "memcpy.asm" + +extern sha512_x2_avx + +section .data +default rel +align 16 +byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +section .text + +%ifndef FUNC +%define FUNC submit_job_hmac_sha_512_avx +%define SHA_X_DIGEST_SIZE 512 +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine clobbers rbx, rbp, rsi, rdi +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha512] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + mov [state + _unused_lanes_sha512], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 7 ; divide by 128, len in terms of blocks + + mov [lane_data + _job_in_lane_sha512], job + mov dword [lane_data + _outer_done_sha512], 0 + mov [state + _lens_sha512 + 2*lane], WORD(tmp) ; 2 is word size in bytes + + mov last_len, len + and last_len, 127 + lea extra_blocks, [last_len + 17 + 127] + shr extra_blocks, 7 + mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p + + cmp len, 128 + jb copy_lt128 + +fast_copy: + add p, len +%assign I 0 +%rep 2 + vmovdqu xmm0, [p - 128 + I*4*16 + 0*16] + vmovdqu xmm1, [p - 128 + I*4*16 + 1*16] + vmovdqu xmm2, [p - 128 + I*4*16 + 2*16] + vmovdqu xmm3, [p - 128 + I*4*16 + 3*16] + vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 0*16], xmm0 + vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 1*16], xmm1 + vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 2*16], xmm2 + vmovdqa [lane_data + _extra_block_sha512 + I*4*16 + 3*16], xmm3 +%assign I (I+1) +%endrep + +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 7 + sub size_offset, last_len + add size_offset, 128-8 + mov [lane_data + _size_offset_sha512], DWORD(size_offset) + mov start_offset, 128 + sub start_offset, last_len + mov [lane_data + _start_offset_sha512], DWORD(start_offset) + + lea tmp, [8*128 + 8*len] + bswap tmp + mov [lane_data + _extra_block_sha512 + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + +%assign I 0 +%rep 4 + vmovdqu xmm0, [tmp + I * 2 * 8] + vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I)*SHA512_DIGEST_ROW_SIZE], xmm0 + 
vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 +%assign I (I+1) +%endrep + + test len, ~127 + jnz ge128_bytes + +lt128_bytes: + mov [state + _lens_sha512 + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8 + mov dword [lane_data + _extra_blocks_sha512], 0 + +ge128_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens_sha512] + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...1) + cmp len2, 0 + je len_is_0 + + vpshuflw xmm1, xmm1, 0xA0 + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens_sha512], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_x2_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done_sha512], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done_sha512], 1 + mov DWORD(size_offset), [lane_data + _size_offset_sha512] + mov qword [lane_data + _extra_block_sha512 + size_offset], 0 + mov word [state + _lens_sha512 + 2*idx], 1 + lea tmp, [lane_data + _outer_block_sha512] + mov job, [lane_data + _job_in_lane_sha512] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + +%assign I 0 +%rep (SHA_X_DIGEST_SIZE / (8 * 16)) + vmovq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE] + vpinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 + vpshufb xmm0, [rel byteswap] + vmovdqa [lane_data + _outer_block_sha512 + I * 16], xmm0 +%assign I (I+1) +%endrep + + mov tmp, [job + _auth_key_xor_opad] +%assign I 0 +%rep 4 + vmovdqu xmm0, [tmp + I * 16] + vmovq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0 + vpextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 +%assign I (I+1) +%endrep + + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset_sha512] + mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message + mov dword [lane_data + _extra_blocks_sha512], 0 + jmp start_loop + + align 16 +copy_lt128: + ;; less than one message block of data + ;; destination extra block but backwards by len from where 0x80 pre-populated + lea p2, [lane_data + _extra_block + 128] + sub p2, len + memcpy_avx_128_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_sha512] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane_sha512] + mov unused_lanes, [state + _unused_lanes_sha512] + mov qword [lane_data + _job_in_lane_sha512], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha512], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; below is the code for both 
SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes + mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] +%if (SHA_X_DIGEST_SIZE != 384) + mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] +%endif + bswap QWORD(tmp) + bswap QWORD(tmp2) + bswap QWORD(tmp3) +%if (SHA_X_DIGEST_SIZE != 384) + bswap QWORD(tmp4) +%endif + mov [p + 0*8], QWORD(tmp) + mov [p + 1*8], QWORD(tmp2) + mov [p + 2*8], QWORD(tmp3) +%if (SHA_X_DIGEST_SIZE != 384) + mov [p + 3*8], QWORD(tmp4) +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm new file mode 100644 index 00000000..333c568a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/mb_mgr_hmac_submit_avx.asm @@ -0,0 +1,307 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +%include "memcpy.asm" + +extern sha1_mult_avx + +section .data +default rel +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine clobbers rdi, rsi, rbx, rbp +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +; JOB* submit_job_hmac_avx(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(submit_job_hmac_avx,function,internal) +submit_job_hmac_avx: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + mov [state + _lens + 2*lane], WORD(tmp) + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr + PTR_SZ*lane], p + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + vmovdqu xmm0, [p - 64 + 0*16] + vmovdqu xmm1, [p - 64 + 1*16] + vmovdqu xmm2, [p - 64 + 2*16] + vmovdqu xmm3, [p - 64 + 3*16] + vmovdqa [lane_data + _extra_block + 0*16], xmm0 + vmovdqa [lane_data + _extra_block + 1*16], xmm1 + vmovdqa [lane_data + _extra_block + 2*16], xmm2 + vmovdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + vmovdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 
4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + mov [state + _lens + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens] + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshuflw xmm1, xmm1, 0 + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mult_avx + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + + vmovd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 + vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 + vpinsrd xmm0, xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 + vpshufb xmm0, xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + vmovdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + + mov tmp, [job + _auth_key_xor_opad] + vmovdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + vmovd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + vpextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_avx_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], 
STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) + mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) + mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm b/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm new file mode 100644 index 00000000..e55b8b44 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/md5_x4x2_avx.asm @@ -0,0 +1,706 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
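The SHA-1 submit path above seeds each lane's digest from the job's auth_key_xor_ipad field (a precomputed SHA-1 state for the key-XOR-ipad block, which is why the bit count adds 8*64), stages the byte-swapped inner digest in _outer_block, reseeds from auth_key_xor_opad for one final block, and copies a 12-byte truncated tag. A sketch of that two-pass structure with hypothetical single-buffer helpers (sha1_update/sha1_final_raw are assumptions, not library APIs; the real code drives the multi-lane sha1_mult_avx kernel instead):

#include <stdint.h>
#include <string.h>

typedef struct { uint32_t h[5]; uint64_t bytes_done; } sha1_state_t;

/* Hypothetical single-buffer helpers, assumed only for this sketch. */
void sha1_update(sha1_state_t *st, const uint8_t *data, uint64_t len);
void sha1_final_raw(sha1_state_t *st, uint8_t out[20]);

void hmac_sha1_tag(const sha1_state_t *key_xor_ipad,   /* state after the (K ^ ipad) block */
                   const sha1_state_t *key_xor_opad,   /* state after the (K ^ opad) block */
                   const uint8_t *msg, uint64_t len,
                   uint8_t tag[12])
{
    uint8_t inner[20], outer[20];

    sha1_state_t st = *key_xor_ipad;     /* inner hash resumes past the ipad block */
    sha1_update(&st, msg, len);
    sha1_final_raw(&st, inner);          /* the asm stages this in _outer_block    */

    st = *key_xor_opad;                  /* outer hash: one more padded block      */
    sha1_update(&st, inner, 20);
    sha1_final_raw(&st, outer);

    memcpy(tag, outer, 12);              /* "copy 12 bytes" to _auth_tag_output    */
}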
+;; + +;; code to compute octal MD5 using AVX + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp +;; +;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp +;; +;; clobbers xmm0-15 + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +extern MD5_TABLE + +section .data +default rel +align 64 +ONES: + dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff + +section .text + +%ifdef LINUX +;; Linux Registers +%define arg1 rdi +%define arg2 rsi +%define mem1 rcx +%define mem2 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define mem1 rdi +%define mem2 rsi +%endif + +;; rbp is not clobbered + +%define state arg1 +%define num_blks arg2 + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 +%define inp4 r12 +%define inp5 r13 +%define inp6 r14 +%define inp7 r15 + +%define TBL rax +%define IDX rbx + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 ; tmp +%define F xmm5 ; tmp + +%define A2 xmm6 +%define B2 xmm7 +%define C2 xmm8 +%define D2 xmm9 + + +%define FUN E +%define TMP F +%define FUN2 xmm10 +%define TMP2 xmm11 + +%define T0 xmm10 +%define T1 xmm11 +%define T2 xmm12 +%define T3 xmm13 +%define T4 xmm14 +%define T5 xmm15 + +; Stack Layout +; +; 470 DD2 +; 460 CC2 +; 450 BB2 +; 440 AA2 +; 430 DD +; 420 CC +; 410 BB +; 400 AA +; +; 3F0 data2[15] for lanes 7...4 \ +; ... \ +; 300 data2[0] for lanes 7...4 \ +; 2F0 data2[15] for lanes 3...0 > mem block 2 +; ... / +; 210 data2[1] for lanes 3...0 / +; 200 data2[0] for lanes 3...0 / +; +; 1F0 data1[15] for lanes 7...4 \ +; ... \ +; 100 data1[0] for lanes 7...4 \ +; F0 data1[15] for lanes 3...0 > mem block 1 +; ... / +; 10 data1[1] for lanes 3...0 / +; 0 data1[0] for lanes 3...0 / + +; stack size must be an odd multiple of 8 bytes in size +struc STACK +_DATA: reso 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs +_DIGEST: reso 8 ; stores AA-DD, AA2-DD2 + resb 8 ; for alignment +endstruc +%define STACK_SIZE STACK_size + +%define AA rsp + _DIGEST + 16*0 +%define BB rsp + _DIGEST + 16*1 +%define CC rsp + _DIGEST + 16*2 +%define DD rsp + _DIGEST + 16*3 +%define AA2 rsp + _DIGEST + 16*4 +%define BB2 rsp + _DIGEST + 16*5 +%define CC2 rsp + _DIGEST + 16*6 +%define DD2 rsp + _DIGEST + 16*7 + +;; +;; MD5 left rotations (number of bits) +;; +rot11 equ 7 +rot12 equ 12 +rot13 equ 17 +rot14 equ 22 +rot21 equ 5 +rot22 equ 9 +rot23 equ 14 +rot24 equ 20 +rot31 equ 4 +rot32 equ 11 +rot33 equ 16 +rot34 equ 23 +rot41 equ 6 +rot42 equ 10 +rot43 equ 15 +rot44 equ 21 + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + +;; +;; Magic 
functions defined in RFC 1321 +;; +; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) +%macro MAGIC_F 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + vpxor %%F,%%Z, %%Y + vpand %%F,%%F,%%X + vpxor %%F,%%F,%%Z +%endmacro + +; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y)) +%macro MAGIC_G 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + MAGIC_F %%F,%%Z,%%X,%%Y +%endmacro + +; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) +%macro MAGIC_H 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + vpxor %%F,%%Z, %%Y + vpxor %%F,%%F, %%X +%endmacro + +; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) +%macro MAGIC_I 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + vpxor %%F,%%Z,[rel ONES] ; pnot %%F + vpor %%F,%%F,%%X + vpxor %%F,%%F,%%Y +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-%%imm) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot +%macro MD5_STEP1 14 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%data %12 +%define %%MD5const %13 +%define %%nrot %14 + + vpaddd %%A, %%A, %%MD5const + vpaddd %%A2, %%A2, %%MD5const + vpaddd %%A, %%A, [%%data] + vpaddd %%A2, %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + vpaddd %%A, %%A, %%FUN + %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2 + vpaddd %%A2, %%A2, %%FUN + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP + vpaddd %%A, %%A, %%B + vpaddd %%A2, %%A2, %%B2 +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data, +; MD5const, nrot +%macro MD5_STEP 16 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%FUN2 %12 +%define %%TMP2 %13 +%define %%data %14 +%define %%MD5const %15 +%define %%nrot %16 + + vmovdqa %%TMP,[%%data] + vmovdqa %%TMP2,[%%data + 16*16] + vpaddd %%A, %%A, %%MD5const + vpaddd %%A2, %%A2, %%MD5const + vpaddd %%A, %%A, %%TMP + vpaddd %%A2, %%A2, %%TMP2 + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2 + vpaddd %%A, %%A, %%FUN + vpaddd %%A2, %%A2, %%FUN2 + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP2 + vpaddd %%A, %%A, %%B + vpaddd %%A2, %%A2, %%B2 +%endmacro + +; void md5_x4x2_avx(MD5_ARGS *args, UINT64 num_blks) +; arg 1 : pointer to MD5_ARGS structure +; arg 2 : number of blocks (>=1) +; +align 32 +MKGLOBAL(md5_x4x2_avx,function,internal) +md5_x4x2_avx: + + sub rsp, STACK_SIZE + + ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2 + ;; Initialize digests + vmovdqa A,[state + 0*16 + 0*MD5_DIGEST_ROW_SIZE] + vmovdqa B,[state + 0*16 + 1*MD5_DIGEST_ROW_SIZE] + vmovdqa C,[state + 0*16 + 2*MD5_DIGEST_ROW_SIZE] + vmovdqa D,[state + 0*16 + 3*MD5_DIGEST_ROW_SIZE] + + vmovdqa A2,[state + 1*16 + 0*MD5_DIGEST_ROW_SIZE] + vmovdqa B2,[state + 1*16 + 1*MD5_DIGEST_ROW_SIZE] + vmovdqa C2,[state + 1*16 + 2*MD5_DIGEST_ROW_SIZE] + vmovdqa D2,[state + 1*16 + 3*MD5_DIGEST_ROW_SIZE] + + lea TBL, [rel 
MD5_TABLE] + + ;; load input pointers + mov inp0,[state+_data_ptr_md5 +0*PTR_SZ] + mov inp1,[state+_data_ptr_md5 +1*PTR_SZ] + mov inp2,[state+_data_ptr_md5 +2*PTR_SZ] + mov inp3,[state+_data_ptr_md5 +3*PTR_SZ] + mov inp4,[state+_data_ptr_md5 +4*PTR_SZ] + mov inp5,[state+_data_ptr_md5 +5*PTR_SZ] + mov inp6,[state+_data_ptr_md5 +6*PTR_SZ] + mov inp7,[state+_data_ptr_md5 +7*PTR_SZ] + xor IDX, IDX + + ; Make ping-pong pointers to the two memory blocks + mov mem1, rsp + lea mem2, [rsp + 16*16*2] + +;; Load first block of data and save back to stack +%assign I 0 +%rep 4 + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem1+(I*4+0)*16],T0 + vmovdqa [mem1+(I*4+1)*16],T1 + vmovdqa [mem1+(I*4+2)*16],T2 + vmovdqa [mem1+(I*4+3)*16],T3 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem1+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem1+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem1+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem1+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) +%endrep + +lloop: + ; save old digests + vmovdqa [AA], A + vmovdqa [BB], B + vmovdqa [CC], C + vmovdqa [DD], D + ; save old digests + vmovdqa [AA2], A2 + vmovdqa [BB2], B2 + vmovdqa [CC2], C2 + vmovdqa [DD2], D2 + + add IDX, 4*16 + sub num_blks, 1 + je lastblock + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14 + +%assign I 0 + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14 + + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, 
[TBL+17*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24 + + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34 + + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE 
T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44 + + vmovdqu T2,[inp0+IDX+I*16] + vmovdqu T1,[inp1+IDX+I*16] + vmovdqu T4,[inp2+IDX+I*16] + vmovdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16],T0 + vmovdqa [mem2+(I*4+1)*16],T1 + vmovdqa [mem2+(I*4+2)*16],T2 + vmovdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44 + + vmovdqu T2,[inp4+IDX+I*16] + vmovdqu T1,[inp5+IDX+I*16] + vmovdqu T4,[inp6+IDX+I*16] + vmovdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vmovdqa [mem2+(I*4+0)*16 + 16*16],T0 + vmovdqa [mem2+(I*4+1)*16 + 16*16],T1 + vmovdqa [mem2+(I*4+2)*16 + 16*16],T2 + vmovdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + + vpaddd A,A,[AA] + vpaddd B,B,[BB] + vpaddd C,C,[CC] + vpaddd D,D,[DD] + + vpaddd A2,A2,[AA2] + vpaddd B2,B2,[BB2] + vpaddd C2,C2,[CC2] + vpaddd D2,D2,[DD2] + + ; swap mem1 and mem2 + xchg mem1, mem2 + + jmp lloop + +lastblock: + + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, 
FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14 + + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24 + + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34 + + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP 
MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44 + + vpaddd A,A,[AA] + vpaddd B,B,[BB] + vpaddd C,C,[CC] + vpaddd D,D,[DD] + + vpaddd A2,A2,[AA2] + vpaddd B2,B2,[BB2] + vpaddd C2,C2,[CC2] + vpaddd D2,D2,[DD2] + + ; write out digests + vmovdqu [state + 0*16 + 0*MD5_DIGEST_ROW_SIZE ], A + vmovdqu [state + 0*16 + 1*MD5_DIGEST_ROW_SIZE ], B + vmovdqu [state + 0*16 + 2*MD5_DIGEST_ROW_SIZE ], C + vmovdqu [state + 0*16 + 3*MD5_DIGEST_ROW_SIZE ], D + vmovdqu [state + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2 + vmovdqu [state + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2 + vmovdqu [state + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2 + vmovdqu [state + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2 + + ;; update input pointers + add inp0, IDX + add inp1, IDX + add inp2, IDX + add inp3, IDX + add inp4, IDX + add inp5, IDX + add inp6, IDX + add inp7, IDX + mov [state +_data_ptr_md5 + 0*PTR_SZ], inp0 + mov [state +_data_ptr_md5 + 1*PTR_SZ], inp1 + mov [state +_data_ptr_md5 + 2*PTR_SZ], inp2 + mov [state +_data_ptr_md5 + 3*PTR_SZ], inp3 + mov [state +_data_ptr_md5 + 4*PTR_SZ], inp4 + mov [state +_data_ptr_md5 + 5*PTR_SZ], inp5 + mov [state +_data_ptr_md5 + 6*PTR_SZ], inp6 + mov [state +_data_ptr_md5 + 7*PTR_SZ], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + add rsp, STACK_SIZE + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm new file mode 100644 index 00000000..32f399f7 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha1_mult_avx.asm @@ -0,0 +1,424 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +section .data +default rel + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +section .text + +;; code to compute quad SHA1 using AVX +;; derived from ...\sha1_multiple\sha1_quad4.asm +;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact +;; rbx, rsi, rdi, rbp, r12-r15 left intact +;; This version is not safe to call from C/C++ + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro +;; +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor %%regF, %%regC,%%regD + vpand %%regF, %%regF,%%regB + vpxor %%regF, %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpxor 
%%regF,%%regD,%%regC + vpxor %%regF,%%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + vpor %%regF,%%regB,%%regC + vpand %%regT,%%regB,%%regC + vpand %%regF,%%regF,%%regD + vpor %%regF,%%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsrld %%tmp, %%reg, (32-(%%imm)) + vpslld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PROLD_nd reg, imm, tmp, src +%macro PROLD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsrld %%tmp, %%src, (32-(%%imm)) + vpslld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + vpaddd %%regE, %%regE,[rsp + (%%memW * 16)] + PROLD_nd %%regT,5, %%regF,%%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE, %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + vpaddd %%regE, %%regE,%%immCNT + + vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16] + vpxor W16, W16, W14 + vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16] + vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16] + + vpsrld %%regF, W16, (32-1) + vpslld W16, W16, 1 + vpor %%regF, %%regF, W16 + ROTATE_W + + vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF + vpaddd %%regE, %%regE,%%regF + + PROLD_nd %%regT,5, %%regF, %%regA + vpaddd %%regE, %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + vpaddd %%regE,%%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; FRAMESZ must be an odd multiple of 8 +%define FRAMESZ 16*16 + 8 + +%define VMOVPS vmovdqu + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define IDX rax + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + +align 32 + +; XMM registers are 
clobbered. Saving/restoring must be done at a higher level + +; void sha1_mult_avx(SHA1_ARGS *args, UINT32 size_in_blocks); +; arg 1 : rcx : pointer to args +; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1 +MKGLOBAL(sha1_mult_avx,function,internal) +sha1_mult_avx: + + sub rsp, FRAMESZ + + ;; Initialize digests + vmovdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE] + vmovdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE] + vmovdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE] + vmovdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE] + vmovdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE] + + ;; transpose input onto stack + mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ] + mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ] + mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ] + mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ] + + xor IDX, IDX +lloop: + vmovdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK] +%assign I 0 +%rep 4 + VMOVPS T2,[inp0+IDX] + VMOVPS T1,[inp1+IDX] + VMOVPS T4,[inp2+IDX] + VMOVPS T3,[inp3+IDX] + TRANSPOSE T2, T1, T4, T3, T0, T5 + vpshufb T0, T0, F + vmovdqa [rsp+(I*4+0)*16],T0 + vpshufb T1, T1, F + vmovdqa [rsp+(I*4+1)*16],T1 + vpshufb T2, T2, F + vmovdqa [rsp+(I*4+2)*16],T2 + vpshufb T3, T3, F + vmovdqa [rsp+(I*4+3)*16],T3 + add IDX, 4*4 +%assign I (I+1) +%endrep + + ; save old digests + vmovdqa AA, A + vmovdqa BB, B + vmovdqa CC, C + vmovdqa DD, D + vmovdqa EE, E + +;; +;; perform 0-79 steps +;; + vmovdqa K, [rel K00_19] +;; do rounds 0...15 +%assign I 0 +%rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + vmovdqa W16, [rsp + ((16 - 16) & 15) * 16] + vmovdqa W15, [rsp + ((16 - 15) & 15) * 16] +%rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 20...39 + vmovdqa K, [rel K20_39] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 40...59 + vmovdqa K, [rel K40_59] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 60...79 + vmovdqa K, [rel K60_79] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS +%assign I (I+1) +%endrep + + vpaddd A,A,AA + vpaddd B,B,BB + vpaddd C,C,CC + vpaddd D,D,DD + vpaddd E,E,EE + + sub arg2, 1 + jne lloop + + ; write out digests + vmovdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A + vmovdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B + vmovdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C + vmovdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D + vmovdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E + + ; update input pointers + add inp0, IDX + mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1 + add inp2, IDX + mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2 + add inp3, IDX + mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, FRAMESZ + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm new file mode 100644 index 00000000..e1c21e20 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha1_one_block_avx.asm @@ -0,0 +1,496 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the 
following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; SHA1 code, hybrid, rolled, interleaved +; Uses AVX instructions +%include "os.asm" + +section .data +default rel +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +section .text + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +%ifdef LINUX +%define INP rdi ; 1st arg +%define CTX rsi ; 2nd arg +%define REG3 ecx +%define REG4 edx +%else +%define INP rcx ; 1st arg +%define CTX rdx ; 2nd arg +%define REG3 edi +%define REG4 esi +%endif + +%define FRAMESZ 3*16 + 1*8 +%define _RSP FRAMESZ-1*8 + rsp + +%define a eax +%define b ebx +%define c REG3 +%define d REG4 +%define e r8d +%define T1 r9d +%define f r10d +%define RND r11d +%define g r12d +%define h r13d + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XK xmm2 + +%xdefine X0 xmm3 +%xdefine X1 xmm4 +%xdefine X2 xmm5 +%xdefine X3 xmm6 +%xdefine X4 xmm7 + +%define XFER xmm8 + +%define SZ 4 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X4 +%xdefine X4 X_ +%endmacro + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + + +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regC + xor %%regF,%%regD + and %%regF,%%regB + xor %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regD + xor %%regF,%%regC + xor %%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB 
%2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regB + mov %%regT,%%regB + or %%regF,%%regC + and %%regT,%%regC + and %%regF,%%regD + or %%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +;; input is T1 +%macro ROUND 1 +%define %%MAGIC %1 + add e,T1 + mov T1,a + rol T1,5 + add e,T1 + %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + rol b,30 + add h,e +ROTATE_ARGS +%endmacro + +%macro do_4i 1 + vpaddd XFER, XK, X0 + vpextrd T1, XFER, 0 + ;ROUND %1 + add e,T1 + ;SCHEDULE_4 + vpalignr XTMP0, X1, X0, 8 ; XTMP0 = W[-14] + mov T1,a + rol T1,5 + vpxor XTMP1, X2, X0 ; XTMP1 = W[-8] ^ W[-16] + add e,T1 + vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16] + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + + ;; Finish low half + rol b,30 + vpsrldq X4, X3, 4 ; X4 = W[-3] {xxBA} + add h,e +ROTATE_ARGS + vpextrd T1, XFER, 1 + ;ROUND %1 + add e,T1 + vpxor X4, X4, XTMP0 + mov T1,a + rol T1,5 + ;; rotate X4 left 1 + vpsrld XTMP1, X4, (32-1) + add e,T1 + vpslld X4, X4, 1 + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + vpxor X4, X4, XTMP1 ; X4 = W[0] {xxBA} + rol b,30 + add h,e +ROTATE_ARGS + vpextrd T1, XFER, 2 + ;ROUND %1 + add e,T1 + mov T1,a + + ;; Finish high half + vpalignr XTMP1, X4, X3, 4 ; XTMP1 = w[-3] {DCxx} + rol T1,5 + add e,T1 + vpxor XTMP0, XTMP0, XTMP1 + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + ;; rotate XTMP0 left 1 + vpsrld XTMP1, XTMP0, (32-1) + rol b,30 + add h,e +ROTATE_ARGS + vpextrd T1, XFER, 3 + ;ROUND %1 + add e,T1 + mov T1,a + vpslld XTMP0, XTMP0, 1 + rol T1,5 + add e,T1 + vpxor XTMP0, XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx} + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + ;; COMBINE HALVES + vshufps X4, X4, XTMP0, 11100100b ; X4 = X[0] {DCBA} + rol b,30 + add h,e + + rotate_Xs +ROTATE_ARGS +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha1_one_block_avx(void *input_data, UINT32 digest[8] +;; arg 1 : rcx : pointer to input data +;; arg 2 : rdx : pointer to digest +MKGLOBAL(sha1_one_block_avx,function,) +align 32 +sha1_one_block_avx: + push rbx + push rsi + push rdi + push r12 + push r13 + + ;; byte swap first 16 dwords + vmovdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK] + mov rax,rsp ; copy rsp + VMOVDQ X0, [INP + 0*16] + sub rsp,FRAMESZ + VMOVDQ X1, [INP + 1*16] + and rsp,-64 ; align stack frame + + vmovdqa [rsp + 0 * 16], xmm6 + vmovdqa [rsp + 1 * 16], xmm7 + vmovdqa [rsp + 2 * 16], xmm8 + + VMOVDQ X2, [INP + 2*16] + mov [_RSP],rax ; save copy of rsp + VMOVDQ X3, [INP + 3*16] + ;; load initial digest + mov a,0x67452301 + vpshufb X0, XTMP0 + mov b,0xefcdab89 + vpshufb X1, XTMP0 + mov c,0x98badcfe + vpshufb X2, XTMP0 + mov d,0x10325476 + vpshufb X3, XTMP0 + mov e,0xc3d2e1f0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 00-19 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqa XK, [rel K00_19] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop1_5 +align 16 +loop1: + + do_4i MAGIC_F0 + +loop1_5: + do_4i MAGIC_F0 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + vmovdqa X0, X2 + vmovdqa X2, X4 + vmovdqa X4, X1 + vmovdqa X1, X3 + + sub RND, 1 + jne loop1 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 00-19 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 20-39 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqa XK, [rel K20_39] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop2_5 +align 16 +loop2: + + do_4i MAGIC_F1 + +loop2_5: + do_4i MAGIC_F1 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + vmovdqa X0, X2 + vmovdqa X2, X4 + vmovdqa X4, X1 + vmovdqa X1, X3 + + sub RND, 1 + jne loop2 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 20-39 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 40-59 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqa XK, [rel K40_59] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop3_5 +align 16 +loop3: + + do_4i MAGIC_F2 + +loop3_5: + do_4i MAGIC_F2 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + vmovdqa X0, X2 + vmovdqa X2, X4 + vmovdqa X4, X1 + vmovdqa X1, X3 + + sub RND, 1 + jne loop3 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 40-59 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + + + ;; do rounds 60-79 + vmovdqa XK, [rel K60_79] + + do_4i MAGIC_F3 + + vpaddd XFER, XK, X0 + vpextrd T1, XFER, 0 + ROUND MAGIC_F3 + vpextrd T1, XFER, 1 + ROUND MAGIC_F3 + vpextrd T1, XFER, 2 + ROUND MAGIC_F3 + vpextrd T1, XFER, 3 + ROUND MAGIC_F3 + + vpaddd XFER, XK, X1 + vpextrd T1, XFER, 0 + ROUND MAGIC_F3 + vpextrd T1, XFER, 1 + ROUND MAGIC_F3 + vpextrd T1, XFER, 2 + ROUND MAGIC_F3 + vpextrd T1, XFER, 3 + ROUND MAGIC_F3 + + vpaddd XFER, XK, X2 + vpextrd T1, XFER, 0 + ROUND MAGIC_F3 + vpextrd T1, XFER, 1 + ROUND MAGIC_F3 + vpextrd T1, XFER, 2 + ROUND MAGIC_F3 + vpextrd T1, XFER, 3 + ROUND MAGIC_F3 + + vpaddd XFER, XK, X3 + vpextrd T1, XFER, 0 + ROUND MAGIC_F3 + vpextrd T1, XFER, 1 + ROUND MAGIC_F3 + vpextrd T1, XFER, 2 + ROUND MAGIC_F3 + vpextrd T1, XFER, 3 + ROUND MAGIC_F3 + + add a,0x67452301 + mov [SZ*0 + CTX], a + add b,0xefcdab89 + mov [SZ*1 + CTX], b + add c,0x98badcfe + mov [SZ*2 + CTX], c + add d,0x10325476 + mov [SZ*3 + CTX], d + add e,0xc3d2e1f0 + mov [SZ*4 + CTX], e + + vmovdqa xmm8, [rsp + 2 * 16] + vmovdqa xmm7, [rsp + 1 * 16] + vmovdqa xmm6, [rsp + 0 * 16] + + mov rsp,[_RSP] + pop r13 + pop r12 + pop rdi + pop rsi + pop rbx + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm new file mode 100644 index 00000000..ace417a4 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha224_one_block_avx.asm @@ -0,0 +1,41 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define H0 0xc1059ed8 +%define H1 0x367cd507 +%define H2 0x3070dd17 +%define H3 0xf70e5939 +%define H4 0xffc00b31 +%define H5 0x68581511 +%define H6 0x64f98fa7 +%define H7 0xbefa4fa4 +%define FUNC sha224_one_block_avx + +%include "sha256_one_block_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm new file mode 100644 index 00000000..b92e1f6c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha256_one_block_avx.asm @@ -0,0 +1,557 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%include "os.asm" + +section .data +default rel +align 64 +K256: + dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +; shuffle xBxA -> 00BA +_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 + dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF + +; shuffle xDxC -> DC00 +_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF + dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 + +section .text + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +%macro MY_ROR 2 + shld %1,%1,(32-(%2)) +%endm + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + VMOVDQ %1, %2 + vpshufb %1, %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XTMP4 xmm8 +%define XFER xmm9 +%define XTMP5 xmm11 + +%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA +%define SHUF_DC00 xmm12 ; shuffle xDxC -> DC00 +%define BYTE_FLIP_MASK xmm13 + +%ifdef LINUX +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c ecx +%define d r8d +%define e edx +%else +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c edi +%define d esi +%define e r8d + +%endif +%define TBL rbp +%define a eax +%define b ebx + +%define f r9d +%define g r10d +%define h r11d + +%define y0 r13d +%define y1 r14d +%define y2 r15d + + +struc STACK +%ifndef LINUX +_XMM_SAVE: reso 7 +%endif +_XFER: reso 1 +endstruc + +%ifndef H0 +%define H0 0x6a09e667 +%define H1 0xbb67ae85 +%define H2 0x3c6ef372 +%define H3 0xa54ff53a +%define H4 0x510e527f +%define H5 0x9b05688c +%define H6 0x1f83d9ab +%define H7 0x5be0cd19 +%define FUNC sha256_one_block_avx +%endif + +; rotate_Xs +; Rotate values of symbols X0...X3 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro FOUR_ROUNDS_AND_SCHED 0 + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + ;vmovdqa XTMP0, X3 + mov y0, e ; y0 = e + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + vpalignr XTMP0, X3, X2, 4 ; XTMP0 = W[-7] + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = 
e ^ (e >> (25-11)) + mov y2, f ; y2 = f + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + ;vmovdqa XTMP1, X1 + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + vpaddd XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + ;; compute s0 + vpalignr XTMP1, X1, X0, 4 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH + + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + + vpsrld XTMP2, XTMP1, 7 + + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + + vpslld XTMP3, XTMP1, (32-7) + + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + + vpor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 + + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + + mov y0, e ; y0 = e + mov y1, a ; y1 = a + + + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + + vpsrld XTMP2, XTMP1,18 + + xor y1, a ; y1 = a ^ (a >> (22-13) + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + + vpsrld XTMP4, XTMP1, 3 ; XTMP4 = W[-15] >> 3 + + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + + vpslld XTMP1, XTMP1, (32-18) + + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + + vpxor XTMP3, XTMP3, XTMP1 + + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + + vpxor XTMP3, XTMP3, XTMP2 ; XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 + + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + + vpxor XTMP1, XTMP3, XTMP4 ; XTMP1 = s0 + + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + ;; compute low s1 + vpshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + vpaddd XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} + + mov y0, e ; y0 = e + mov y1, a ; y1 = a + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + + ;vmovdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} + + xor y0, e ; y0 = e ^ (e >> (25-11)) + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + + vpsrld XTMP4, XTMP2, 10 ; XTMP4 = W[-2] >> 10 {BBAA} + + xor y2, g ; y2 = f^g + + vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xBxA} + + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + + vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xBxA} + + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + vpxor XTMP2, XTMP2, XTMP3 + add y2, y0 ; y2 = S1 + CH + 
MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH + vpxor XTMP4, XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + vpshufb XTMP4, XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + vpaddd XTMP0, XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + ;; compute high s1 + vpshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + ;vmovdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} + mov y0, e ; y0 = e + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + ;vmovdqa XTMP5, XTMP2 ; XTMP5 = W[-2] {DDCC} + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + + vpsrld XTMP5, XTMP2, 10 ; XTMP5 = W[-2] >> 10 {DDCC} + + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + + vpsrlq XTMP3, XTMP2, 19 ; XTMP3 = W[-2] MY_ROR 19 {xDxC} + + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + + vpsrlq XTMP2, XTMP2, 17 ; XTMP2 = W[-2] MY_ROR 17 {xDxC} + + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + + vpxor XTMP2, XTMP2, XTMP3 + + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH + vpxor XTMP5, XTMP5, XTMP2 ; XTMP5 = s1 {xDxC} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + vpshufb XTMP5, XTMP5, SHUF_DC00 ; XTMP5 = s1 {DC00} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + vpaddd X0, XTMP5, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 4] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + MY_ROR y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (25-11)) + MY_ROR y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + MY_ROR y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + MY_ROR y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + MY_ROR y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + MY_ROR y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void FUNC(void *input_data, UINT32 digest[8], 
UINT64 num_blks) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +section .text +MKGLOBAL(FUNC,function,) +align 32 +FUNC: + push rbx +%ifndef LINUX + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_size +%ifndef LINUX + vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 + vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 + vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 + vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 + vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 + vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 + vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 + vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 +%endif + + ;; load initial digest + mov a,H0 + mov b,H1 + mov c,H2 + mov d,H3 + mov e,H4 + mov f,H5 + mov g,H6 + mov h,H7 + + vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + vmovdqa SHUF_00BA, [rel _SHUF_00BA] + vmovdqa SHUF_DC00, [rel _SHUF_DC00] + + lea TBL,[rel K256] + + ;; byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + + ;; schedule 48 input dwords, by doing 3 rounds of 16 each + mov SRND, 3 +align 16 +loop1: + vpaddd XFER, X0, [TBL + 0*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 1*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 2*16] + vmovdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + vpaddd XFER, X0, [TBL + 3*16] + vmovdqa [rsp + _XFER], XFER + add TBL, 4*16 + FOUR_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 +loop2: + vpaddd XFER, X0, [TBL + 0*16] + vmovdqa [rsp + _XFER], XFER + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vpaddd XFER, X1, [TBL + 1*16] + vmovdqa [rsp + _XFER], XFER + add TBL, 2*16 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + vmovdqa X0, X2 + vmovdqa X1, X3 + + sub SRND, 1 + jne loop2 + + add a,H0 + add b,H1 + add c,H2 + add d,H3 + add e,H4 + add f,H5 + add g,H6 + mov [4*0 + CTX],a + mov [4*1 + CTX],b + mov [4*2 + CTX],c + mov [4*3 + CTX],d + mov [4*4 + CTX],e + mov [4*5 + CTX],f + mov [4*6 + CTX],g + add h,H7 + mov [4*7 + CTX],h + +done_hash: +%ifndef LINUX + vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] + vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] + vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] + vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] + vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] + vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] + vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] + vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] +%endif + + add rsp, STACK_size + + pop r15 + pop r14 + pop r13 + pop rbp +%ifndef LINUX + pop rdi + pop rsi +%endif + pop rbx + + ret + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm new file mode 100644 index 00000000..cdcea9ff --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha384_one_block_avx.asm @@ -0,0 +1,41 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define H0 0xcbbb9d5dc1059ed8 +%define H1 0x629a292a367cd507 +%define H2 0x9159015a3070dd17 +%define H3 0x152fecd8f70e5939 +%define H4 0x67332667ffc00b31 +%define H5 0x8eb44a8768581511 +%define H6 0xdb0c2e0d64f98fa7 +%define H7 0x47b5481dbefa4fa4 +%define FUNC sha384_one_block_avx + +%include "sha512_one_block_avx.asm" diff --git a/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm new file mode 100644 index 00000000..1f48ff99 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha512_one_block_avx.asm @@ -0,0 +1,489 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%include "os.asm" + +%define VMOVDQ vmovdqu ;; assume buffers not aligned + +%ifndef FUNC +%define FUNC sha512_one_block_avx +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +%macro MY_ROR 2 +shld %1,%1,(64-(%2)) +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + VMOVDQ %1, %2 + vpshufb %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 +%define X4 xmm8 +%define X5 xmm9 +%define X6 xmm10 +%define X7 xmm11 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XFER xmm13 + +%define BYTE_FLIP_MASK xmm12 + +%ifdef LINUX +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c rcx +%define d r8 +%define e rdx +%else +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c rdi +%define d rsi +%define e r8 + +%endif +%define TBL rbp +%define a rax +%define b rbx + +%define f r9 +%define g r10 +%define h r11 + +%define y0 r13 +%define y1 r14 +%define y2 r15 + +%ifndef H0 +%define H0 0x6a09e667f3bcc908 +%define H1 0xbb67ae8584caa73b +%define H2 0x3c6ef372fe94f82b +%define H3 0xa54ff53a5f1d36f1 +%define H4 0x510e527fade682d1 +%define H5 0x9b05688c2b3e6c1f +%define H6 0x1f83d9abfb41bd6b +%define H7 0x5be0cd19137e2179 +%endif + +struc STACK +%ifndef LINUX +_XMM_SAVE: reso 8 +%endif +_XFER: reso 1 +endstruc + + +; rotate_Xs +; Rotate values of symbols X0...X7 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X4 +%xdefine X4 X5 +%xdefine X5 X6 +%xdefine X6 X7 +%xdefine X7 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro TWO_ROUNDS_AND_SCHED 0 + + vpalignr XTMP0, X5, X4, 8 ; XTMP0 = W[-7] + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + mov y0, e ; y0 = e + mov y1, a ; y1 = a + MY_ROR y0, (41-18) ; y0 = e >> (41-18) + vpaddq XTMP0, XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + xor y0, e ; y0 = e ^ (e >> (41-18)) + mov y2, f ; y2 = f + MY_ROR y1, (39-34) ; y1 = a >> (39-34) + ;; compute s0 + vpalignr XTMP1, X1, X0, 8 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (39-34) + MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + vpsllq XTMP2, XTMP1, (64-1) + xor y2, g ; y2 = f^g + MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + vpsrlq XTMP3, XTMP1, 1 + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) + and y2, e ; y2 = (f^g)&e + MY_ROR y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + vpor XTMP2, XTMP2, XTMP3 ; XTMP2 = W[-15] ror 1 + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + vpsrlq XTMP3, XTMP1, 8 + add y2, [rsp + _XFER + 0*8] ; y2 = k + w + S1 + CH + MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + mov y0, a ; y0 = a + vpsllq X0, XTMP1, (64-8) + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + vpor X0, X0, XTMP3 + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + vpsrlq XTMP1, XTMP1, 7 ; X0 = W[-15] >> 7 + add h, y1 ; h = t1 
+ S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + vpxor XTMP1, XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 ^ W[-15] ror 8 + add h, y0 ; h = t1 + S0 + MAJ + vpxor XTMP1, XTMP1, X0 ; XTMP1 = s0 + + +ROTATE_ARGS + ;; compute s1 + vpaddq XTMP0, XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + mov y0, e ; y0 = e + mov y1, a ; y1 = a + MY_ROR y0, (41-18) ; y0 = e >> (41-18) + vpsllq XTMP3, X7, (64-19) + xor y0, e ; y0 = e ^ (e >> (41-18)) + mov y2, f ; y2 = f + MY_ROR y1, (39-34) ; y1 = a >> (39-34) + vpsrlq X0, X7, 19 + xor y1, a ; y1 = a ^ (a >> (39-34) + MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + vpor XTMP3, XTMP3, X0 ; XTMP3 = W[-2] ror 19 + xor y2, g ; y2 = f^g + MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + vpsllq XTMP2, X7, (64-61) + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) + and y2, e ; y2 = (f^g)&e + MY_ROR y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + vpsrlq XTMP1, X7, 61 + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + vpor XTMP2, XTMP2, XTMP1 ; XTMP2 = W[-2] ror 61 + add y2, [rsp + _XFER + 1*8] ; y2 = k + w + S1 + CH + MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + mov y0, a ; y0 = a + vpsrlq XTMP1, X7, 6 ; XTMP1 = W[-2] >> 6 + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + vpxor XTMP1, XTMP1, XTMP2 + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + vpxor X0, XTMP3, XTMP1 ; X0 = s1 + add h, y1 ; h = t1 + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = t1 + S0 + MAJ + vpaddq X0, X0, XTMP0 ; X0 = {W[1], W[0]} + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 8] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + MY_ROR y0, (41-18) ; y0 = e >> (41-18) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (41-18)) + MY_ROR y1, (39-34) ; y1 = a >> (39-34) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (39-34) + MY_ROR y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (25-6)) + MY_ROR y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + MY_ROR y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + MY_ROR y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + add y2, [rsp + _XFER + %1*8] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = t1 + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = t1 + S0 + MAJ + ROTATE_ARGS +%endm + +section .data +default rel +align 64 +K512: + dq 0x428a2f98d728ae22,0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + dq 0x3956c25bf348b538,0x59f111f1b605d019 + dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242,0x12835b0145706fbe + dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235,0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + dq 0x983e5152ee66dfab,0xa831c66d2db43210 + dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 + dq 0x06ca6351e003826f,0x142929670a0e6e70 + dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 + dq 
0x4d2c6dfc5ac42aed,0x53380d139d95b3df + dq 0x650a73548baf63de,0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6,0x92722c851482353b + dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 + dq 0xc24b8b70d0f89791,0xc76c51a30654be30 + dq 0xd192e819d6ef5218,0xd69906245565a910 + dq 0xf40e35855771202a,0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc,0x78a5636f43172f60 + dq 0x84c87814a1f0ab72,0x8cc702081a6439ec + dq 0x90befffa23631e28,0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915,0xc67178f2e372532b + dq 0xca273eceea26619c,0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae,0x1b710b35131c471b + dq 0x28db77f523047d84,0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +h0: dq H0 +h1: dq H1 +h2: dq H2 +h3: dq H3 +h4: dq H4 +h5: dq H5 +h6: dq H6 +h7: dq H7 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void FUNC(void *input_data, UINT64 digest[8]) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +section .text +MKGLOBAL(FUNC,function,) +align 32 +FUNC: + push rbx +%ifndef LINUX + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_size +%ifndef LINUX + vmovdqa [rsp + _XMM_SAVE + 0*16],xmm6 + vmovdqa [rsp + _XMM_SAVE + 1*16],xmm7 + vmovdqa [rsp + _XMM_SAVE + 2*16],xmm8 + vmovdqa [rsp + _XMM_SAVE + 3*16],xmm9 + vmovdqa [rsp + _XMM_SAVE + 4*16],xmm10 + vmovdqa [rsp + _XMM_SAVE + 5*16],xmm11 + vmovdqa [rsp + _XMM_SAVE + 6*16],xmm12 + vmovdqa [rsp + _XMM_SAVE + 7*16],xmm13 +%endif + + ;; load initial digest + mov a,[rel h0] + mov b,[rel h1] + mov c,[rel h2] + mov d,[rel h3] + mov e,[rel h4] + mov f,[rel h5] + mov g,[rel h6] + mov h,[rel h7] + + vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + + lea TBL,[rel K512] + + ;; byte swap first 16 qwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X4, [INP + 4*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X5, [INP + 5*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X6, [INP + 6*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X7, [INP + 7*16], BYTE_FLIP_MASK + + ;; schedule 64 input qwords, by doing 4 iterations of 16 rounds + mov SRND, 4 +align 16 +loop1: + +%assign i 0 +%rep 7 + vpaddq XFER, X0, [TBL + i*16] + vmovdqa [rsp + _XFER], XFER + TWO_ROUNDS_AND_SCHED +%assign i (i+1) +%endrep + + vpaddq XFER, X0, [TBL + 7*16] + vmovdqa [rsp + _XFER], XFER + add TBL, 8*16 + TWO_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 + jmp loop2a +loop2: + vmovdqa X0, X4 + vmovdqa X1, X5 + vmovdqa X2, X6 + vmovdqa X3, X7 + +loop2a: + vpaddq X0, X0, [TBL + 0*16] + vmovdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + + vpaddq X1, X1, [TBL + 1*16] + vmovdqa [rsp + _XFER], X1 + DO_ROUND 0 + DO_ROUND 1 + + vpaddq X2, X2, [TBL + 2*16] + vmovdqa [rsp + _XFER], X2 + DO_ROUND 0 + DO_ROUND 1 + + vpaddq X3, X3, [TBL + 3*16] + vmovdqa [rsp + _XFER], X3 + add TBL, 4*16 + DO_ROUND 0 + DO_ROUND 1 + + sub SRND, 1 
+ jne loop2 + + add a,[rel h0] + add b,[rel h1] + add c,[rel h2] + add d,[rel h3] + add e,[rel h4] + add f,[rel h5] + add g,[rel h6] + mov [8*0 + CTX],a + mov [8*1 + CTX],b + mov [8*2 + CTX],c + mov [8*3 + CTX],d + mov [8*4 + CTX],e + mov [8*5 + CTX],f + mov [8*6 + CTX],g + add h,[rel h7] + mov [8*7 + CTX],h + +done_hash: +%ifndef LINUX + vmovdqa xmm6,[rsp + _XMM_SAVE + 0*16] + vmovdqa xmm7,[rsp + _XMM_SAVE + 1*16] + vmovdqa xmm8,[rsp + _XMM_SAVE + 2*16] + vmovdqa xmm9,[rsp + _XMM_SAVE + 3*16] + vmovdqa xmm10,[rsp + _XMM_SAVE + 4*16] + vmovdqa xmm11,[rsp + _XMM_SAVE + 5*16] + vmovdqa xmm12,[rsp + _XMM_SAVE + 6*16] + vmovdqa xmm13,[rsp + _XMM_SAVE + 7*16] +%endif + + add rsp, STACK_size + + pop r15 + pop r14 + pop r13 + pop rbp +%ifndef LINUX + pop rdi + pop rsi +%endif + pop rbx + + ret + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm new file mode 100644 index 00000000..fbffcad6 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha512_x2_avx.asm @@ -0,0 +1,371 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +;; code to compute SHA512 by-2 using AVX +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +%include "os.asm" +%include "mb_mgr_datastruct.asm" +extern K512_2 + +section .data +default rel + +align 32 +; one from sha512_rorx +; this does the big endian to little endian conversion +; over a quad word +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + ;ddq 0x18191a1b1c1d1e1f1011121314151617 + dq 0x1011121314151617, 0x18191a1b1c1d1e1f + +section .text + +%ifdef LINUX ; Linux definitions +%define arg1 rdi +%define arg2 rsi +%else ; Windows definitions +%define arg1 rcx +%define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND r8 +%define TBL r11 + +%define inp0 r9 +%define inp1 r10 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + + + +%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 80*SZ2 + +; Define stack usage + +struc STACK +_DATA: resb SZ2 * 16 +_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define VMOVPD vmovupd + +; transpose r0, r1, t0 +; Input looks like {r0 r1} +; r0 = {a1 a0} +; r1 = {b1 b0} +; +; output looks like +; r0 = {b0, a0} +; t0 = {b1, a1} + +%macro TRANSPOSE 3 +%define %%r0 %1 +%define %%r1 %2 +%define %%t0 %3 + vshufpd %%t0, %%r0, %%r1, 11b ; t0 = b1 a1 + vshufpd %%r0, %%r0, %%r1, 00b ; r0 = b0 a0 +%endm + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORQ reg, imm, tmp +; packed-rotate-right-double +; does a rotate by doing two shifts and an or +%macro PRORQ 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpsllq %%tmp, %%reg, (64-(%%imm)) + vpsrlq %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORQ_nd reg, imm, tmp, src +%macro PRORQ_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + vpsllq %%tmp, %%src, (64-(%%imm)) + vpsrlq %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORQ dst/src, amt +%macro PRORQ 2 + PRORQ %1, %2, TMP +%endmacro + +; PRORQ_nd dst, src, amt +%macro PRORQ_nd 3 + PRORQ_nd %1, %3, TMP, %2 +%endmacro + + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41) + vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1 + vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) + vpaddq h, h, a2 ; h = h + 
ch + PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6) + vpaddq h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + vmovdqa %%T1, a ; maj: T1 = a + PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39) + vpxor %%T1, %%T1, c ; maj: T1 = a^c + add ROUND, SZ2 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddq h, h, a0 + + vpaddq d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddq h, h, a1 ; h = h + ch + W + K + maj + vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0 + ROTATE_ARGS + +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA] + vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA] + vmovdqa a0, %%T1 + PRORQ %%T1, 8-1 + vmovdqa a2, a1 + PRORQ a1, 61-19 + vpxor %%T1, %%T1, a0 + PRORQ %%T1, 1 + vpxor a1, a1, a2 + PRORQ a1, 19 + vpsrlq a0, a0, 7 + vpxor %%T1, %%T1, a0 + vpsrlq a2, a2, 6 + vpxor a1, a1, a2 + vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA] + vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA] + vpaddq %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i + +%endm + + + +;; SHA512_ARGS: +;; UINT128 digest[8]; // transposed digests +;; UINT8 *data_ptr[2]; +;; + +;; void sha512_x2_avx(SHA512_ARGS *args, UINT64 msg_size_in_blocks) +;; arg 1 : STATE : pointer args +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +MKGLOBAL(sha512_x2_avx,function,internal) +align 32 +sha512_x2_avx: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. 
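
Here "pre-transposed" means the digest in SHA512_ARGS is stored row-major: each 16-byte row holds 64-bit digest word i of both lanes side by side, so every vmovdqa below fills one XMM register with word i of lane 0 and lane 1. A minimal C sketch of that layout, with hypothetical names standing in for the real mb_mgr_datastruct.asm definitions:

    #include <stdint.h>

    #define SHA512_X2_LANES 2

    /* Illustrative only: row i maps onto one 128-bit load in the code below. */
    typedef struct {
        uint64_t digest[8][SHA512_X2_LANES];      /* digest[i][lane]          */
        const uint8_t *data_ptr[SHA512_X2_LANES];
    } sha512_x2_args_sketch;

    /* Scatter one lane's conventional 8-word digest into the row-major form. */
    static void lane_digest_to_rows(sha512_x2_args_sketch *args,
                                    unsigned lane, const uint64_t d[8])
    {
        for (int i = 0; i < 8; i++)
            args->digest[i][lane] = d[i];
    }

Because the state already arrives in this form, the loads that follow need no shuffling:
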
+ vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE] + vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE] + vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE] + vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE] + vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE] + vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE] + vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE] + vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE] + + lea TBL,[rel K512_2] + + ;; load the address of each of the 2 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ] + + xor IDX, IDX +lloop: + + xor ROUND, ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ2], a + vmovdqa [rsp + _DIGEST + 1*SZ2], b + vmovdqa [rsp + _DIGEST + 2*SZ2], c + vmovdqa [rsp + _DIGEST + 3*SZ2], d + vmovdqa [rsp + _DIGEST + 4*SZ2], e + vmovdqa [rsp + _DIGEST + 5*SZ2], f + vmovdqa [rsp + _DIGEST + 6*SZ2], g + vmovdqa [rsp + _DIGEST + 7*SZ2], h + +%assign i 0 +%rep 8 + ;; load up the shuffler for little-endian to big-endian format + vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] + VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits + VMOVPD TT2,[inp1+IDX+i*16] + + TRANSPOSE TT0, TT2, TT1 + vpshufb TT0, TT0, TMP + vpshufb TT1, TT1, TMP + + ROUND_00_15 TT0,(i*2+0) + ROUND_00_15 TT1,(i*2+1) +%assign i (i+1) +%endrep + +;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes) + add IDX, 8 * 16 + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddq a, a, [rsp + _DIGEST + 0*SZ2] + vpaddq b, b, [rsp + _DIGEST + 1*SZ2] + vpaddq c, c, [rsp + _DIGEST + 2*SZ2] + vpaddq d, d, [rsp + _DIGEST + 3*SZ2] + vpaddq e, e, [rsp + _DIGEST + 4*SZ2] + vpaddq f, f, [rsp + _DIGEST + 5*SZ2] + vpaddq g, g, [rsp + _DIGEST + 6*SZ2] + vpaddq h, h, [rsp + _DIGEST + 7*SZ2] + + sub INP_SIZE, 1 ;; consumed one message block + jne lloop + + ; write back to memory (state object) the transposed digest + vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a + vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b + vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c + vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d + vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e + vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f + vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g + vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, STACK_size + + ; outer calling routine restores XMM and other GP registers + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm b/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm new file mode 100644 index 00000000..8458b866 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/sha_256_mult_avx.asm @@ -0,0 +1,381 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; code to compute quad SHA256 using AVX +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12 +;; Windows preserves: rcx rsi rdi rbp r12 r14 r15 +;; +;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12 +;; Linux preserves: rcx rdx rdi rbp r13 r14 r15 +;; +;; clobbers xmm0-15 + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +extern K256_4 + +%ifdef LINUX + %define arg1 rdi + %define arg2 rsi +%else + ; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND rbx +%define TBL r12 + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + +%define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 64*SZ4 + +; Define stack usage +struc STACK +_DATA: resb SZ4 * 16 +_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define VMOVPS vmovups + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + vshufps %%t0, %%t0, 
%%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + vpslld %%tmp, %%reg, (32-(%%imm)) + vpsrld %%reg, %%reg, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; non-destructive +; PRORD_nd reg, imm, tmp, src +%macro PRORD_nd 4 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 +%define %%src %4 + ;vmovdqa %%tmp, %%reg + vpslld %%tmp, %%src, (32-(%%imm)) + vpsrld %%reg, %%src, %%imm + vpor %%reg, %%reg, %%tmp +%endmacro + +; PRORD dst/src, amt +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +; PRORD_nd dst, src, amt +%macro PRORD_nd 3 + PRORD_nd %1, %3, TMP, %2 +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5) + + vpxor a2, f, g ; ch: a2 = f^g + vpand a2, a2, e ; ch: a2 = (f^g)&e + vpxor a2, a2, g ; a2 = ch + + PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25) + vmovdqa [SZ4*(%%i&0xf) + rsp + _DATA], %%T1 + vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K + vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + vpaddd h, h, a2 ; h = h + ch + PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11) + vpaddd h, h, %%T1 ; h = h + ch + W + K + vpxor a0, a0, a1 ; a0 = sigma1 + PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22) + vpxor %%T1, a, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b + vpaddd h, h, a0 + + vpaddd d, d, h + + vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + vpxor a2, a2, a1 ; a2 = sig0 + vpand a1, a, c ; maj: a1 = a&c + vpor a1, a1, %%T1 ; a1 = maj + vpaddd h, h, a1 ; h = h + ch + W + K + maj + vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA] + vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA] + vmovdqa a0, %%T1 + PRORD %%T1, 18-7 + vmovdqa a2, a1 + PRORD a1, 19-17 + vpxor %%T1, %%T1, a0 + PRORD %%T1, 7 + vpxor a1, a1, a2 + PRORD a1, 17 + vpsrld a0, a0, 3 + vpxor %%T1, %%T1, a0 + vpsrld a2, a2, 10 + vpxor a1, a1, a2 + vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA] + vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA] + vpaddd %%T1, %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + +section .data +default rel +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +;; SHA256_ARGS: +;; UINT128 digest[8]; // transposed digests +;; UINT8 *data_ptr[4]; +;; + +;; void sha_256_mult_avx(SHA256_ARGS *args, UINT64 num_blocks); +;; arg 1 : STATE : pointer args +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +MKGLOBAL(sha_256_mult_avx,function,internal) +align 16 +sha_256_mult_avx: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. 
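
The SHA-256 state uses the same row-major layout, four lanes wide, so the loads below again pull digest word i of every lane into a single XMM register. The extra work in this kernel is gathering the four message streams: each loop iteration reads 16 bytes per lane, the TRANSPOSE macro rearranges the 4x4 block of 32-bit words so that word j of all four lanes shares one register, and vpshufb applies PSHUFFLE_BYTE_FLIP_MASK to convert each word to big-endian. A scalar C sketch of that gather, with illustrative names only:

    #include <stdint.h>

    /* Models the byte flip done by vpshufb with PSHUFFLE_BYTE_FLIP_MASK. */
    static uint32_t load_be32(const uint8_t *p)
    {
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
    }

    /* out[j][lane] models one XMM register holding message word j of all
     * four lanes; in[lane] points at the current 16-byte chunk of that
     * lane's data (i.e. data_ptr[lane] + IDX + i*16 in the loop below). */
    static void gather_4x4(uint32_t out[4][4], const uint8_t *const in[4])
    {
        for (int j = 0; j < 4; j++)
            for (int lane = 0; lane < 4; lane++)
                out[j][lane] = load_be32(in[lane] + 4 * j);
    }
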
+ vmovdqa a,[STATE+0*SHA256_DIGEST_ROW_SIZE] + vmovdqa b,[STATE+1*SHA256_DIGEST_ROW_SIZE] + vmovdqa c,[STATE+2*SHA256_DIGEST_ROW_SIZE] + vmovdqa d,[STATE+3*SHA256_DIGEST_ROW_SIZE] + vmovdqa e,[STATE+4*SHA256_DIGEST_ROW_SIZE] + vmovdqa f,[STATE+5*SHA256_DIGEST_ROW_SIZE] + vmovdqa g,[STATE+6*SHA256_DIGEST_ROW_SIZE] + vmovdqa h,[STATE+7*SHA256_DIGEST_ROW_SIZE] + + lea TBL,[rel K256_4] + + ;; load the address of each of the 4 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ] + mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ] + mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + vmovdqa [rsp + _DIGEST + 0*SZ4], a + vmovdqa [rsp + _DIGEST + 1*SZ4], b + vmovdqa [rsp + _DIGEST + 2*SZ4], c + vmovdqa [rsp + _DIGEST + 3*SZ4], d + vmovdqa [rsp + _DIGEST + 4*SZ4], e + vmovdqa [rsp + _DIGEST + 5*SZ4], f + vmovdqa [rsp + _DIGEST + 6*SZ4], g + vmovdqa [rsp + _DIGEST + 7*SZ4], h + +%assign i 0 +%rep 4 + vmovdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] + VMOVPS TT2,[inp0+IDX+i*16] + VMOVPS TT1,[inp1+IDX+i*16] + VMOVPS TT4,[inp2+IDX+i*16] + VMOVPS TT3,[inp3+IDX+i*16] + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + vpshufb TT0, TT0, TMP + vpshufb TT1, TT1, TMP + vpshufb TT2, TT2, TMP + vpshufb TT3, TT3, TMP + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) +%assign i (i+1) +%endrep + add IDX, 4*4*4 + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + vpaddd a, a, [rsp + _DIGEST + 0*SZ4] + vpaddd b, b, [rsp + _DIGEST + 1*SZ4] + vpaddd c, c, [rsp + _DIGEST + 2*SZ4] + vpaddd d, d, [rsp + _DIGEST + 3*SZ4] + vpaddd e, e, [rsp + _DIGEST + 4*SZ4] + vpaddd f, f, [rsp + _DIGEST + 5*SZ4] + vpaddd g, g, [rsp + _DIGEST + 6*SZ4] + vpaddd h, h, [rsp + _DIGEST + 7*SZ4] + + sub INP_SIZE, 1 ;; unit is blocks + jne lloop + + ; write back to memory (state object) the transposed digest + vmovdqa [STATE+0*SHA256_DIGEST_ROW_SIZE],a + vmovdqa [STATE+1*SHA256_DIGEST_ROW_SIZE],b + vmovdqa [STATE+2*SHA256_DIGEST_ROW_SIZE],c + vmovdqa [STATE+3*SHA256_DIGEST_ROW_SIZE],d + vmovdqa [STATE+4*SHA256_DIGEST_ROW_SIZE],e + vmovdqa [STATE+5*SHA256_DIGEST_ROW_SIZE],f + vmovdqa [STATE+6*SHA256_DIGEST_ROW_SIZE],g + vmovdqa [STATE+7*SHA256_DIGEST_ROW_SIZE],h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha256 + 0*8], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha256 + 1*8], inp1 + add inp2, IDX + mov [STATE + _data_ptr_sha256 + 2*8], inp2 + add inp3, IDX + mov [STATE + _data_ptr_sha256 + 3*8], inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, STACK_size + ; outer calling routine restores XMM and other GP registers + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif |
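
A closing note on the message schedule shared by the SHA-256 and SHA-512 kernels above: the ROUND_16_XX macros keep a 16-entry circular window of schedule words in the _DATA area on the stack and extend it with the small sigma functions, and because AVX has no packed rotate instruction the PRORD/PRORQ macros build each rotate from two shifts and an OR. A scalar C model of the SHA-256 variant (helper names are illustrative):

    #include <stdint.h>

    /* PRORD emulates this rotate with vpsrld + vpslld + vpor. */
    static inline uint32_t ror32(uint32_t x, unsigned r)
    {
        return (x >> r) | (x << (32 - r));
    }

    static inline uint32_t small_sigma0(uint32_t x)  /* rotr 7, 18; shr 3   */
    {
        return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
    }

    static inline uint32_t small_sigma1(uint32_t x)  /* rotr 17, 19; shr 10 */
    {
        return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
    }

    /* w[16] is the circular schedule window; this produces schedule word i
     * (16 <= i < 64) the same way ROUND_16_XX feeds it to ROUND_00_15. */
    static uint32_t schedule_word(uint32_t w[16], unsigned i)
    {
        uint32_t t = small_sigma1(w[(i - 2) & 0xf]) + w[(i - 7) & 0xf] +
                     small_sigma0(w[(i - 15) & 0xf]) + w[(i - 16) & 0xf];
        w[i & 0xf] = t;
        return t;
    }

The SHA-512 schedule in sha512_x2_avx.asm has the same structure, with 64-bit words, rotate amounts 1 and 8 with shift 7 for sigma0, and rotate amounts 19 and 61 with shift 6 for sigma1.
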
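Finally, the outer shape of the two multi-buffer routines above (sha512_x2_avx and sha_256_mult_avx) is the same: consume INP_SIZE blocks from every lane in lockstep, then advance each lane's data pointer in the state by the number of bytes processed. A hedged C sketch of that driver shape for the four-lane SHA-256 case (all names hypothetical; the block compression itself is deliberately left as a stub):

    #include <stddef.h>
    #include <stdint.h>

    #define LANES_SKETCH 4
    #define SHA256_BLOCK_BYTES 64           /* bytes per lane per block */

    typedef struct {
        uint32_t digest[8][LANES_SKETCH];   /* row-major, as sketched above */
        const uint8_t *data_ptr[LANES_SKETCH];
    } sha256_mb_args_sketch;

    /* Stand-in for the lloop body: transpose + 64 rounds + feed-forward.
     * Intentionally empty; the assembly above is the real implementation. */
    static void sha256_4way_block(sha256_mb_args_sketch *args, size_t idx)
    {
        (void)args;
        (void)idx;
    }

    static void sha256_mult_sketch(sha256_mb_args_sketch *args,
                                   uint64_t num_blocks)
    {
        size_t idx = 0;                     /* plays the role of IDX        */

        for (uint64_t blk = 0; blk < num_blocks; blk++) {
            sha256_4way_block(args, idx);   /* reads data_ptr[lane] + idx   */
            idx += SHA256_BLOCK_BYTES;      /* "add IDX, 4*4*4"             */
        }

        for (int lane = 0; lane < LANES_SKETCH; lane++)
            args->data_ptr[lane] += idx;    /* "update input pointers"      */
    }

Writing the advanced pointers back into the state object is what lets the outer manager code call the kernel repeatedly as more data becomes available for each lane.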