diff options
Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm')
-rw-r--r-- | src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm | 344 |
1 file changed, 344 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm b/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm new file mode 100644 index 000000000..6a8f100ec --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/aes256_cbc_dec_by8_avx.asm @@ -0,0 +1,344 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES256 CBC decrypt "by8" + +; XMM registers are clobbered. 
;; Saving/restoring must be done at a higher level

%include "include/os.asm"

;; Token-pasting helper: CONCAT(xdata,3) expands to xdata3.
%define CONCAT(a,b) a %+ b
;; Unaligned loads/stores for ciphertext/plaintext buffers
;; (round keys use vmovdqa -- presumably 16-byte aligned; confirm with caller).
%define VMOVDQ vmovdqu

;; xdata0..xdata7: up to 8 blocks decrypted in parallel ("by8").
%define xdata0 xmm0
%define xdata1 xmm1
%define xdata2 xmm2
%define xdata3 xmm3
%define xdata4 xmm4
%define xdata5 xmm5
%define xdata6 xmm6
%define xdata7 xmm7
;; xIV: CBC chaining value (the IV, then the previous ciphertext block).
%define xIV   xmm8
;; Round keys 0/3/6/9/12 are kept resident in registers so the main loop
;; (do_aes_noload) avoids reloading them; the remaining round keys rotate
;; through the two temporaries xkeyA/xkeyB.
%define xkey0  xmm9
%define xkey3  xmm10
%define xkey6  xmm11
%define xkey9  xmm12
%define xkey12 xmm13
%define xkeyA  xmm14
%define xkeyB  xmm15

;; Argument registers: System V AMD64 on Linux, Microsoft x64 otherwise.
%ifdef LINUX
%define p_in		rdi	; ciphertext input
%define p_IV		rsi	; 16-byte IV
%define p_keys		rdx	; expanded decryption key schedule (15 x 16 bytes)
%define p_out		rcx	; plaintext output
%define num_bytes	r8	; length to process
%else
%define p_in		rcx
%define p_IV		rdx
%define p_keys		r8
%define p_out		r9
%define num_bytes	rax	; Win64: 5th arg is loaded from the stack into rax
%endif

%define tmp	r10

;; Decrypt %1 blocks, loading the resident round keys first.
%macro do_aes_load 1
	do_aes %1, 1
%endmacro

;; Decrypt %1 blocks, reusing the already-resident round keys.
%macro do_aes_noload 1
	do_aes %1, 0
%endmacro

; do_aes num_in_par load_keys
;
; Decrypts %%by AES-256 blocks in CBC mode:
;   - round 0 AddRoundKey (vpxor), rounds 1..13 vaesdec, round 14 vaesdeclast;
;     each round is applied across all %%by blocks before the next round key
;     is used, interleaving independent work to hide AESDEC latency
;   - then XORs each decrypted block with the preceding ciphertext block
;     (CBC chaining) and stores the plaintext
; This increments p_in, but not p_out
%macro do_aes 2
%define %%by		%1
%define %%load_keys	%2

%if (%%load_keys)
	vmovdqa	xkey0, [p_keys + 0*16]
%endif

	;; load %%by ciphertext blocks
%assign i 0
%rep %%by
	VMOVDQ	CONCAT(xdata,i), [p_in  + i*16]
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 1*16]

	;; round 0: AddRoundKey
%assign i 0
%rep %%by
	vpxor	CONCAT(xdata,i), CONCAT(xdata,i), xkey0
%assign i (i+1)
%endrep

	vmovdqa	xkeyB, [p_keys + 2*16]

	;; round 1
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
%assign i (i+1)
%endrep

	;; advance input now; the CBC xor below indexes backwards from p_in
	add	p_in, 16*%%by

%if (%%load_keys)
	vmovdqa	xkey3, [p_keys + 3*16]
%endif

	;; round 2
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 4*16]

	;; round 3
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkey3
%assign i (i+1)
%endrep

	vmovdqa	xkeyB, [p_keys + 5*16]

	;; round 4
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
%assign i (i+1)
%endrep

%if (%%load_keys)
	vmovdqa	xkey6, [p_keys + 6*16]
%endif

	;; round 5
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 7*16]

	;; round 6
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkey6
%assign i (i+1)
%endrep

	vmovdqa	xkeyB, [p_keys + 8*16]

	;; round 7
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
%assign i (i+1)
%endrep

%if (%%load_keys)
	vmovdqa	xkey9, [p_keys + 9*16]
%endif

	;; round 8
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 10*16]

	;; round 9
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkey9
%assign i (i+1)
%endrep

	vmovdqa	xkeyB, [p_keys + 11*16]

	;; round 10
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
%assign i (i+1)
%endrep

%if (%%load_keys)
	vmovdqa	xkey12, [p_keys + 12*16]
%endif

	;; round 11
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
%assign i (i+1)
%endrep

	vmovdqa	xkeyA, [p_keys + 13*16]

	;; round 12
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkey12
%assign i (i+1)
%endrep

	vmovdqa	xkeyB, [p_keys + 14*16]

	;; round 13
%assign i 0
%rep %%by
	vaesdec	CONCAT(xdata,i), CONCAT(xdata,i), xkeyA
%assign i (i+1)
%endrep

	;; round 14 (final)
%assign i 0
%rep %%by
	vaesdeclast	CONCAT(xdata,i), CONCAT(xdata,i), xkeyB
%assign i (i+1)
%endrep

	;; CBC chaining: plaintext[i] = decrypted[i] XOR ciphertext[i-1].
	;; xIV holds the chaining block carried in from the previous batch.
	;; p_in was already advanced, so ciphertext block i of this batch is
	;; at [p_in + i*16 - 16*%%by].
	vpxor	xdata0, xdata0, xIV
%assign i 1
%if (%%by > 1)
%rep (%%by - 1)
	VMOVDQ	xIV, [p_in  + (i-1)*16 - 16*%%by]
	vpxor	CONCAT(xdata,i), CONCAT(xdata,i), xIV
%assign i (i+1)
%endrep
%endif
	;; keep the last ciphertext block of this batch as the next chaining value
	VMOVDQ	xIV, [p_in  + (i-1)*16 - 16*%%by]

	;; store %%by plaintext blocks
%assign i 0
%rep %%by
	VMOVDQ	[p_out  + i*16], CONCAT(xdata,i)
%assign i (i+1)
%endrep
%endmacro

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

section .text

;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
MKGLOBAL(aes_cbc_dec_256_avx,function,internal)
+aes_cbc_dec_256_avx: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + vmovdqu xIV, [p_IV] + + mov tmp, num_bytes + and tmp, 7*16 + jz mult_of_8_blks + + ; 1 <= tmp <= 7 + cmp tmp, 4*16 + jg gt4 + je eq4 + +lt4: + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq2: + do_aes_load 2 + add p_out, 2*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq3: + do_aes_load 3 + add p_out, 3*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq4: + do_aes_load 4 + add p_out, 4*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +gt4: + cmp tmp, 6*16 + jg eq7 + je eq6 + +eq5: + do_aes_load 5 + add p_out, 5*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq6: + do_aes_load 6 + add p_out, 6*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +eq7: + do_aes_load 7 + add p_out, 7*16 + and num_bytes, ~7*16 + jz do_return2 + jmp main_loop2 + +mult_of_8_blks: + vmovdqa xkey0, [p_keys + 0*16] + vmovdqa xkey3, [p_keys + 3*16] + vmovdqa xkey6, [p_keys + 6*16] + vmovdqa xkey9, [p_keys + 9*16] + vmovdqa xkey12, [p_keys + 12*16] + +main_loop2: + ; num_bytes is a multiple of 8 and >0 + do_aes_noload 8 + add p_out, 8*16 + sub num_bytes, 8*16 + jne main_loop2 + +do_return2: +; Don't write back IV +; vmovdqu [p_IV], xIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif |