diff options
Diffstat (limited to 'src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm')
-rw-r--r-- | src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm | 532 |
1 files changed, 532 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm new file mode 100644 index 000000000..7c57688ff --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm @@ -0,0 +1,532 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES cbc decrypt on 16n bytes doing AES by 4 + +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +; void aes_cbc_dec_128_sse(void *in, +; UINT128 *IV, +; UINT128 keys[11], +; void *out, +; UINT64 len_bytes); +; +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +; +%include "include/os.asm" + +%ifndef AES_CBC_DEC_128 +%define AES_CBC_DEC_128 aes_cbc_dec_128_sse +%endif + +%define MOVDQ movdqu + +%ifdef LINUX +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%else +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%endif + +%define IDX rax +%define TMP IDX +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XKEY0 xmm4 +%define XKEY2 xmm5 +%define XKEY4 xmm6 +%define XKEY6 xmm7 +%define XKEY8 xmm8 +%define XKEY10 xmm9 +%define XIV xmm10 +%define XSAVED0 xmm11 +%define XSAVED1 xmm12 +%define XSAVED2 xmm13 +%define XSAVED3 xmm14 +%define XKEY xmm15 + +%define IV_TMP XSAVED3 + +section .text + +MKGLOBAL(AES_CBC_DEC_128,function,internal) +AES_CBC_DEC_128: +%ifndef LINUX + mov LEN, [rsp + 8*5] +%endif + + mov TMP, LEN + and TMP, 3*16 + jz initial_4 + cmp TMP, 2*16 + jb initial_1 + ja initial_3 + +initial_2: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XIV, XDATA1 + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + aesdec XDATA1, [KEYS + 1*16] + + mov IDX, 2*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + aesdec XDATA1, [KEYS + 3*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + aesdec XDATA1, [KEYS + 5*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + aesdec XDATA1, [KEYS + 7*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + aesdec XDATA1, [KEYS + 9*16] + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + + cmp LEN, 2*16 + je done + jmp main_loop + + + align 16 +initial_1: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XIV, XDATA0 + + pxor XDATA0, XKEY0 ; 0. ARK + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + + mov IDX, 1*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + + aesdec XDATA0, XKEY6 ; 6. DEC + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + + aesdec XDATA0, XKEY8 ; 8. DEC + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + + aesdeclast XDATA0, XKEY10 ; 10. DEC + + pxor XDATA0, IV_TMP + + movdqu [OUT + 0*16], XDATA0 + + cmp LEN, 1*16 + je done + jmp main_loop + + +initial_3: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XIV, XDATA2 + + movdqa XKEY, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY ; 1. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 3*16] + + mov IDX, 3*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY ; 3. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 5*16] + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY ; 5. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, XKEY ; 7. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 9*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + aesdec XDATA2, XKEY8 + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY ; 9. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + aesdeclast XDATA2, XKEY10 + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + + cmp LEN, 3*16 + je done + jmp main_loop + + + align 16 +initial_4: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + movdqu XDATA3, [IN + 3*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XIV, XDATA3 + + movdqa XKEY, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY ; 1. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 3*16] + + mov IDX, 4*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY ; 3. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 5*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY ; 5. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, XKEY ; 7. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 9*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + aesdec XDATA2, XKEY8 + aesdec XDATA3, XKEY8 + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY ; 9. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + aesdeclast XDATA2, XKEY10 + aesdeclast XDATA3, XKEY10 + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + movdqu [OUT + 3*16], XDATA3 + + cmp LEN, 4*16 + jz done + jmp main_loop + + align 16 +main_loop: + ; load cipher text + movdqu XDATA0, [IN + IDX + 0*16] + movdqu XDATA1, [IN + IDX + 1*16] + movdqu XDATA2, [IN + IDX + 2*16] + movdqu XDATA3, [IN + IDX + 3*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XSAVED3, XDATA3 + + movdqa XKEY, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + add IDX, 4*16 + + aesdec XDATA0, XKEY ; 1. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 3*16] + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + aesdec XDATA0, XKEY ; 3. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 5*16] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + aesdec XDATA0, XKEY ; 5. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + aesdec XDATA0, XKEY ; 7. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 9*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + aesdec XDATA2, XKEY8 + aesdec XDATA3, XKEY8 + + aesdec XDATA0, XKEY ; 9. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + aesdeclast XDATA2, XKEY10 + aesdeclast XDATA3, XKEY10 + + pxor XDATA0, XIV + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 + movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 + movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 + movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 + + movdqa XIV, XSAVED3 + + CMP IDX, LEN + jne main_loop + +done: +; Don't write back IV +; movdqu [IV], XIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif |