summaryrefslogtreecommitdiffstats
path: root/src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm
diff options
context:
space:
mode:
Diffstat (limited to 'src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm')
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm654
1 files changed, 654 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm
new file mode 100644
index 000000000..c4b767932
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_ecb_by4_sse.asm
@@ -0,0 +1,654 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+; routine to do AES ECB encrypt/decrypt on 16n bytes doing AES by 4
+
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_ecb_x_y_sse(void *in,
+; UINT128 keys[],
+; void *out,
+; UINT64 len_bytes);
+;
+; x = direction (enc/dec)
+; y = key size (128/192/256)
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: KEYS: pointer to keys
+; arg 3: OUT: pointer to output (plain text)
+; arg 4: LEN: length in bytes (multiple of 16)
+;
+
+%include "include/os.asm"
+
+%ifndef AES_ECB_ENC_128
+%define AES_ECB_ENC_128 aes_ecb_enc_128_sse
+%define AES_ECB_ENC_192 aes_ecb_enc_192_sse
+%define AES_ECB_ENC_256 aes_ecb_enc_256_sse
+%define AES_ECB_DEC_128 aes_ecb_dec_128_sse
+%define AES_ECB_DEC_192 aes_ecb_dec_192_sse
+%define AES_ECB_DEC_256 aes_ecb_dec_256_sse
+%endif
+
+%ifdef LINUX
+%define IN rdi
+%define KEYS rsi
+%define OUT rdx
+%define LEN rcx
+%else
+%define IN rcx
+%define KEYS rdx
+%define OUT r8
+%define LEN r9
+%endif
+
+%define IDX rax
+%define TMP IDX
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+%define XKEY0 xmm4
+%define XKEY2 xmm5
+%define XKEY4 xmm6
+%define XKEY6 xmm7
+%define XKEY10 xmm8
+%define XKEY_A xmm14
+%define XKEY_B xmm15
+
+section .text
+
+%macro AES_ECB 2
+%define %%NROUNDS %1 ; [in] Number of AES rounds, numerical value
+%define %%DIR %2 ; [in] Direction (encrypt/decrypt)
+
+%ifidn %%DIR, ENC
+%define AES aesenc
+%define AES_LAST aesenclast
+%else ; DIR = DEC
+%define AES aesdec
+%define AES_LAST aesdeclast
+%endif
+ mov TMP, LEN
+ and TMP, 3*16
+ jz %%initial_4
+ cmp TMP, 2*16
+ jb %%initial_1
+ ja %%initial_3
+
+%%initial_2:
+ ; load plain/cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, [KEYS + 1*16] ; 1. ENC
+ AES XDATA1, [KEYS + 1*16]
+
+ mov IDX, 2*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, [KEYS + 3*16] ; 3. ENC
+ AES XDATA1, [KEYS + 3*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, [KEYS + 5*16] ; 5. ENC
+ AES XDATA1, [KEYS + 5*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, [KEYS + 7*16] ; 7. ENC
+ AES XDATA1, [KEYS + 7*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ AES XDATA0, [KEYS + 9*16] ; 9. ENC
+ AES XDATA1, [KEYS + 9*16]
+
+%if %%NROUNDS >= 12
+ AES XDATA0, XKEY10 ; 10. ENC
+ AES XDATA1, XKEY10
+
+ AES XDATA0, [KEYS + 11*16] ; 11. ENC
+ AES XDATA1, [KEYS + 11*16]
+%endif
+
+%if %%NROUNDS == 14
+ AES XDATA0, [KEYS + 12*16] ; 12. ENC
+ AES XDATA1, [KEYS + 12*16]
+
+ AES XDATA0, [KEYS + 13*16] ; 13. ENC
+ AES XDATA1, [KEYS + 13*16]
+%endif
+
+%if %%NROUNDS == 10
+ AES_LAST XDATA0, XKEY10 ; 10. ENC
+ AES_LAST XDATA1, XKEY10
+%elif %%NROUNDS == 12
+ AES_LAST XDATA0, [KEYS + 12*16] ; 12. ENC
+ AES_LAST XDATA1, [KEYS + 12*16]
+%else
+ AES_LAST XDATA0, [KEYS + 14*16] ; 14. ENC
+ AES_LAST XDATA1, [KEYS + 14*16]
+%endif
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+
+ cmp LEN, 2*16
+ je %%done
+ jmp %%main_loop
+
+
+ align 16
+%%initial_1:
+ ; load plain/cipher text
+ movdqu XDATA0, [IN + 0*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, [KEYS + 1*16] ; 1. ENC
+
+ mov IDX, 1*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, [KEYS + 3*16] ; 3. ENC
+
+ AES XDATA0, XKEY4 ; 4. ENC
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, [KEYS + 5*16] ; 5. ENC
+
+ AES XDATA0, XKEY6 ; 6. ENC
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, [KEYS + 7*16] ; 7. ENC
+
+ AES XDATA0, XKEY_B ; 8. ENC
+
+ movdqa XKEY10, [KEYS + 10*16]
+
+ AES XDATA0, [KEYS + 9*16] ; 9. ENC
+
+%if %%NROUNDS >= 12
+ AES XDATA0, XKEY10 ; 10. ENC
+
+ AES XDATA0, [KEYS + 11*16] ; 11. ENC
+%endif
+
+%if %%NROUNDS == 14
+ AES XDATA0, [KEYS + 12*16] ; 12. ENC
+
+ AES XDATA0, [KEYS + 13*16] ; 13. ENC
+%endif
+
+%if %%NROUNDS == 10
+
+ AES_LAST XDATA0, XKEY10 ; 10. ENC
+%elif %%NROUNDS == 12
+ AES_LAST XDATA0, [KEYS + 12*16] ; 12. ENC
+%else
+ AES_LAST XDATA0, [KEYS + 14*16] ; 14. ENC
+%endif
+
+ movdqu [OUT + 0*16], XDATA0
+
+ cmp LEN, 1*16
+ je %%done
+ jmp %%main_loop
+
+
+%%initial_3:
+ ; load plain/cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+ movdqu XDATA2, [IN + 2*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ movdqa XKEY_A, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, XKEY_A ; 1. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 3*16]
+ mov IDX, 3*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+ AES XDATA2, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, XKEY_A ; 3. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 5*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+ AES XDATA2, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, XKEY_A ; 5. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 7*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+ AES XDATA2, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, XKEY_A ; 7. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 9*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 10*16]
+
+ AES XDATA0, XKEY_A ; 9. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+%if %%NROUNDS >= 12
+ movdqa XKEY_A, [KEYS + 11*16]
+
+ AES XDATA0, XKEY_B ; 10. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 12*16]
+
+ AES XDATA0, XKEY_A ; 11. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+
+%endif
+
+%if %%NROUNDS == 14
+ movdqa XKEY_A, [KEYS + 13*16]
+
+ AES XDATA0, XKEY_B ; 12. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 14*16]
+
+ AES XDATA0, XKEY_A ; 13. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+%endif
+
+ AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size)
+ AES_LAST XDATA1, XKEY_B
+ AES_LAST XDATA2, XKEY_B
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+ movdqu [OUT + 2*16], XDATA2
+
+ cmp LEN, 3*16
+ je %%done
+ jmp %%main_loop
+
+
+ align 16
+%%initial_4:
+ ; load plain/cipher text
+ movdqu XDATA0, [IN + 0*16]
+ movdqu XDATA1, [IN + 1*16]
+ movdqu XDATA2, [IN + 2*16]
+ movdqu XDATA3, [IN + 3*16]
+
+ movdqa XKEY0, [KEYS + 0*16]
+
+ movdqa XKEY_A, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+ pxor XDATA3, XKEY0
+
+ movdqa XKEY2, [KEYS + 2*16]
+
+ AES XDATA0, XKEY_A ; 1. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 3*16]
+
+ mov IDX, 4*16
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+ AES XDATA2, XKEY2
+ AES XDATA3, XKEY2
+
+ movdqa XKEY4, [KEYS + 4*16]
+
+ AES XDATA0, XKEY_A ; 3. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 5*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+ AES XDATA2, XKEY4
+ AES XDATA3, XKEY4
+
+ movdqa XKEY6, [KEYS + 6*16]
+
+ AES XDATA0, XKEY_A ; 5. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 7*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+ AES XDATA2, XKEY6
+ AES XDATA3, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, XKEY_A ; 7. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 9*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 10*16]
+
+ AES XDATA0, XKEY_A ; 9. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+%if %%NROUNDS >= 12
+ movdqa XKEY_A, [KEYS + 11*16]
+
+ AES XDATA0, XKEY_B ; 10. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 12*16]
+
+ AES XDATA0, XKEY_A ; 11. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+%if %%NROUNDS == 14
+ movdqa XKEY_A, [KEYS + 13*16]
+
+ AES XDATA0, XKEY_B ; 12. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 14*16]
+
+ AES XDATA0, XKEY_A ; 13. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+ AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size)
+ AES_LAST XDATA1, XKEY_B
+ AES_LAST XDATA2, XKEY_B
+ AES_LAST XDATA3, XKEY_B
+
+ movdqu [OUT + 0*16], XDATA0
+ movdqu [OUT + 1*16], XDATA1
+ movdqu [OUT + 2*16], XDATA2
+ movdqu [OUT + 3*16], XDATA3
+
+ cmp LEN, 4*16
+ jz %%done
+ jmp %%main_loop
+
+ align 16
+%%main_loop:
+ ; load plain/cipher text
+ movdqu XDATA0, [IN + IDX + 0*16]
+ movdqu XDATA1, [IN + IDX + 1*16]
+ movdqu XDATA2, [IN + IDX + 2*16]
+ movdqu XDATA3, [IN + IDX + 3*16]
+
+ movdqa XKEY_A, [KEYS + 1*16]
+
+ pxor XDATA0, XKEY0 ; 0. ARK
+ pxor XDATA1, XKEY0
+ pxor XDATA2, XKEY0
+ pxor XDATA3, XKEY0
+
+ add IDX, 4*16
+
+ AES XDATA0, XKEY_A ; 1. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 3*16]
+
+ AES XDATA0, XKEY2 ; 2. ENC
+ AES XDATA1, XKEY2
+ AES XDATA2, XKEY2
+ AES XDATA3, XKEY2
+
+ AES XDATA0, XKEY_A ; 3. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 5*16]
+
+ AES XDATA0, XKEY4 ; 4. ENC
+ AES XDATA1, XKEY4
+ AES XDATA2, XKEY4
+ AES XDATA3, XKEY4
+
+ AES XDATA0, XKEY_A ; 5. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 7*16]
+
+ AES XDATA0, XKEY6 ; 6. ENC
+ AES XDATA1, XKEY6
+ AES XDATA2, XKEY6
+ AES XDATA3, XKEY6
+
+ movdqa XKEY_B, [KEYS + 8*16]
+
+ AES XDATA0, XKEY_A ; 7. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+ movdqa XKEY_A, [KEYS + 9*16]
+
+ AES XDATA0, XKEY_B ; 8. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 10*16]
+
+ AES XDATA0, XKEY_A ; 9. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+
+%if %%NROUNDS >= 12
+ movdqa XKEY_A, [KEYS + 11*16]
+
+ AES XDATA0, XKEY_B ; 10. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 12*16]
+
+ AES XDATA0, XKEY_A ; 11. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+%if %%NROUNDS == 14
+ movdqa XKEY_A, [KEYS + 13*16]
+
+ AES XDATA0, XKEY_B ; 12. ENC
+ AES XDATA1, XKEY_B
+ AES XDATA2, XKEY_B
+ AES XDATA3, XKEY_B
+
+ movdqa XKEY_B, [KEYS + 14*16]
+
+ AES XDATA0, XKEY_A ; 13. ENC
+ AES XDATA1, XKEY_A
+ AES XDATA2, XKEY_A
+ AES XDATA3, XKEY_A
+%endif
+
+ AES_LAST XDATA0, XKEY_B ; 10/12/14. ENC (depending on key size)
+ AES_LAST XDATA1, XKEY_B
+ AES_LAST XDATA2, XKEY_B
+ AES_LAST XDATA3, XKEY_B
+
+ movdqu [OUT + IDX + 0*16 - 4*16], XDATA0
+ movdqu [OUT + IDX + 1*16 - 4*16], XDATA1
+ movdqu [OUT + IDX + 2*16 - 4*16], XDATA2
+ movdqu [OUT + IDX + 3*16 - 4*16], XDATA3
+
+ cmp IDX, LEN
+ jne %%main_loop
+
+%%done:
+
+ ret
+
+%endmacro
+
+align 16
+MKGLOBAL(AES_ECB_ENC_128,function,internal)
+AES_ECB_ENC_128:
+
+ AES_ECB 10, ENC
+
+align 16
+MKGLOBAL(AES_ECB_ENC_192,function,internal)
+AES_ECB_ENC_192:
+
+ AES_ECB 12, ENC
+
+align 16
+MKGLOBAL(AES_ECB_ENC_256,function,internal)
+AES_ECB_ENC_256:
+
+ AES_ECB 14, ENC
+
+align 16
+MKGLOBAL(AES_ECB_DEC_128,function,internal)
+AES_ECB_DEC_128:
+
+ AES_ECB 10, DEC
+
+align 16
+MKGLOBAL(AES_ECB_DEC_192,function,internal)
+AES_ECB_DEC_192:
+
+ AES_ECB 12, DEC
+
+align 16
+MKGLOBAL(AES_ECB_DEC_256,function,internal)
+AES_ECB_DEC_256:
+
+ AES_ECB 14, DEC
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif