summaryrefslogtreecommitdiffstats
path: root/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm
diff options
context:
space:
mode:
Diffstat (limited to 'src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm')
-rw-r--r--src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm303
1 files changed, 303 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm
new file mode 100644
index 000000000..afbb38512
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm
@@ -0,0 +1,303 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;;; routine to do 128 bit AES XCBC
+;;; process 4 buffers at a time, single data structure as input
+;;; Updates In pointer at end
+
+;; clobbers all registers except for ARG1 and rbp
+
+%include "include/os.asm"
+%include "mb_mgr_datastruct.asm"
+
+
+%ifndef AES_XCBC_X4
+%define AES_XCBC_X4 aes_xcbc_mac_128_x4
+%endif
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2
+ MOVDQ XTMP, %2
+ pxor %1, XTMP
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; struct AES_XCBC_ARGS_x8 {
+;; void* in[8];
+;; UINT128* keys[8];
+;; UINT128 ICV[8];
+;; }
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; void aes_xcbc_mac_128_x4(AES_XCBC_ARGS_x8 *args, UINT64 len);
+;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure
+;; arg 2: LEN : len (in units of bytes)
+
+%ifdef LINUX
+%define ARG rdi
+%define LEN rsi
+%define REG3 rcx
+%define REG4 rdx
+%else
+%define ARG rcx
+%define LEN rdx
+%define REG3 rsi
+%define REG4 rdi
+%endif
+
+%define IDX rax
+
+%define IN0 r8
+%define KEYS0 rbx
+%define OUT0 r9
+
+%define IN1 r10
+%define KEYS1 REG3
+%define OUT1 r11
+
+%define IN2 r12
+%define KEYS2 REG4
+%define OUT2 r13
+
+%define IN3 r14
+%define KEYS3 rbp
+%define OUT3 r15
+
+
+%define XDATA0 xmm0
+%define XDATA1 xmm1
+%define XDATA2 xmm2
+%define XDATA3 xmm3
+
+%define XKEY0_3 xmm4
+%define XKEY0_6 [KEYS0 + 16*6]
+%define XTMP xmm5
+%define XKEY0_9 xmm6
+
+%define XKEY1_3 xmm7
+%define XKEY1_6 xmm8
+%define XKEY1_9 xmm9
+
+%define XKEY2_3 xmm10
+%define XKEY2_6 xmm11
+%define XKEY2_9 xmm12
+
+%define XKEY3_3 xmm13
+%define XKEY3_6 xmm14
+%define XKEY3_9 xmm15
+
+section .text
+
+MKGLOBAL(AES_XCBC_X4,function,internal)
+AES_XCBC_X4:
+
+ push rbp
+
+ mov IDX, 16
+
+ mov IN0, [ARG + _aesxcbcarg_in + 8*0]
+ mov IN1, [ARG + _aesxcbcarg_in + 8*1]
+ mov IN2, [ARG + _aesxcbcarg_in + 8*2]
+ mov IN3, [ARG + _aesxcbcarg_in + 8*3]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ MOVDQ XDATA0, [IN0] ; load first block of plain text
+ MOVDQ XDATA1, [IN1] ; load first block of plain text
+ MOVDQ XDATA2, [IN2] ; load first block of plain text
+ MOVDQ XDATA3, [IN3] ; load first block of plain text
+
+ mov KEYS0, [ARG + _aesxcbcarg_keys + 8*0]
+ mov KEYS1, [ARG + _aesxcbcarg_keys + 8*1]
+ mov KEYS2, [ARG + _aesxcbcarg_keys + 8*2]
+ mov KEYS3, [ARG + _aesxcbcarg_keys + 8*3]
+
+ pxor XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV
+ pxor XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV
+ pxor XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV
+ pxor XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV
+
+ pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK
+
+ aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+
+ movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key
+ movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key
+ movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key
+ movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key
+
+ aesenc XDATA0, XKEY0_3 ; 3. ENC
+ aesenc XDATA1, XKEY1_3 ; 3. ENC
+ aesenc XDATA2, XKEY2_3 ; 3. ENC
+ aesenc XDATA3, XKEY3_3 ; 3. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC
+ aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC
+ aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+
+ movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key
+ movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key
+ movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key
+
+ aesenc XDATA0, XKEY0_6 ; 6. ENC
+ aesenc XDATA1, XKEY1_6 ; 6. ENC
+ aesenc XDATA2, XKEY2_6 ; 6. ENC
+ aesenc XDATA3, XKEY3_6 ; 6. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+
+ movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key
+ movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key
+ movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key
+ movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key
+
+ aesenc XDATA0, XKEY0_9 ; 9. ENC
+ aesenc XDATA1, XKEY1_9 ; 9. ENC
+ aesenc XDATA2, XKEY2_9 ; 9. ENC
+ aesenc XDATA3, XKEY3_9 ; 9. ENC
+
+ aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+
+ cmp LEN, IDX
+ je done
+
+main_loop:
+ pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR ICV
+ pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR ICV
+ pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR ICV
+ pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR ICV
+
+ pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK
+ pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK
+ pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK
+ pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK
+
+ aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC
+ aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC
+ aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC
+ aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC
+ aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC
+ aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC
+ aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC
+
+ aesenc XDATA0, XKEY0_3 ; 3. ENC
+ aesenc XDATA1, XKEY1_3 ; 3. ENC
+ aesenc XDATA2, XKEY2_3 ; 3. ENC
+ aesenc XDATA3, XKEY3_3 ; 3. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC
+ aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC
+ aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC
+ aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC
+ aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC
+ aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC
+ aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC
+
+ aesenc XDATA0, XKEY0_6 ; 6. ENC
+ aesenc XDATA1, XKEY1_6 ; 6. ENC
+ aesenc XDATA2, XKEY2_6 ; 6. ENC
+ aesenc XDATA3, XKEY3_6 ; 6. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC
+ aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC
+ aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC
+ aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC
+
+ aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC
+ aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC
+ aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC
+ aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC
+
+ aesenc XDATA0, XKEY0_9 ; 9. ENC
+ aesenc XDATA1, XKEY1_9 ; 9. ENC
+ aesenc XDATA2, XKEY2_9 ; 9. ENC
+ aesenc XDATA3, XKEY3_9 ; 9. ENC
+
+ aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC
+ aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC
+ aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC
+ aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC
+
+ add IDX, 16
+ cmp LEN, IDX
+ jne main_loop
+
+done:
+ ;; update ICV
+ movdqa [ARG + _aesxcbcarg_ICV + 16*0], XDATA0
+ movdqa [ARG + _aesxcbcarg_ICV + 16*1], XDATA1
+ movdqa [ARG + _aesxcbcarg_ICV + 16*2], XDATA2
+ movdqa [ARG + _aesxcbcarg_ICV + 16*3], XDATA3
+
+ ;; update IN
+ add IN0, LEN
+ mov [ARG + _aesxcbcarg_in + 8*0], IN0
+ add IN1, LEN
+ mov [ARG + _aesxcbcarg_in + 8*1], IN1
+ add IN2, LEN
+ mov [ARG + _aesxcbcarg_in + 8*2], IN2
+ add IN3, LEN
+ mov [ARG + _aesxcbcarg_in + 8*3], IN3
+
+ pop rbp
+
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif