From 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sat, 27 Apr 2024 20:24:20 +0200
Subject: Adding upstream version 14.2.21.

Signed-off-by: Daniel Baumann
---
 .../intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm | 331 +++++++++++++++++++++
 1 file changed, 331 insertions(+)
 create mode 100644 src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm

(limited to 'src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm')

diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm
new file mode 100644
index 00000000..3df1f74f
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm
@@ -0,0 +1,331 @@
+;;
+;; Copyright (c) 2012-2018, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "os.asm"
+%include "memcpy.asm"
+
+; routine to do AES128 CNTR enc/decrypt "by4"
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4
+
+%define CONCAT(a,b) a %+ b
+%define MOVDQ movdqu
+
+%define xdata0		xmm0
+%define xdata1		xmm1
+%define xdata2		xmm2
+%define xdata3		xmm3
+%define xdata4		xmm4
+%define xdata5		xmm5
+%define xdata6		xmm6
+%define xdata7		xmm7
+%define xcounter	xmm8
+%define xbyteswap	xmm9
+%define xkey0		xmm10
+%define xkey3		xmm11
+%define xkey6		xmm12
+%define xkey9		xmm13
+%define xkeyA		xmm14
+%define xkeyB		xmm15
+
+%ifdef LINUX
+%define p_in		rdi
+%define p_IV		rsi
+%define p_keys		rdx
+%define p_out		rcx
+%define num_bytes	r8
+%define p_ivlen		r9
+%else
+%define p_in		rcx
+%define p_IV		rdx
+%define p_keys		r8
+%define p_out		r9
+%define num_bytes	r10
+%define p_ivlen		qword [rsp + 8*6]
+%endif
+
+%define p_tmp		rsp + _buffer
+%define tmp		r11
+
+%macro do_aes_load 1
+	do_aes %1, 1
+%endmacro
+
+%macro do_aes_noload 1
+	do_aes %1, 0
+%endmacro
+
+; do_aes num_in_par load_keys
+; This increments p_in, but not p_out
+%macro do_aes 2
+%define %%by %1
+%define %%load_keys %2
+
+%if (%%load_keys)
+	movdqa	xkey0, [p_keys + 0*16]
+%endif
+
+	movdqa	xdata0, xcounter
+	pshufb	xdata0, xbyteswap
+%assign i 1
+%rep (%%by - 1)
+	movdqa	CONCAT(xdata,i), xcounter
+	paddd	CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)]
+	pshufb	CONCAT(xdata,i), xbyteswap
+%assign i (i + 1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 1*16]
+
+	pxor	xdata0, xkey0
+	paddd	xcounter, [rel CONCAT(ddq_add_,%%by)]
+%assign i 1
+%rep (%%by - 1)
+	pxor	CONCAT(xdata,i), xkey0
+%assign i (i + 1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 2*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 1
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+	movdqa	xkey3, [p_keys + 3*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 2
+%assign i (i+1)
+%endrep
+
+	add	p_in, 16*%%by
+
+	movdqa	xkeyB, [p_keys + 4*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey3		; key 3
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 5*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 4
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+	movdqa	xkey6, [p_keys + 6*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 5
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyA, [p_keys + 7*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey6		; key 6
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 8*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyA		; key 7
+%assign i (i+1)
+%endrep
+
+%if (%%load_keys)
+	movdqa	xkey9, [p_keys + 9*16]
+%endif
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkeyB		; key 8
+%assign i (i+1)
+%endrep
+
+	movdqa	xkeyB, [p_keys + 10*16]
+%assign i 0
+%rep %%by
+	aesenc	CONCAT(xdata,i), xkey9		; key 9
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep %%by
+	aesenclast	CONCAT(xdata,i), xkeyB	; key 10
+%assign i (i+1)
+%endrep
+
+%assign i 0
+%rep (%%by / 2)
+%assign j (i+1)
+	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
+	MOVDQ	xkeyB, [p_in + j*16 - 16*%%by]
+	pxor	CONCAT(xdata,i), xkeyA
+	pxor	CONCAT(xdata,j), xkeyB
+%assign i (i+2)
+%endrep
+%if (i < %%by)
+	MOVDQ	xkeyA, [p_in + i*16 - 16*%%by]
+	pxor	CONCAT(xdata,i), xkeyA
+%endif
+
+%assign i 0
+%rep %%by
+	MOVDQ	[p_out + i*16], CONCAT(xdata,i)
+%assign i (i+1)
+%endrep
+%endmacro
+
+struc STACK
+_buffer:	resq	2
+_rsp_save:	resq	1
+endstruc
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+
+;; aes_cntr_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len)
+align 32
+MKGLOBAL(aes_cntr_128_sse,function,internal)
+aes_cntr_128_sse:
+
+%ifndef LINUX
+	mov	num_bytes, [rsp + 8*5]	; arg5
+%endif
+
+	movdqa	xbyteswap, [rel byteswap_const]
+	test	p_ivlen, 16
+	jnz	iv_is_16_bytes
+	; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001
+	mov	DWORD(tmp), 0x01000000
+	pinsrq	xcounter, [p_IV], 0
+	pinsrd	xcounter, [p_IV + 8], 2
+	pinsrd	xcounter, DWORD(tmp), 3
+bswap_iv:
+	pshufb	xcounter, xbyteswap
+
+	mov	tmp, num_bytes
+	and	tmp, 3*16
+	jz	chk		; x4 > or < 15 (not 3 lines)
+
+	; 1 <= tmp <= 3
+	cmp	tmp, 2*16
+	jg	eq3
+	je	eq2
+eq1:
+	do_aes_load	1	; 1 block
+	add	p_out, 1*16
+	jmp	chk
+
+eq2:
+	do_aes_load	2	; 2 blocks
+	add	p_out, 2*16
+	jmp	chk
+
+eq3:
+	do_aes_load	3	; 3 blocks
+	add	p_out, 3*16
+	; fall through to chk
+chk:
+	and	num_bytes, ~(3*16)
+	jz	do_return2
+	cmp	num_bytes, 16
+	jb	last
+
+	; process multiples of 4 blocks
+	movdqa	xkey0, [p_keys + 0*16]
+	movdqa	xkey3, [p_keys + 3*16]
+	movdqa	xkey6, [p_keys + 6*16]
+	movdqa	xkey9, [p_keys + 9*16]
+	jmp	main_loop2
+
+align 32
+main_loop2:
+	; num_bytes is a multiple of 4 blocks + partial bytes
+	do_aes_noload	4
+	add	p_out, 4*16
+	sub	num_bytes, 4*16
+	cmp	num_bytes, 4*16
+	jae	main_loop2
+
+	test	num_bytes, 15	; partial bytes to be processed?
+	jnz	last
+
+do_return2:
+	; don't return updated IV
+;	pshufb	xcounter, xbyteswap
+;	movdqu	[p_IV], xcounter
+	ret
+
+last:
+	;; Code dealing with the partial block cases
+	; reserve 16 byte aligned buffer on the stack
+	mov	rax, rsp
+	sub	rsp, STACK_size
+	and	rsp, -16
+	mov	[rsp + _rsp_save], rax	; save SP
+
+	; copy input bytes into scratch buffer
+	memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax
+	; Encryption of a single partial block (p_tmp)
+	pshufb	xcounter, xbyteswap
+	movdqa	xdata0, xcounter
+	pxor	xdata0, [p_keys + 16*0]
+%assign i 1
+%rep 9
+	aesenc	xdata0, [p_keys + 16*i]
+%assign i (i+1)
+%endrep
+	; created keystream
+	aesenclast xdata0, [p_keys + 16*i]
+	; xor keystream with the message (scratch)
+	pxor	xdata0, [p_tmp]
+	movdqa	[p_tmp], xdata0
+	; copy result into the output buffer
+	memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax
+	; remove the stack frame
+	mov	rsp, [rsp + _rsp_save]	; original SP
+	jmp	do_return2
+
+iv_is_16_bytes:
+	; Read 16 byte IV: Nonce + ESP IV + block counter (BE)
+	movdqu	xcounter, [p_IV]
+	jmp	bswap_iv
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
--
cgit v1.2.3
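For reference, a minimal C sketch of the counter-block handling performed above (this is not part of the upstream .asm file; the helper names build_ctr_block and ctr_increment are illustrative only). In the 12-byte IV path, the nonce and ESP IV fill bytes 0..11 of the counter block and the block counter starts at big-endian 0x00000001 in bytes 12..15; the routine then advances that counter per block, which the SSE code achieves by byte-swapping xcounter once with pshufb and adding the ddq_add_N constants with paddd. The prototype mirrors the comment above the aes_cntr_128_sse label; since the symbol is built as library-internal, it is shown only to document the argument order.

#include <stdint.h>
#include <string.h>

/* Prototype as given in the .asm comment (UINT64 written as uint64_t here). */
void aes_cntr_128_sse(void *in, void *IV, void *keys, void *out,
                      uint64_t num_bytes, uint64_t iv_len);

/* 12-byte IV case: nonce + ESP IV occupy bytes 0..11, block counter starts
 * at big-endian 0x00000001 in bytes 12..15, matching the pinsrq/pinsrd setup
 * before the bswap_iv label. Illustrative helper, not part of the library. */
static void build_ctr_block(uint8_t block[16], const uint8_t iv[12])
{
    memcpy(block, iv, 12);
    block[12] = 0x00;
    block[13] = 0x00;
    block[14] = 0x00;
    block[15] = 0x01;
}

/* Big-endian increment of the 32-bit counter field; the assembly gets the
 * same effect by byte-swapping xcounter once (pshufb) and then adding the
 * ddq_add_N constants with paddd. Illustrative helper only. */
static void ctr_increment(uint8_t block[16], uint32_t nblocks)
{
    uint32_t ctr = ((uint32_t)block[12] << 24) | ((uint32_t)block[13] << 16) |
                   ((uint32_t)block[14] << 8)  |  (uint32_t)block[15];
    ctr += nblocks;
    block[12] = (uint8_t)(ctr >> 24);
    block[13] = (uint8_t)(ctr >> 16);
    block[14] = (uint8_t)(ctr >> 8);
    block[15] = (uint8_t)ctr;
}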