From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Sun, 21 Apr 2024 13:54:28 +0200
Subject: Adding upstream version 18.2.2.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 .../md5_mb/md5_mb_mgr_submit_avx512.asm            | 283 +++++++++++++++++++++
 1 file changed, 283 insertions(+)
 create mode 100644 src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm

(limited to 'src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm')

diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..1bbc2be2c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm
@@ -0,0 +1,283 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
+;    * Redistributions of source code must retain the above copyright
+;      notice, this list of conditions and the following disclaimer.
+;    * Redistributions in binary form must reproduce the above copyright
+;      notice, this list of conditions and the following disclaimer in
+;      the documentation and/or other materials provided with the
+;      distribution.
+;    * Neither the name of Intel Corporation nor the names of its
+;      contributors may be used to endorse or promote products derived
+;      from this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512	; called by md5_mb_mgr_submit_avx512 with arg1 = state, arg2 = len
+
+[bits 64]
+default rel			; memory references without a base register are RIP-relative
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions (arg1/arg2 arrive in rcx/rdx)
+%define arg1    rcx
+%define arg2    rdx
+; lane: index of the lane assigned to the submitted job
+%define lane	rsi
+
+%else
+; UN*X register definitions (arg1/arg2 arrive in rdi/rsi)
+%define arg1    rdi
+%define arg2    rsi
+; lane: index of the lane assigned to the submitted job
+%define lane	rdx
+
+%endif
+; Common definitions
+; job and len2 share arg2: job is not read again once len2 is written
+%define state   arg1
+%define job     arg2
+%define len2    arg2
+
+; idx needs to be in a register not clobbered by md5_mb_x16x2_avx512
+%define idx	rbp
+; p: scratch register for the job's buffer pointer
+%define p	r11
+; unused_lanes: scratch YMM register handed to MEM_VPSRLDDQ / MEM_VPSLLDDQ
+%define unused_lanes	ymm7
+; job_rax and len share rax: len while submitting, job_rax for the returned job
+%define job_rax	rax
+%define len	rax
+; num_lanes_inuse: working copy of the _num_lanes_inuse counter in state
+%define num_lanes_inuse r9
+; lane_data: address of the selected lane's entry inside state's _ldata
+%define lane_data	r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8 (8 GPR slots + 10 XMM slots + 8 pad) so rsp is 16-byte aligned after the sub
+%define STACK_SPACE	8*8 + 16*10 + 8
+
+;; Shift the 32 bytes at %1 down by one byte in memory (byte 31 becomes 0).
+%macro MEM_VPSRLDDQ 2
+	; %1 = address expression, %2 = scratch YMM register (clobbered)
+	; the 32-byte load starts at [%1 + 1], so it reads one extra byte at [%1 + 32]
+	vmovdqu	%2, [%1 + 1]
+	vmovdqu [%1], %2
+	mov	[%1 + 31], byte 0
+%endmacro
+
+;; Shift the 32 bytes at %1 up by one byte in memory (byte 0 becomes 0).
+%macro MEM_VPSLLDDQ 2
+	; %1 = address expression, %2 = scratch YMM register (clobbered)
+	; the 32-byte load starts at [%1 - 1], so it reads one extra byte before %1
+	vmovdqu	%2, [%1 - 1]
+	vmovdqu [%1], %2
+	mov	[%1], byte 0
+%endmacro
+
+align 64
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : state : rcx (win64) / rdi (UN*X)
+; arg 2 : job   : rdx (win64) / rsi (UN*X)
+mk_global md5_mb_mgr_submit_avx512, function
+md5_mb_mgr_submit_avx512:
+	endbranch
+	; returns in rax: a completed job, or NULL while fewer than 32 lanes are in use
+	sub	rsp, STACK_SPACE
+	; we need to save/restore all GPRs because lower layer clobbers them
+	mov	[rsp + 8*0], rbx
+	mov	[rsp + 8*1], rbp
+	mov	[rsp + 8*2], r12
+	mov	[rsp + 8*3], r13
+	mov	[rsp + 8*4], r14
+	mov	[rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+	mov	[rsp + 8*6], rsi
+	mov	[rsp + 8*7], rdi
+	vmovdqa  [rsp + 8*8 + 16*0], xmm6
+	vmovdqa  [rsp + 8*8 + 16*1], xmm7
+	vmovdqa  [rsp + 8*8 + 16*2], xmm8
+	vmovdqa  [rsp + 8*8 + 16*3], xmm9
+	vmovdqa  [rsp + 8*8 + 16*4], xmm10
+	vmovdqa  [rsp + 8*8 + 16*5], xmm11
+	vmovdqa  [rsp + 8*8 + 16*6], xmm12
+	vmovdqa  [rsp + 8*8 + 16*7], xmm13
+	vmovdqa  [rsp + 8*8 + 16*8], xmm14
+	vmovdqa  [rsp + 8*8 + 16*9], xmm15
+%endif
+	; pop a free lane index: take the low byte of unused_lanes, then shift the list down one byte
+	mov	lane, [state + _unused_lanes]
+	and	lane, 0x3F
+	MEM_VPSRLDDQ (state + _unused_lanes), unused_lanes
+	imul	lane_data, lane, _LANE_DATA_size
+	mov	dword [job + _status], STS_BEING_PROCESSED
+	lea	lane_data, [state + _ldata + lane_data]
+	mov	DWORD(len), [job + _len]
+
+	shl	len, 6	; low 6 bits store the lane index
+	or	len, lane
+
+	mov	[lane_data + _job_in_lane], job
+	mov	[state + _lens + 4*lane], DWORD(len)
+
+	; Scatter the job's 4 digest words: word k of lane n lives at _args_digest + k*(4*16*2) + 4*n
+	vmovdqu	xmm0, [job + _result_digest + 0*16]
+	vmovd	[state + _args_digest + 4*lane + 0*4*16*2], xmm0
+	vpextrd [state + _args_digest + 4*lane + 1*4*16*2], xmm0, 1
+	vpextrd [state + _args_digest + 4*lane + 2*4*16*2], xmm0, 2
+	vpextrd [state + _args_digest + 4*lane + 3*4*16*2], xmm0, 3
+
+	mov	p, [job + _buffer]
+	mov	[state + _args_data_ptr + 8*lane], p
+	; count the new job; hashing only starts once all 32 lanes hold a job
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	add	num_lanes_inuse, 1
+	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+	cmp	num_lanes_inuse, 32
+	jne	return_null
+
+start_loop:
+	; Find min length of lanes 0-15
+	vmovdqu ymm0, [state + _lens + 0*32]
+	vmovdqu ymm1, [state + _lens + 1*32]
+
+	vpminud ymm2, ymm0, ymm1	; ymm2 has {D,C,B,A}
+	vpalignr ymm3, ymm3, ymm2, 8    ; ymm3 has {x,x,D,C}
+	vpminud ymm2, ymm2, ymm3	; ymm2 has {x,x,E,F}
+	vpalignr ymm3, ymm3, ymm2, 4    ; ymm3 has {x,x,x,E}
+	vpminud ymm2, ymm2, ymm3	; ymm2 has min value in low dword
+	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has halves of ymm2 reversed
+	vpminud ymm2, ymm2, ymm3	; ymm2 has min value in low dword
+
+	; Find min length of lanes 16-31
+	vmovdqu ymm5, [state + _lens + 2*32]
+	vmovdqu ymm6, [state + _lens + 3*32]
+
+	vpminud ymm4, ymm5, ymm6	; ymm4 has {D,C,B,A}
+	vpalignr ymm3, ymm3, ymm4, 8    ; ymm3 has {x,x,D,C}
+	vpminud ymm4, ymm4, ymm3	; ymm4 has {x,x,E,F}
+	vpalignr ymm3, ymm3, ymm4, 4    ; ymm3 has {x,x,x,E}
+	vpminud ymm4, ymm4, ymm3	; ymm4 has min value in low dword
+	vperm2i128 ymm3, ymm4, ymm4, 1	; ymm3 has halves of ymm4 reversed
+	vpminud ymm4, ymm4, ymm3	; ymm4 has min value in low dword
+
+	vpminud ymm2, ymm2, ymm4	; ymm2 has min value in low dword
+	vmovd	DWORD(idx), xmm2
+	mov	len2, idx
+	and	idx, 0x3F		; idx = lane index of the shortest job
+	shr	len2, 6			; len2 = its remaining length
+	jz	len_is_0		; nothing left to hash for that lane
+	; drop the lane-index bits, broadcast the min length and subtract it from every lane
+	vpand	ymm2, ymm2, [rel clear_low_6bits]
+	vpshufd ymm2, ymm2, 0
+
+	vpsubd  ymm0, ymm0, ymm2
+	vpsubd  ymm1, ymm1, ymm2
+	vpsubd  ymm5, ymm5, ymm2
+	vpsubd  ymm6, ymm6, ymm2
+
+	vmovdqu [state + _lens + 0*32], ymm0
+	vmovdqu [state + _lens + 1*32], ymm1
+	vmovdqu [state + _lens + 2*32], ymm5
+	vmovdqu [state + _lens + 3*32], ymm6
+
+	; "state" and "args" are the same address, arg1
+	; len is arg2
+	call	md5_mb_x16x2_avx512
+	; state and idx are intact
+
+len_is_0:
+	; process completed job "idx"
+	imul	lane_data, idx, _LANE_DATA_size
+	lea	lane_data, [state + _ldata + lane_data]
+
+	mov	job_rax, [lane_data + _job_in_lane]
+	mov	lane, [state + _unused_lanes]
+	mov	qword [lane_data + _job_in_lane], 0
+	mov	dword [job_rax + _status], STS_COMPLETED
+	; push idx back onto unused_lanes as the new low byte
+	shl	lane, 8
+	or	 lane, idx
+	MEM_VPSLLDDQ	(state + _unused_lanes), unused_lanes
+	mov	[state + _unused_lanes], lane
+
+	mov	DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+	sub	num_lanes_inuse, 1
+	mov	[state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+	; freed lane gets the largest unsigned length value
+	mov	dword [state + _lens + 4*idx], 0xFFFFFFFF
+	; gather the 4 digest words of lane idx back into one XMM register
+	vmovd	 xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+	vpinsrd  xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+	; unaligned store, matching the vmovdqu load of this field above: no job alignment is assumed
+	vmovdqu  [job_rax + _result_digest + 0*16], xmm0
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+	vmovdqa  xmm6, [rsp + 8*8 + 16*0]
+	vmovdqa  xmm7, [rsp + 8*8 + 16*1]
+	vmovdqa  xmm8, [rsp + 8*8 + 16*2]
+	vmovdqa  xmm9, [rsp + 8*8 + 16*3]
+	vmovdqa  xmm10, [rsp + 8*8 + 16*4]
+	vmovdqa  xmm11, [rsp + 8*8 + 16*5]
+	vmovdqa  xmm12, [rsp + 8*8 + 16*6]
+	vmovdqa  xmm13, [rsp + 8*8 + 16*7]
+	vmovdqa  xmm14, [rsp + 8*8 + 16*8]
+	vmovdqa  xmm15, [rsp + 8*8 + 16*9]
+	mov	rsi, [rsp + 8*6]
+	mov	rdi, [rsp + 8*7]
+%endif
+	mov	rbx, [rsp + 8*0]
+	mov	rbp, [rsp + 8*1]
+	mov	r12, [rsp + 8*2]
+	mov	r13, [rsp + 8*3]
+	mov	r14, [rsp + 8*4]
+	mov	r15, [rsp + 8*5]
+
+	add	rsp, STACK_SPACE
+
+	ret
+
+return_null:
+	xor	job_rax, job_rax	; no job completed: return NULL
+	jmp	return
+
+
+section .data align=32
+; 32-byte mask: in each 128-bit half, keep dword 0 with its low 6 bits cleared and zero the other dwords
+align 32
+clear_low_6bits:
+	dq 0x00000000FFFFFFC0, 0x0000000000000000
+	dq 0x00000000FFFFFFC0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_submit_avx512	; placeholder symbol emitted when HAVE_AS_KNOWS_AVX512 is not defined
+no_md5_mb_mgr_submit_avx512:		; NOTE(review): presumably keeps the win64 object from being empty -- confirm
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
-- 
cgit v1.2.3