;;
;; Copyright (c) 2012-2018, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%include "os.asm"
%include "job_aes_hmac.asm"
%include "mb_mgr_datastruct.asm"
%include "reg_sizes.asm"
%include "memcpy.asm"

%ifndef AES_XCBC_X8
%define AES_XCBC_X8 aes_xcbc_mac_128_x8
%define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx
%endif

; void AES_XCBC_X8(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes);
extern AES_XCBC_X8

section .data
default rel

align 16
dupw:   ;ddq 0x01000100010001000100010001000100
        dq 0x0100010001000100, 0x0100010001000100
x80:    ;ddq 0x00000000000000000000000000000080
        dq 0x0000000000000080, 0x0000000000000000

section .text

%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%else
%define arg1    rcx
%define arg2    rdx
%endif

%define state   arg1
%define job     arg2
%define len2    arg2

%define job_rax         rax

%if 1
; idx needs to be in rbp
%define len             r11
%define idx             rbp
%define tmp2            rbp
%define tmp             r14

%define lane            r8
%define icv             r9
%define p2              r9

%define last_len        r10

%define lane_data       r12
%define p               r13

%define unused_lanes    rbx
%endif

; STACK_SPACE needs to be an odd multiple of 8
; This routine and its callee clobber all GPRs
struc STACK
_gpr_save:      resq    8
_rsp_save:      resq    1
endstruc

; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job)
; arg 1 : state
; arg 2 : job
MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal)
SUBMIT_JOB_AES_XCBC:

        mov     rax, rsp
        sub     rsp, STACK_size
        and     rsp, -16

        mov     [rsp + _gpr_save + 8*0], rbx
        mov     [rsp + _gpr_save + 8*1], rbp
        mov     [rsp + _gpr_save + 8*2], r12
        mov     [rsp + _gpr_save + 8*3], r13
        mov     [rsp + _gpr_save + 8*4], r14
        mov     [rsp + _gpr_save + 8*5], r15
%ifndef LINUX
        mov     [rsp + _gpr_save + 8*6], rsi
        mov     [rsp + _gpr_save + 8*7], rdi
%endif
        mov     [rsp + _rsp_save], rax  ; original SP

        mov     unused_lanes, [state + _aes_xcbc_unused_lanes]
        mov     lane, unused_lanes
        and     lane, 0xF               ; pop the next free lane index
        shr     unused_lanes, 4
        imul    lane_data, lane, _XCBC_LANE_DATA_size
        lea     lane_data, [state + _aes_xcbc_ldata + lane_data]
        mov     len, [job + _msg_len_to_hash_in_bytes]
        mov     [state + _aes_xcbc_unused_lanes], unused_lanes
        mov     [lane_data + _xcbc_job_in_lane], job
        mov     dword [lane_data + _xcbc_final_done], 0
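        ;; Informative note (added; not in the original source): AES-XCBC
        ;; (RFC 3566) works with three derived keys. K1 is the expanded AES
        ;; key schedule used for the per-block CBC-MAC passes; K2 and K3 are
        ;; 16-byte whitening keys XORed into the final block: K2 when the
        ;; message is a whole number of 16-byte blocks (fast_copy below), K3
        ;; when the last block is partial and padded (slow_copy below). The
        ;; next instructions install the K1 key schedule for this lane.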
        mov     tmp, [job + _k1_expanded]
        mov     [state + _aes_xcbc_args_keys + lane*8], tmp
        mov     p, [job + _src]
        add     p, [job + _hash_start_src_offset_in_bytes]

        mov     last_len, len

        cmp     len, 16
        jle     small_buffer

        mov     [state + _aes_xcbc_args_in + lane*8], p
        add     p, len          ; set pointer to end of data

        and     last_len, 15    ; check lsbs of msg len
        jnz     slow_copy       ; if not a 16B multiple, do slow copy

fast_copy:
        vmovdqu xmm0, [p - 16]          ; load last block M[n]
        mov     tmp, [job + _k2]        ; load K2 address
        vmovdqu xmm1, [tmp]             ; load K2
        vpxor   xmm0, xmm0, xmm1        ; M[n] XOR K2
        vmovdqa [lane_data + _xcbc_final_block], xmm0
        sub     len, 16                 ; take last block off length
end_fast_copy:
        mov     [state + _aes_xcbc_lens + 2*lane], WORD(len)

        vpxor   xmm0, xmm0, xmm0
        shl     lane, 4                 ; multiply by 16
        vmovdqa [state + _aes_xcbc_args_ICV + lane], xmm0

        cmp     unused_lanes, 0xf
        jne     return_null

start_loop:
        ; Find min length
        vmovdqa xmm0, [state + _aes_xcbc_lens]
        vphminposuw xmm1, xmm0
        vpextrw DWORD(len2), xmm1, 0    ; min value
        vpextrw DWORD(idx), xmm1, 1     ; min index (0...7)
        cmp     len2, 0
        je      len_is_0

        vpshufb xmm1, xmm1, [rel dupw]  ; duplicate words across all lanes
        vpsubw  xmm0, xmm0, xmm1
        vmovdqa [state + _aes_xcbc_lens], xmm0

        ; "state" and "args" are the same address, arg1
        ; len is arg2
        call    AES_XCBC_X8
        ; state and idx are intact

len_is_0:
        ; process completed job "idx"
        imul    lane_data, idx, _XCBC_LANE_DATA_size
        lea     lane_data, [state + _aes_xcbc_ldata + lane_data]
        cmp     dword [lane_data + _xcbc_final_done], 0
        jne     end_loop

        mov     dword [lane_data + _xcbc_final_done], 1
        mov     word [state + _aes_xcbc_lens + 2*idx], 16
        lea     tmp, [lane_data + _xcbc_final_block]
        mov     [state + _aes_xcbc_args_in + 8*idx], tmp
        jmp     start_loop

end_loop:
        ; process completed job "idx"
        mov     job_rax, [lane_data + _xcbc_job_in_lane]
        mov     icv, [job_rax + _auth_tag_output]
        mov     unused_lanes, [state + _aes_xcbc_unused_lanes]
        mov     qword [lane_data + _xcbc_job_in_lane], 0
        or      dword [job_rax + _status], STS_COMPLETED_HMAC
        shl     unused_lanes, 4
        or      unused_lanes, idx
        shl     idx, 4                  ; multiply by 16
        mov     [state + _aes_xcbc_unused_lanes], unused_lanes

        ; copy 12 bytes (96-bit truncated ICV)
        vmovdqa xmm0, [state + _aes_xcbc_args_ICV + idx]
        vmovq   [icv], xmm0
        vpextrd [icv + 8], xmm0, 2

return:
        mov     rbx, [rsp + _gpr_save + 8*0]
        mov     rbp, [rsp + _gpr_save + 8*1]
        mov     r12, [rsp + _gpr_save + 8*2]
        mov     r13, [rsp + _gpr_save + 8*3]
        mov     r14, [rsp + _gpr_save + 8*4]
        mov     r15, [rsp + _gpr_save + 8*5]
%ifndef LINUX
        mov     rsi, [rsp + _gpr_save + 8*6]
        mov     rdi, [rsp + _gpr_save + 8*7]
%endif
        mov     rsp, [rsp + _rsp_save]  ; original SP

        ret

small_buffer:
        ; For buffers <= 16 bytes
        ; The input data is set to the final block
        lea     tmp, [lane_data + _xcbc_final_block]    ; final block
        mov     [state + _aes_xcbc_args_in + lane*8], tmp
        add     p, len          ; set pointer to end of data
        cmp     len, 16
        je      fast_copy

slow_copy:
        and     len, ~15        ; take final block off len
        sub     p, last_len     ; adjust data pointer
        lea     p2, [lane_data + _xcbc_final_block + 16] ; upper part of final
        sub     p2, last_len    ; adjust data pointer backwards
        memcpy_avx_16_1 p2, p, last_len, tmp, tmp2
        vmovdqa xmm0, [rel x80]                         ; fill reg with padding
        vmovdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding
        vmovdqu xmm0, [p2]              ; load final block to process
        mov     tmp, [job + _k3]        ; load K3 address
        vmovdqu xmm1, [tmp]             ; load K3
        vpxor   xmm0, xmm0, xmm1        ; M[n] XOR K3
        vmovdqu [lane_data + _xcbc_final_block], xmm0   ; write final block
        jmp     end_fast_copy

return_null:
        xor     job_rax, job_rax
        jmp     return

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
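
;; Appendix (informative comment added to this listing; not part of the
;; original file): a worked example of the lane scheduler in start_loop,
;; assuming the eight per-lane byte counts sit in _aes_xcbc_lens as 16-bit
;; words. A single SSE4.1 VPHMINPOSUW finds both the smallest count and its
;; lane index (lowest index wins on ties):
;;
;;      lens = [ 48, 16, 80, 16, 32, 64, 16, 96 ]      ; words 0..7
;;      vphminposuw -> min = 16, idx = 1
;;      vpshufb with [rel dupw] broadcasts 16 to all eight words
;;      vpsubw      -> lens = [ 32, 0, 64, 0, 16, 48, 0, 80 ]
;;
;; AES_XCBC_X8 is then called for exactly 16 bytes on every lane, which is
;; guaranteed to complete lane 1 (the shortest job) without overrunning any
;; other lane's remaining data; the completed job is retired in len_is_0 /
;; end_loop and its lane index is pushed back onto the nibble-packed
;; unused_lanes stack.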