/**********************************************************************
  Copyright(c) 2021 Arm Corporation All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Arm Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

    .arch armv8-a

// F = (D ^ (B & (C ^ D)))
.macro FUNC_F0
    eor     VF.16b, VC.16b, VD.16b
    and     VF.16b, VB.16b, VF.16b
    eor     VF.16b, VD.16b, VF.16b
.endm

// F = (B ^ C ^ D)
.macro FUNC_F1
    eor     VF.16b, VB.16b, VC.16b
    eor     VF.16b, VF.16b, VD.16b
.endm

// F = ((B & C) | (B & D) | (C & D))
.macro FUNC_F2
    and     vT0.16b, VB.16b, VC.16b
    and     vT1.16b, VB.16b, VD.16b
    and     vT2.16b, VC.16b, VD.16b
    orr     VF.16b, vT0.16b, vT1.16b
    orr     VF.16b, VF.16b, vT2.16b
.endm

// F = (B ^ C ^ D)
.macro FUNC_F3
    FUNC_F1
.endm

.altmacro
.macro load_next_word windex
    .if \windex < 16
        load_x4_word \windex
    .endif
.endm

// FUNC_F0 is merged into STEP_00_15 for efficiency
.macro SHA1_STEP_00_15_F0 windex:req
    rev32   WORD\windex\().16b, WORD\windex\().16b
    next_word=\windex+1
    load_next_word %next_word
    // e = (a leftrotate 5) + f + e + k + w[i]
    ushr    VT.4s, VA.4s, 32 - 5
    add     VE.4s, VE.4s, VK.4s
    sli     VT.4s, VA.4s, 5
    eor     VF.16b, VC.16b, VD.16b
    add     VE.4s, VE.4s, WORD\windex\().4s
    and     VF.16b, VB.16b, VF.16b
    add     VE.4s, VE.4s, VT.4s
    eor     VF.16b, VD.16b, VF.16b
    ushr    VT.4s, VB.4s, 32 - 30
    add     VE.4s, VE.4s, VF.4s
    sli     VT.4s, VB.4s, 30
.endm

.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
    eor     vT0.16b, \reg_3\().16b, \reg_8\().16b
    eor     VT.16b, \reg_14\().16b, \reg_16\().16b
    eor     vT0.16b, vT0.16b, VT.16b
    // e = (a leftrotate 5) + f + e + k + w[i]
    ushr    VT.4s, vT0.4s, 32 - 1
    add     VE.4s, VE.4s, VK.4s
    ushr    vT1.4s, VA.4s, 32 - 5
    sli     VT.4s, vT0.4s, 1
    add     VE.4s, VE.4s, VT.4s
    sli     vT1.4s, VA.4s, 5
    mov     \reg_16\().16b, VT.16b
    add     VE.4s, VE.4s, vT1.4s
    ushr    VT.4s, VB.4s, 32 - 30
    \func_f
    add     VE.4s, VE.4s, VF.4s
    sli     VT.4s, VB.4s, 30
.endm
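// Vector register allocation used by the macros in this file. Each of the
// four 32-bit lanes carries an independent SHA-1 state; load_x4_word and the
// KEY_0..KEY_3 tables are defined outside this section and are assumed to
// supply, respectively, message word \windex for all four lanes and the four
// round constants.
//   VA-VE       - working variables a..e
//   VT, VF      - temporaries: VT holds rotate results, VF holds F(B,C,D)
//   VK          - round constant for the current group of 20 steps
//   WORD0-15    - 16-entry message schedule ring buffer, indexed modulo 16
//   vT0-vT2     - scratch for FUNC_F2 and the schedule update
//   vAA-vEE     - copy of the state at block start, added back in sha1_single
//   TT          - spare alias used while SWAP_STATES rotates the other aliases
//   sha1key_adr - pointer used to load the round-constant tables
// Throughout, an ushr by (32 - n) followed by sli by n implements a per-lane
// rotate-left by n bits.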
VA          .req    v0
VB          .req    v1
VC          .req    v2
VD          .req    v3
VE          .req    v4
VT          .req    v5
VF          .req    v6
VK          .req    v7
WORD0       .req    v8
WORD1       .req    v9
WORD2       .req    v10
WORD3       .req    v11
WORD4       .req    v12
WORD5       .req    v13
WORD6       .req    v14
WORD7       .req    v15
WORD8       .req    v16
WORD9       .req    v17
WORD10      .req    v18
WORD11      .req    v19
WORD12      .req    v20
WORD13      .req    v21
WORD14      .req    v22
WORD15      .req    v23
vT0         .req    v24
vT1         .req    v25
vT2         .req    v26
vAA         .req    v27
vBB         .req    v28
vCC         .req    v29
vDD         .req    v30
vEE         .req    v31
TT          .req    v0
sha1key_adr .req    x15

.macro SWAP_STATES
    // each step leaves the new 'a' in VE and the shifted VB (rol32(b,30)) in
    // VT, so rotate the register aliases instead of moving any data
    .unreq TT
    TT .req VE
    .unreq VE
    VE .req VD
    .unreq VD
    VD .req VC
    .unreq VC
    VC .req VT
    .unreq VT
    VT .req VB
    .unreq VB
    VB .req VA
    .unreq VA
    VA .req TT
.endm

.altmacro
.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
    SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
.endm

.macro exec_step windex:req
    .if \windex <= 15
        SHA1_STEP_00_15_F0 \windex
    .else
        idx14=((\windex - 14) & 15)
        idx8=((\windex - 8) & 15)
        idx3=((\windex - 3) & 15)
        idx16=(\windex & 15)
        .if \windex <= 19
            SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
        .endif
        .if \windex >= 20 && \windex <= 39
            SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
        .endif
        .if \windex >= 40 && \windex <= 59
            SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
        .endif
        .if \windex >= 60 && \windex <= 79
            SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
        .endif
    .endif

    SWAP_STATES

    .if \windex == 79
        // after 80 steps the register aliases ABCDET have shifted from
        // their original order of 012345 to 341520; move the state back
        // into v0-v4 and rebind the aliases, for both compile-time and
        // run-time correctness
        mov     v0.16b, v3.16b
        .unreq VA
        VA .req v0
        mov     vT0.16b, v2.16b
        mov     v2.16b, v1.16b
        mov     v1.16b, v4.16b
        .unreq VB
        VB .req v1
        .unreq VC
        VC .req v2
        mov     v3.16b, v5.16b
        .unreq VD
        VD .req v3
        mov     v4.16b, vT0.16b
        .unreq VE
        VE .req v4
        .unreq VT
        VT .req v5
    .endif
.endm

.macro exec_steps idx:req,more:vararg
    exec_step \idx
    .ifnb \more
        exec_steps \more
    .endif
.endm

.macro sha1_single
    load_x4_word 0

    // save the incoming state so it can be added back after 80 steps
    mov     vAA.16b, VA.16b
    mov     vBB.16b, VB.16b
    mov     vCC.16b, VC.16b
    mov     vDD.16b, VD.16b
    mov     vEE.16b, VE.16b

    // 0 ~ 19
    adr     sha1key_adr, KEY_0
    ld1     {VK.4s}, [sha1key_adr]
    exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19

    // 20 ~ 39
    adr     sha1key_adr, KEY_1
    ld1     {VK.4s}, [sha1key_adr]
    exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39

    // 40 ~ 59
    adr     sha1key_adr, KEY_2
    ld1     {VK.4s}, [sha1key_adr]
    exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59

    // 60 ~ 79
    adr     sha1key_adr, KEY_3
    ld1     {VK.4s}, [sha1key_adr]
    exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79

    add     VA.4s, vAA.4s, VA.4s
    add     VB.4s, vBB.4s, VB.4s
    add     VC.4s, vCC.4s, VC.4s
    add     VD.4s, vDD.4s, VD.4s
    add     VE.4s, vEE.4s, VE.4s
.endm

// WORD0-WORD7 live in v8-v15, whose low halves (d8-d15) are callee-saved
// under the AAPCS64, so spill and reload them around the hash computation
.macro sha1_asimd_save_stack
    stp     d8, d9, [sp, -64]!
    stp     d10, d11, [sp, 16]
    stp     d12, d13, [sp, 32]
    stp     d14, d15, [sp, 48]
.endm

.macro sha1_asimd_restore_stack
    ldp     d10, d11, [sp, 16]
    ldp     d12, d13, [sp, 32]
    ldp     d14, d15, [sp, 48]
    ldp     d8, d9, [sp], 64
.endm
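// Reference for the vectorized rounds above (documentation only): per step,
// each 32-bit lane computes the scalar SHA-1 recurrence
//
//     t = rol32(a, 5) + F(b, c, d) + e + K + w[i]
//     e = d;  d = c;  c = rol32(b, 30);  b = a;  a = t
//
// with F and K switching every 20 steps (FUNC_F0..FUNC_F3, KEY_0..KEY_3) and,
// for i >= 16, w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1) kept in
// the 16-entry WORD ring. SWAP_STATES performs the b/c/d/e shift purely by
// renaming register aliases, so no data moves between steps.
//
// Callers are expected to bracket any use of sha1_single with
// sha1_asimd_save_stack / sha1_asimd_restore_stack; that wrapping, along with
// load_x4_word and the KEY_0..KEY_3 tables, is assumed to be provided by the
// code that includes this file, which is not part of this section.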