; Provenance (kept from the patch wrapper this text was extracted from):
;   commit 36d22d82aa202bb199967e9512281e9a53db42c9
;   From: Daniel Baumann, Sun, 7 Apr 2024 21:33:14 +0200
;   Subject: Adding upstream version 115.7.0esr.  Signed-off-by: Daniel Baumann
;   file: security/nss/lib/freebl/intel-gcm-x86-masm.asm (new file, cgit v1.2.3 export)
;
; NOTE(review): in the reviewed text every "textequ" directive had lost its
; <...> operand (looks like angle-bracket text was stripped as markup).  The
; operands below were restored so the file assembles again:
;   * register choices follow from constraints visible in the code (which
;     registers each PROC pushes/pops, explicit xmm2..xmm7 / edx / dl uses,
;     macro temporaries that must not alias);
;   * the memory operands "T" (ENC/DEC) and "NR" depend on the layout of the
;     C-side context structures, which is not visible here.
; Verify all of them against the pristine upstream file before shipping.
;
; LICENSE:
; This submission to NSS is to be made available under the terms of the
; Mozilla Public License, v. 2.0. You can obtain one at http:
; //mozilla.org/MPL/2.0/.
;###############################################################################
; Copyright(c) 2014, Intel Corp.
; Developers and authors:
; Shay Gueron and Vlad Krasnov
; Intel Corporation, Israel Development Centre, Haifa, Israel
; Please send feedback directly to crypto.feedback.alias@intel.com
;
; Syntax: MASM, 32-bit flat model, C calling convention (arguments on the
; stack, [esp+4] is the first argument on entry).  Every PROC below saves and
; restores whichever of ebx/esi/edi/ebp it uses.


.MODEL FLAT, C
.XMM

.DATA
ALIGN 16
Lone            dq 1,0
Ltwo            dq 2,0
Lbswap_mask     db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
Lshuff_mask     dq 0f0f0f0f0f0f0f0fh, 0f0f0f0f0f0f0f0fh
Lpoly           dq 01h, 0c200000000000000h

.CODE


;-------------------------------------------------------------------------------
; GFMUL DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
;   DST = SRC1 * SRC2, reduced with the constant in Lpoly.
;   One Karatsuba multiplication (three vpclmulqdq) followed by two folding
;   steps against Lpoly.
;   Clobbers TMP1..TMP4.  DST may be the same register as SRC1.
;   TMP1..TMP4 must be distinct from each other and from SRC1/SRC2.
;-------------------------------------------------------------------------------
GFMUL MACRO DST, SRC1, SRC2, TMP1, TMP2, TMP3, TMP4
    vpclmulqdq  TMP1, SRC2, SRC1, 0h            ; low  * low
    vpclmulqdq  TMP4, SRC2, SRC1, 011h          ; high * high

    vpshufd     TMP2, SRC2, 78                  ; 78 = swap the two qwords
    vpshufd     TMP3, SRC1, 78
    vpxor       TMP2, TMP2, SRC2                ; (hi ^ lo) of SRC2
    vpxor       TMP3, TMP3, SRC1                ; (hi ^ lo) of SRC1

    vpclmulqdq  TMP2, TMP2, TMP3, 0h            ; Karatsuba middle term
    vpxor       TMP2, TMP2, TMP1
    vpxor       TMP2, TMP2, TMP4

    vpslldq     TMP3, TMP2, 8
    vpsrldq     TMP2, TMP2, 8

    vpxor       TMP1, TMP1, TMP3                ; TMP1 = low  128 bits of product
    vpxor       TMP4, TMP4, TMP2                ; TMP4 = high 128 bits of product

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h       ; fold #1 of the low half
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpclmulqdq  TMP2, TMP1, [Lpoly], 010h       ; fold #2 of the low half
    vpshufd     TMP3, TMP1, 78
    vpxor       TMP1, TMP2, TMP3

    vpxor       DST, TMP1, TMP4                 ; result = folded low ^ high

    ENDM

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the final GCM tag
; void intel_aes_gcmTAG(unsigned char Htbl[16*16],
;                       unsigned char *Tp,
;                       unsigned int Mlen,
;                       unsigned int Alen,
;                       unsigned char* X0,
;                       unsigned char* TAG);
;
; Reads 16 bytes at Tp, XORs in a block built from Mlen*8 (low qword) and
; Alen*8 (high qword), multiplies by the first Htbl entry, byte-reverses the
; result, XORs it with the 16 bytes at X0 and stores 16 bytes at TAG.
; Clobbers eax, ecx, edx, xmm0-xmm5; ebx is saved and restored.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmTAG PROC

; NOTE(review): operands restored, see note at top of file.
; T/TMP0 must avoid xmm2-xmm5, which are passed to GFMUL as temporaries.
Htbl    textequ <eax>
Tp      textequ <ecx>
X0      textequ <edx>
TAG     textequ <ebx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    push    ebx

    ; one push + return address = 2*4 bytes in front of the arguments
    mov     Htbl,   [esp + 2*4 + 0*4]
    mov     Tp,     [esp + 2*4 + 1*4]
    mov     X0,     [esp + 2*4 + 4*4]
    mov     TAG,    [esp + 2*4 + 5*4]

    vzeroupper
    vmovdqu T, XMMWORD PTR[Tp]

    ; length block: dword 0 = Mlen, dword 2 = Alen, each qword shifted left
    ; by 3 (bytes -> bits)
    vpxor   TMP0, TMP0, TMP0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 2*4], 0
    vpinsrd TMP0, TMP0, DWORD PTR[esp + 2*4 + 3*4], 2
    vpsllq  TMP0, TMP0, 3

    vpxor   T, T, TMP0
    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5

    vpshufb T, T, [Lbswap_mask]
    vpxor   T, T, [X0]
    vmovdqu XMMWORD PTR[TAG], T
    vzeroupper

    pop     ebx

    ret

intel_aes_gcmTAG ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Generates the H table
; void intel_aes_gcmINIT(unsigned char Htbl[16*16], unsigned char *KS, int NR);
;
; Computes H = AES-ENC(0) with the NR-round key schedule at KS, byte-reverses
; it, multiplies it by 2 in the field and then fills the table:
;   Htbl[i*16]        (i = 0..7) : successive powers of that value
;   Htbl[8*16 + i*16] (i = 0..7) : (high qword ^ low qword) of entry i, in
;                                  both qwords (Karatsuba helper)
; Clobbers eax, ecx, edx, xmm0-xmm5.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmINIT PROC

; NOTE(review): operands restored, see note at top of file.
; No register is saved by this PROC, so only eax/ecx/edx are usable.
Htbl    textequ <eax>
KS      textequ <ecx>
NR      textequ <edx>

T       textequ <xmm0>
TMP0    textequ <xmm1>

    mov     Htbl,   [esp + 4*1 + 0*4]
    mov     KS,     [esp + 4*1 + 1*4]
    mov     NR,     [esp + 4*1 + 2*4]

    vzeroupper
    ; AES-ENC(0): the all-zero block XOR round key 0 is round key 0 itself
    vmovdqu T, XMMWORD PTR[KS]
    lea     KS, [16 + KS]
    dec     NR                          ; NR-1 full rounds, then the last round
Lenc_loop:
        vaesenc T, T, [KS]
        lea     KS, [16 + KS]
        dec     NR
        jnz     Lenc_loop

    vaesenclast T, T, [KS]
    vpshufb T, T, [Lbswap_mask]

    ;Calculate H` = GFMUL(H, 2)
    vpsrad  xmm3, T, 31
    vpshufd xmm3, xmm3, 0ffh            ; broadcast sign of the top dword
    vpand   xmm5, xmm3, [Lpoly]         ; polynomial, only if the top bit was set
    vpsrld  xmm3, T, 31                 ; carry bits between dwords
    vpslld  xmm4, T, 1
    vpslldq xmm3, xmm3, 4
    vpxor   T, xmm4, xmm3               ; 128-bit shift left by one
    vpxor   T, T, xmm5

    vmovdqu TMP0, T
    vmovdqu XMMWORD PTR[Htbl + 0*16], T

    vpshufd xmm2, T, 78
    vpxor   xmm2, xmm2, T
    vmovdqu XMMWORD PTR[Htbl + 8*16 + 0*16], xmm2

    ; entries 1..7: multiply the running power by the base value each time
    i = 1
    WHILE i LT 8
        GFMUL   T, T, TMP0, xmm2, xmm3, xmm4, xmm5
        vmovdqu XMMWORD PTR[Htbl + i*16], T
        vpshufd xmm2, T, 78
        vpxor   xmm2, xmm2, T
        vmovdqu XMMWORD PTR[Htbl + 8*16 + i*16], xmm2
        i = i+1
        ENDM
    vzeroupper
    ret
intel_aes_gcmINIT ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Authenticate only
; void intel_aes_gcmAAD(unsigned char Htbl[16*16], unsigned char *AAD, unsigned int Alen, unsigned char *Tp);
;
; Folds Alen bytes at AAD into the running hash stored at Tp (16 bytes, read
; and written back).  Returns immediately, leaving *Tp untouched, when
; Alen == 0.  The code consumes whole 16-byte blocks only; Alen is expected
; to be a multiple of 16.
; Clobbers eax, ecx, edx, xmm0-xmm7; ebx and esi are saved and restored.
;
; Fixes relative to the reviewed text:
;  * the Alen == 0 guard tested the dword at esp + 1*3 + 2*4 (= esp+11), which
;    straddles the AAD pointer and Alen; Alen lives at esp + 1*4 + 2*4.
;  * when Alen is a multiple of 128 the 8-block loop was entered with the
;    running hash in T and Xhi = 0.  The loop treats (Xhi:T) as an unreduced
;    256-bit value and folds T twice, which leaves T intact only when it is
;    zero.  The hash is now handed over as (Xhi = hash, T = 0).
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmAAD PROC

; NOTE(review): operands restored, see note at top of file.
; Five general registers are needed and exactly ebx/esi are saved; all eight
; xmm registers are in use and must be pairwise distinct.
Htbl    textequ <eax>
inp     textequ <ecx>
len     textequ <edx>
Tp      textequ <ebx>
hlp0    textequ <esi>

DATA    textequ <xmm0>
T       textequ <xmm1>
TMP0    textequ <xmm2>
TMP1    textequ <xmm3>
TMP2    textequ <xmm4>
TMP3    textequ <xmm5>
TMP4    textequ <xmm6>
Xhi     textequ <xmm7>

; Accumulate DATA * Htbl[i]: TMP0 += low product, TMP1 += high product,
; TMP2 += Karatsuba middle product.  Clobbers TMP3.
KARATSUBA_AAD MACRO i
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 0h
    vpxor       TMP0, TMP0, TMP3
    vpclmulqdq  TMP3, DATA, [Htbl + i*16], 011h
    vpxor       TMP1, TMP1, TMP3
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP3, TMP3, [Htbl + 8*16 + i*16], 0h
    vpxor       TMP2, TMP2, TMP3
ENDM

    ; nothing pushed yet: return address (1*4) + two arguments (2*4) = Alen
    cmp   DWORD PTR[esp + 1*4 + 2*4], 0
    jnz   LbeginAAD
    ret

LbeginAAD:
    push    ebx
    push    esi

    ; two pushes + return address = 4*3 bytes in front of the arguments
    mov     Htbl,   [esp + 4*3 + 0*4]
    mov     inp,    [esp + 4*3 + 1*4]
    mov     len,    [esp + 4*3 + 2*4]
    mov     Tp,     [esp + 4*3 + 3*4]

    vzeroupper

    vpxor   Xhi, Xhi, Xhi

    vmovdqu T, XMMWORD PTR[Tp]
    ;we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
    mov     hlp0, len
    and     hlp0, 128-1
    jnz     LprefixAAD

    ; No prefix blocks.  Lmod_loop computes fold(fold(T)) ^ Xhi before using
    ; the hash, so pass the already-reduced hash in Xhi with a zero low half.
    vmovdqa Xhi, T
    vpxor   T, T, T
    jmp     Lmod_loop

LprefixAAD:
    and     len, -128                   ; bytes left for the 8-block loop
    sub     hlp0, 16                    ; table offset for the first prefix block

    ; Prefix block
    vmovdqu DATA, XMMWORD PTR[inp]
    vpshufb DATA, DATA, [Lbswap_mask]
    vpxor   DATA, DATA, T               ; running hash enters with the oldest block

    vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + hlp0], 0h
    vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + hlp0], 011h
    vpshufd     TMP3, DATA, 78
    vpxor       TMP3, TMP3, DATA
    vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h

    lea     inp, [inp+16]
    test    hlp0, hlp0
    jnz     Lpre_loop
    jmp     Lred1

    ;hash remaining prefix blocks (up to 7 total prefix blocks)
Lpre_loop:

        sub     hlp0, 16

        vmovdqu DATA, XMMWORD PTR[inp]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 0h
        vpxor       TMP0, TMP0, TMP3
        vpclmulqdq  TMP3, DATA, XMMWORD PTR[Htbl + hlp0], 011h
        vpxor       TMP1, TMP1, TMP3
        vpshufd     TMP3, DATA, 78
        vpxor       TMP3, TMP3, DATA
        vpclmulqdq  TMP3, TMP3, XMMWORD PTR[Htbl + 8*16 + hlp0], 0h
        vpxor       TMP2, TMP2, TMP3

        test    hlp0, hlp0
        lea     inp, [inp+16]           ; lea leaves the flags from test intact
        jnz     Lpre_loop

Lred1:

    ; Karatsuba fixup: (Xhi:T) = unreduced 256-bit sum of products
    vpxor       TMP2, TMP2, TMP0
    vpxor       TMP2, TMP2, TMP1
    vpsrldq     TMP3, TMP2, 8
    vpslldq     TMP2, TMP2, 8

    vpxor       Xhi, TMP1, TMP3
    vpxor       T, TMP0, TMP2

Lmod_loop:

        ; Invariant on entry: (Xhi:T) is unreduced; its reduction (two folds
        ; of T, then ^ Xhi) is interleaved with the multiplications below.
        sub     len, 16*8
        jb      Ldone
        ; Block #0
        vmovdqu DATA, XMMWORD PTR[inp + 16*7]
        vpshufb DATA, DATA, XMMWORD PTR[Lbswap_mask]

        vpclmulqdq  TMP0, DATA, XMMWORD PTR[Htbl + 0*16], 0h
        vpclmulqdq  TMP1, DATA, XMMWORD PTR[Htbl + 0*16], 011h
        vpshufd     TMP3, DATA, 78
        vpxor       TMP3, TMP3, DATA
        vpclmulqdq  TMP2, TMP3, XMMWORD PTR[Htbl + 8*16 + 0*16], 0h

        ; Block #1
        vmovdqu DATA, XMMWORD PTR[inp + 16*6]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 1

        ; Block #2
        vmovdqu DATA, XMMWORD PTR[inp + 16*5]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq  TMP4, T, [Lpoly], 010h          ;reduction stage 1a
        vpalignr    T, T, T, 8

        KARATSUBA_AAD 2

        vpxor       T, T, TMP4                      ;reduction stage 1b

        ; Block #3
        vmovdqu DATA, XMMWORD PTR[inp + 16*4]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 3
        ; Block #4
        vmovdqu DATA, XMMWORD PTR[inp + 16*3]
        vpshufb DATA, DATA, [Lbswap_mask]

        vpclmulqdq  TMP4, T, [Lpoly], 010h          ;reduction stage 2a
        vpalignr    T, T, T, 8

        KARATSUBA_AAD 4

        vpxor       T, T, TMP4                      ;reduction stage 2b
        ; Block #5
        vmovdqu DATA, XMMWORD PTR[inp + 16*2]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 5

        vpxor   T, T, Xhi                           ;reduction finalize
        ; Block #6
        vmovdqu DATA, XMMWORD PTR[inp + 16*1]
        vpshufb DATA, DATA, [Lbswap_mask]
        KARATSUBA_AAD 6
        ; Block #7
        vmovdqu DATA, XMMWORD PTR[inp + 16*0]
        vpshufb DATA, DATA, [Lbswap_mask]
        vpxor   DATA, DATA, T                       ; oldest block carries the hash
        KARATSUBA_AAD 7
        ; Aggregated 8 blocks, now karatsuba fixup
        vpxor   TMP2, TMP2, TMP0
        vpxor   TMP2, TMP2, TMP1
        vpsrldq TMP3, TMP2, 8
        vpslldq TMP2, TMP2, 8

        vpxor   Xhi, TMP1, TMP3
        vpxor   T, TMP0, TMP2

        lea     inp, [inp + 16*8]
        jmp     Lmod_loop

Ldone:
    ; final reduction of (Xhi:T)
    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpclmulqdq  TMP4, T, [Lpoly], 010h
    vpalignr    T, T, T, 8
    vpxor       T, T, TMP4

    vpxor       T, T, Xhi
    vmovdqu     XMMWORD PTR[Tp], T
    vzeroupper

    pop     esi
    pop     ebx
    ret

intel_aes_gcmAAD ENDP


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Encrypt and Authenticate
; void intel_aes_gcmENC(unsigned char* PT, unsigned char* CT, void *Gctx, unsigned int len);
;
; Reads len bytes at PT, writes len bytes at CT, and updates the running hash
; (T) and the last counter dword inside Gctx.  Returns immediately when
; len == 0.  Offsets into Gctx used by the code:
;   0                   Htbl (see intel_aes_gcmINIT)
;   16*16 + 2*16        counter block (last dword is the big-endian counter)
;   16*16 + 3*16        pointer to the AES key schedule
;
; Processing order: 7 blocks at a time (hashing of the previous 7 ciphertext
; blocks precedes the encryption of the next 7), then single blocks, then one
; partial block.
;
; Stack frame (esp 16-byte aligned, 16*16 bytes):
;   [esp + 0*16]            scratch for the partial tail block
;   [esp + 1*16 .. 6*16]    byte-reversed ciphertext blocks awaiting hashing
;   [esp + 8*16 .. 14*16]   counter blocks already XORed with round key 0
;
; Clobbers eax, ecx, edx, xmm0-xmm7; ebp/ebx/esi/edi are saved and restored.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

ALIGN 16
intel_aes_gcmENC PROC

; NOTE(review): operands restored, see note at top of file.
;  * Htbl is never loaded separately, so it must alias Gctx (table at offset 0).
;  * Gctx must be edx: the tail code parks edx in aluTMP while it uses dl.
;  * len is only ever used as a memory operand: the 4th argument, addressed
;    through ebp after four pushes.
;  * NR and T are memory operands whose offsets depend on C-side structure
;    layouts not visible in this file -- TODO confirm 244 and 16*16 + 1*16.
;  * CTR0..CTR6 plus the explicit xmm7 round key need all eight registers;
;    TMP5 must share CTR6's register; TMP0..TMP4 must stay clear of xmm7.
;  These aliases stay in force for intel_aes_gcmDEC as well.
PT      textequ <eax>
CT      textequ <ecx>
Htbl    textequ <edx>
Gctx    textequ <edx>
len     textequ <DWORD PTR[ebp + 5*4 + 3*4]>
KS      textequ <esi>
NR      textequ <DWORD PTR[244 + KS]>

aluCTR  textequ <ebx>
aluTMP  textequ <edi>

T       textequ <XMMWORD PTR[16*16 + 1*16 + Gctx]>
TMP0    textequ <xmm1>
TMP1    textequ <xmm2>
TMP2    textequ <xmm3>
TMP3    textequ <xmm4>
TMP4    textequ <xmm5>
TMP5    textequ <xmm6>

CTR0    textequ <xmm0>
CTR1    textequ <xmm1>
CTR2    textequ <xmm2>
CTR3    textequ <xmm3>
CTR4    textequ <xmm4>
CTR5    textequ <xmm5>
CTR6    textequ <xmm6>

; One AES round with round key i on all seven counter blocks.  Clobbers xmm7.
ROUND MACRO i
    vmovdqu xmm7, XMMWORD PTR[i*16 + KS]
    vaesenc CTR0, CTR0, xmm7
    vaesenc CTR1, CTR1, xmm7
    vaesenc CTR2, CTR2, xmm7
    vaesenc CTR3, CTR3, xmm7
    vaesenc CTR4, CTR4, xmm7
    vaesenc CTR5, CTR5, xmm7
    vaesenc CTR6, CTR6, xmm7
ENDM

; Accumulate TMP5 * Htbl[i]: TMP0 += Karatsuba middle product,
; TMP1 += high product, TMP2 += low product.  Clobbers TMP3, TMP4.
KARATSUBA MACRO i
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq  TMP3, TMP4, XMMWORD PTR[i*16 + 8*16 + Htbl], 000h
    vpxor       TMP0, TMP0, TMP3
    vmovdqu     TMP4, XMMWORD PTR[i*16 + Htbl]
    vpclmulqdq  TMP3, TMP5, TMP4, 011h
    vpxor       TMP1, TMP1, TMP3
    vpclmulqdq  TMP3, TMP5, TMP4, 000h
    vpxor       TMP2, TMP2, TMP3
ENDM

; Advance the counter and patch the last dword of counter slot i (ENC layout:
; slots start at esp + 8*16).  The slot already holds block ^ round key 0, so
; the byte-swapped counter is XORed with the matching key dword first.
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + 8*16 + i*16 + esp], aluTMP
ENDM

    cmp DWORD PTR[1*4 + 3*4 + esp], 0       ; len == 0 ?
    jne LbeginENC
    ret

LbeginENC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov ebp, esp
    sub esp, 16*16
    and esp, -16                            ; vmovdqa is used on the frame

    ; four pushes + return address = 5*4 bytes in front of the arguments
    mov PT, [ebp + 5*4 + 0*4]
    mov CT, [ebp + 5*4 + 1*4]
    mov Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR


    ; counter slot 0 = current counter block ^ round key 0
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[8*16 + 0*16 + esp], TMP0

    cmp len, 16*7
    jb  LEncDataSingles
; Prepare the "top" counters
    vmovdqu XMMWORD PTR[8*16 + 1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[8*16 + 6*16 + esp], TMP0

    vmovdqu CTR0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
; Encrypt the initial 7 blocks
    sub len, 16*7
    vpaddd  CTR1, CTR0, XMMWORD PTR[Lone]
    vpaddd  CTR2, CTR0, XMMWORD PTR[Ltwo]
    vpaddd  CTR3, CTR2, XMMWORD PTR[Lone]
    vpaddd  CTR4, CTR2, XMMWORD PTR[Ltwo]
    vpaddd  CTR5, CTR4, XMMWORD PTR[Lone]
    vpaddd  CTR6, CTR4, XMMWORD PTR[Ltwo]

    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR6, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqu xmm7, XMMWORD PTR[0*16 + KS]
    vpxor   CTR0, CTR0, xmm7
    vpxor   CTR1, CTR1, xmm7
    vpxor   CTR2, CTR2, xmm7
    vpxor   CTR3, CTR3, xmm7
    vpxor   CTR4, CTR4, xmm7
    vpxor   CTR5, CTR5, xmm7
    vpxor   CTR6, CTR6, xmm7

    ROUND   1

    ; counters for the NEXT group of 7 are prepared between the AES rounds
    add aluCTR, 7
    mov aluTMP, aluCTR
    bswap   aluTMP
    xor aluTMP, [KS + 3*4]
    mov [8*16 + 0*16 + 3*4 + esp], aluTMP

    ROUND   2
    NEXTCTR 1
    ROUND   3
    NEXTCTR 2
    ROUND   4
    NEXTCTR 3
    ROUND   5
    NEXTCTR 4
    ROUND   6
    NEXTCTR 5
    ROUND   7
    NEXTCTR 6
    ROUND   8
    ROUND   9
    vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
    cmp     NR, 10
    je      @f

    ROUND   10
    ROUND   11
    vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
    cmp     NR, 12
    je      @f

    ROUND   12
    ROUND   13
    vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast CTR0, CTR0, xmm7
    vaesenclast CTR1, CTR1, xmm7
    vaesenclast CTR2, CTR2, xmm7
    vaesenclast CTR3, CTR3, xmm7
    vaesenclast CTR4, CTR4, xmm7
    vaesenclast CTR5, CTR5, xmm7
    vaesenclast CTR6, CTR6, xmm7

    vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
    vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
    vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
    vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
    vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
    vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
    vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

    vmovdqu XMMWORD PTR[0*16 + CT], CTR0
    vmovdqu XMMWORD PTR[1*16 + CT], CTR1
    vmovdqu XMMWORD PTR[2*16 + CT], CTR2
    vmovdqu XMMWORD PTR[3*16 + CT], CTR3
    vmovdqu XMMWORD PTR[4*16 + CT], CTR4
    vmovdqu XMMWORD PTR[5*16 + CT], CTR5
    vmovdqu XMMWORD PTR[6*16 + CT], CTR6

    ; keep the byte-reversed ciphertext for hashing: newest block stays in
    ; TMP5, the older ones go to slots 1..6 (oldest in slot 6)
    vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
    vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
    vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

    vmovdqa XMMWORD PTR[1*16 + esp], CTR5
    vmovdqa XMMWORD PTR[2*16 + esp], CTR4
    vmovdqa XMMWORD PTR[3*16 + esp], CTR3
    vmovdqa XMMWORD PTR[4*16 + esp], CTR2
    vmovdqa XMMWORD PTR[5*16 + esp], CTR1
    vmovdqa XMMWORD PTR[6*16 + esp], CTR0

    lea CT, [7*16 + CT]
    lea PT, [7*16 + PT]
    jmp LEncData7

LEncData7:
        cmp len, 16*7
        jb  LEndEnc7
        sub len, 16*7

        ; hash the previous 7 ciphertext blocks
        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5
        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
        vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
        vpclmulqdq  TMP1, TMP5, TMP4, 011h
        vpclmulqdq  TMP2, TMP5, TMP4, 000h

        vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
        KARATSUBA 1
        vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
        KARATSUBA 2
        vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
        KARATSUBA 3
        vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
        KARATSUBA 4
        vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
        KARATSUBA 5
        vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
        vpxor   TMP5, TMP5, T               ; oldest block carries the hash
        KARATSUBA 6

        ; Karatsuba fixup: TMP4 = high half, TMP5 = low half
        vpxor   TMP0, TMP0, TMP1
        vpxor   TMP0, TMP0, TMP2
        vpsrldq TMP3, TMP0, 8
        vpxor   TMP4, TMP1, TMP3
        vpslldq TMP3, TMP0, 8
        vpxor   TMP5, TMP2, TMP3

        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
        vpalignr    TMP5,TMP5,TMP5,8
        vpxor       TMP5, TMP5, TMP1

        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
        vpalignr    TMP5,TMP5,TMP5,8
        vpxor       TMP5, TMP5, TMP1

        vpxor       TMP5, TMP5, TMP4
        vmovdqu     T, TMP5

        ; encrypt the next 7 counter blocks (round key 0 already applied)
        vmovdqa CTR0, XMMWORD PTR[8*16 + 0*16 + esp]
        vmovdqa CTR1, XMMWORD PTR[8*16 + 1*16 + esp]
        vmovdqa CTR2, XMMWORD PTR[8*16 + 2*16 + esp]
        vmovdqa CTR3, XMMWORD PTR[8*16 + 3*16 + esp]
        vmovdqa CTR4, XMMWORD PTR[8*16 + 4*16 + esp]
        vmovdqa CTR5, XMMWORD PTR[8*16 + 5*16 + esp]
        vmovdqa CTR6, XMMWORD PTR[8*16 + 6*16 + esp]

        ROUND   1
        NEXTCTR 0
        ROUND   2
        NEXTCTR 1
        ROUND   3
        NEXTCTR 2
        ROUND   4
        NEXTCTR 3
        ROUND   5
        NEXTCTR 4
        ROUND   6
        NEXTCTR 5
        ROUND   7
        NEXTCTR 6

        ROUND   8
        ROUND   9

        vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f

        ROUND   10
        ROUND   11
        vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f

        ROUND   12
        ROUND   13
        vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast CTR0, CTR0, xmm7
        vaesenclast CTR1, CTR1, xmm7
        vaesenclast CTR2, CTR2, xmm7
        vaesenclast CTR3, CTR3, xmm7
        vaesenclast CTR4, CTR4, xmm7
        vaesenclast CTR5, CTR5, xmm7
        vaesenclast CTR6, CTR6, xmm7

        vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + PT]
        vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + PT]
        vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + PT]
        vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + PT]
        vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + PT]
        vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + PT]
        vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + PT]

        vmovdqu XMMWORD PTR[0*16 + CT], CTR0
        vmovdqu XMMWORD PTR[1*16 + CT], CTR1
        vmovdqu XMMWORD PTR[2*16 + CT], CTR2
        vmovdqu XMMWORD PTR[3*16 + CT], CTR3
        vmovdqu XMMWORD PTR[4*16 + CT], CTR4
        vmovdqu XMMWORD PTR[5*16 + CT], CTR5
        vmovdqu XMMWORD PTR[6*16 + CT], CTR6

        vpshufb CTR0, CTR0, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR1, CTR1, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR2, CTR2, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR3, CTR3, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR4, CTR4, XMMWORD PTR[Lbswap_mask]
        vpshufb CTR5, CTR5, XMMWORD PTR[Lbswap_mask]
        vpshufb TMP5, CTR6, XMMWORD PTR[Lbswap_mask]

        vmovdqa XMMWORD PTR[1*16 + esp], CTR5
        vmovdqa XMMWORD PTR[2*16 + esp], CTR4
        vmovdqa XMMWORD PTR[3*16 + esp], CTR3
        vmovdqa XMMWORD PTR[4*16 + esp], CTR2
        vmovdqa XMMWORD PTR[5*16 + esp], CTR1
        vmovdqa XMMWORD PTR[6*16 + esp], CTR0

        lea CT, [7*16 + CT]
        lea PT, [7*16 + PT]
        jmp LEncData7

LEndEnc7:

    ; hash the last group of 7 ciphertext blocks
    vpshufd TMP4, TMP5, 78
    vpxor   TMP4, TMP4, TMP5
    vpclmulqdq  TMP0, TMP4, XMMWORD PTR[0*16 + 8*16 + Htbl], 000h
    vmovdqu     TMP4, XMMWORD PTR[0*16 + Htbl]
    vpclmulqdq  TMP1, TMP5, TMP4, 011h
    vpclmulqdq  TMP2, TMP5, TMP4, 000h

    vmovdqu TMP5, XMMWORD PTR[1*16 + esp]
    KARATSUBA 1
    vmovdqu TMP5, XMMWORD PTR[2*16 + esp]
    KARATSUBA 2
    vmovdqu TMP5, XMMWORD PTR[3*16 + esp]
    KARATSUBA 3
    vmovdqu TMP5, XMMWORD PTR[4*16 + esp]
    KARATSUBA 4
    vmovdqu TMP5, XMMWORD PTR[5*16 + esp]
    KARATSUBA 5
    vmovdqu TMP5, XMMWORD PTR[6*16 + esp]
    vpxor   TMP5, TMP5, T
    KARATSUBA 6

    vpxor   TMP0, TMP0, TMP1
    vpxor   TMP0, TMP0, TMP2
    vpsrldq TMP3, TMP0, 8
    vpxor   TMP4, TMP1, TMP3
    vpslldq TMP3, TMP0, 8
    vpxor   TMP5, TMP2, TMP3

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
    vpalignr    TMP5,TMP5,TMP5,8
    vpxor       TMP5, TMP5, TMP1

    vpxor       TMP5, TMP5, TMP4
    vmovdqu     T, TMP5

    ; aluCTR ran 6 ahead while slots 1..6 were prepared; bring it back to the
    ; counter value held in slot 0
    sub aluCTR, 6

LEncDataSingles:

        cmp len, 16
        jb  LEncDataTail
        sub len, 16

        vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]
        NEXTCTR 0

        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp NR, 10
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp NR, 12
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2
        vpxor   TMP1, TMP1, XMMWORD PTR[PT]
        vmovdqu XMMWORD PTR[CT], TMP1

        lea PT, [16+PT]
        lea CT, [16+CT]

        vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
        vpxor   TMP1, TMP1, T

        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
        vmovdqu T, TMP1

        jmp LEncDataSingles

LEncDataTail:

    cmp len, 0
    je  LEncDataEnd

    vmovdqa TMP1, XMMWORD PTR[8*16 + 0*16 + esp]

    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp NR, 10
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp NR, 12
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    vaesenclast TMP1, TMP1, TMP2
; zero a temp location
    vpxor   TMP2, TMP2, TMP2
    vmovdqa XMMWORD PTR[esp], TMP2
; copy as many bytes as needed
    ; from here on KS is reused as a byte index (the key schedule is no longer
    ; needed) and edx is parked in aluTMP because dl is the byte temporary
    xor KS, KS
    mov aluTMP, edx
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[PT + KS]
        mov BYTE PTR[esp + KS], dl
        inc KS
        jmp @b
@@:
    vpxor   TMP1, TMP1, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], TMP1
    xor KS, KS
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[esp + KS]
        mov BYTE PTR[CT + KS], dl
        inc KS
        jmp @b
@@:
        ; zero the bytes past len so only real ciphertext is hashed
        cmp KS, 16
        je  @f
        mov BYTE PTR[esp + KS], 0
        inc KS
        jmp @b
@@:
    mov edx, aluTMP                         ; Gctx/Htbl valid again
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

LEncDataEnd:
    ; NOTE(review): this increment also runs when there was no partial tail
    ; block, whereas intel_aes_gcmDEC increments only on its tail path, so the
    ; two routines store different counters for whole-block input.  Left
    ; unchanged; confirm which behaviour the callers rely on.
    inc     aluCTR
    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov esp, ebp
    pop edi
    pop esi
    pop ebx
    pop ebp


    vzeroupper

    ret
intel_aes_gcmENC ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Decrypt and Authenticate
; void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, unsigned int len);
;
; Despite the parameter names in the prototype above, the code reads its
; input (ciphertext) through the FIRST argument and writes its output through
; the SECOND one.  len bytes are processed; the running hash (T) and the last
; counter dword inside Gctx are updated.  Returns immediately when len == 0.
;
; Uses the register/memory aliases and the ROUND / KARATSUBA macros defined
; for intel_aes_gcmENC.  Each ciphertext block is hashed before it is
; decrypted.
;
; Stack frame (esp 16-byte aligned, 8*16 bytes):
;   [esp + 0*16 .. 6*16]    counter blocks already XORed with round key 0;
;                           slot 0 doubles as scratch for the partial tail
;
; Clobbers eax, ecx, edx, xmm0-xmm7; ebp/ebx/esi/edi are saved and restored.
;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


; Same as the ENC version, but the counter slots start at esp + 0 here.
NEXTCTR MACRO i
    add     aluCTR, 1
    mov     aluTMP, aluCTR
    bswap   aluTMP
    xor     aluTMP, [3*4 + KS]
    mov     [3*4 + i*16 + esp], aluTMP
ENDM

intel_aes_gcmDEC PROC

    cmp DWORD PTR[1*4 + 3*4 + esp], 0       ; len == 0 ?
    jne LbeginDEC
    ret

LbeginDEC:

    vzeroupper
    push    ebp
    push    ebx
    push    esi
    push    edi

    mov ebp, esp
    sub esp, 8*16
    and esp, -16                            ; vmovdqa is used on the frame

    mov CT, [ebp + 5*4 + 0*4]               ; input  (first argument)
    mov PT, [ebp + 5*4 + 1*4]               ; output (second argument)
    mov Gctx, [ebp + 5*4 + 2*4]

    mov     KS, [16*16 + 3*16 + Gctx]

    mov     aluCTR, [16*16 + 2*16 + 3*4 + Gctx]
    bswap   aluCTR


    ; counter slot 0 = current counter block ^ round key 0
    vmovdqu TMP0, XMMWORD PTR[0*16 + KS]
    vpxor   TMP0, TMP0, XMMWORD PTR[16*16 + 2*16 + Gctx]
    vmovdqu XMMWORD PTR[0*16 + esp], TMP0

    cmp len, 16*7
    jb  LDecDataSingles
    vmovdqu XMMWORD PTR[1*16 + esp], TMP0
    vmovdqu XMMWORD PTR[2*16 + esp], TMP0
    vmovdqu XMMWORD PTR[3*16 + esp], TMP0
    vmovdqu XMMWORD PTR[4*16 + esp], TMP0
    vmovdqu XMMWORD PTR[5*16 + esp], TMP0
    vmovdqu XMMWORD PTR[6*16 + esp], TMP0
    dec aluCTR                              ; NEXTCTR 0 below re-adds 1

LDecData7:
        cmp len, 16*7
        jb  LDecData7End
        sub len, 16*7

        ; hash 7 ciphertext blocks (oldest first, highest table entry) while
        ; the 7 counter slots are being filled in
        vmovdqu TMP5, XMMWORD PTR[0*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        vpxor   TMP5, TMP5, T
        vpshufd TMP4, TMP5, 78
        vpxor   TMP4, TMP4, TMP5
        vpclmulqdq  TMP0, TMP4, XMMWORD PTR[6*16 + 8*16 + Htbl], 000h
        vmovdqu     TMP4, XMMWORD PTR[6*16 + Htbl]
        vpclmulqdq  TMP1, TMP5, TMP4, 011h
        vpclmulqdq  TMP2, TMP5, TMP4, 000h

        NEXTCTR 0
        vmovdqu TMP5, XMMWORD PTR[1*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 5
        NEXTCTR 1
        vmovdqu TMP5, XMMWORD PTR[2*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 4
        NEXTCTR 2
        vmovdqu TMP5, XMMWORD PTR[3*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 3
        NEXTCTR 3
        vmovdqu TMP5, XMMWORD PTR[4*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 2
        NEXTCTR 4
        vmovdqu TMP5, XMMWORD PTR[5*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 1
        NEXTCTR 5
        vmovdqu TMP5, XMMWORD PTR[6*16 + CT]
        vpshufb TMP5, TMP5, XMMWORD PTR[Lbswap_mask]
        KARATSUBA 0
        NEXTCTR 6

        ; Karatsuba fixup: TMP4 = high half, TMP5 = low half
        vpxor   TMP0, TMP0, TMP1
        vpxor   TMP0, TMP0, TMP2
        vpsrldq TMP3, TMP0, 8
        vpxor   TMP4, TMP1, TMP3
        vpslldq TMP3, TMP0, 8
        vpxor   TMP5, TMP2, TMP3

        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
        vpalignr    TMP5,TMP5,TMP5,8
        vpxor       TMP5, TMP5, TMP1

        vpclmulqdq  TMP1, TMP5, XMMWORD PTR[Lpoly], 010h
        vpalignr    TMP5,TMP5,TMP5,8
        vpxor       TMP5, TMP5, TMP1

        vpxor       TMP5, TMP5, TMP4
        vmovdqu     T, TMP5

        vmovdqa CTR0, XMMWORD PTR[0*16 + esp]
        vmovdqa CTR1, XMMWORD PTR[1*16 + esp]
        vmovdqa CTR2, XMMWORD PTR[2*16 + esp]
        vmovdqa CTR3, XMMWORD PTR[3*16 + esp]
        vmovdqa CTR4, XMMWORD PTR[4*16 + esp]
        vmovdqa CTR5, XMMWORD PTR[5*16 + esp]
        vmovdqa CTR6, XMMWORD PTR[6*16 + esp]

        ROUND   1
        ROUND   2
        ROUND   3
        ROUND   4
        ROUND   5
        ROUND   6
        ROUND   7
        ROUND   8
        ROUND   9
        vmovdqu xmm7, XMMWORD PTR[10*16 + KS]
        cmp     NR, 10
        je      @f

        ROUND   10
        ROUND   11
        vmovdqu xmm7, XMMWORD PTR[12*16 + KS]
        cmp     NR, 12
        je      @f

        ROUND   12
        ROUND   13
        vmovdqu xmm7, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast CTR0, CTR0, xmm7
        vaesenclast CTR1, CTR1, xmm7
        vaesenclast CTR2, CTR2, xmm7
        vaesenclast CTR3, CTR3, xmm7
        vaesenclast CTR4, CTR4, xmm7
        vaesenclast CTR5, CTR5, xmm7
        vaesenclast CTR6, CTR6, xmm7

        vpxor   CTR0, CTR0, XMMWORD PTR[0*16 + CT]
        vpxor   CTR1, CTR1, XMMWORD PTR[1*16 + CT]
        vpxor   CTR2, CTR2, XMMWORD PTR[2*16 + CT]
        vpxor   CTR3, CTR3, XMMWORD PTR[3*16 + CT]
        vpxor   CTR4, CTR4, XMMWORD PTR[4*16 + CT]
        vpxor   CTR5, CTR5, XMMWORD PTR[5*16 + CT]
        vpxor   CTR6, CTR6, XMMWORD PTR[6*16 + CT]

        vmovdqu XMMWORD PTR[0*16 + PT], CTR0
        vmovdqu XMMWORD PTR[1*16 + PT], CTR1
        vmovdqu XMMWORD PTR[2*16 + PT], CTR2
        vmovdqu XMMWORD PTR[3*16 + PT], CTR3
        vmovdqu XMMWORD PTR[4*16 + PT], CTR4
        vmovdqu XMMWORD PTR[5*16 + PT], CTR5
        vmovdqu XMMWORD PTR[6*16 + PT], CTR6

        lea CT, [7*16 + CT]
        lea PT, [7*16 + PT]
        jmp LDecData7

LDecData7End:

    NEXTCTR 0                               ; slot 0 = next unused counter

LDecDataSingles:

        cmp len, 16
        jb  LDecDataTail
        sub len, 16

        ; hash the ciphertext block first ...
        vmovdqu TMP1, XMMWORD PTR[CT]
        vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
        vpxor   TMP1, TMP1, T

        vmovdqu TMP0, XMMWORD PTR[Htbl]
        GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
        vmovdqu T, TMP1

        ; ... then decrypt it
        vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
        NEXTCTR 0

        vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
        cmp NR, 10
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
        cmp NR, 12
        je  @f
        vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
        vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
        vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
        vaesenclast TMP1, TMP1, TMP2
        vpxor   TMP1, TMP1, XMMWORD PTR[CT]
        vmovdqu XMMWORD PTR[PT], TMP1

        lea PT, [16+PT]
        lea CT, [16+CT]
        jmp LDecDataSingles

LDecDataTail:

    cmp len, 0
    je  LDecDataEnd

    vmovdqa TMP1, XMMWORD PTR[0*16 + esp]
    inc aluCTR
    vaesenc TMP1, TMP1, XMMWORD PTR[1*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[2*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[3*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[4*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[5*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[6*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[7*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[8*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[9*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[10*16 + KS]
    cmp NR, 10
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[10*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[11*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[12*16 + KS]
    cmp NR, 12
    je  @f
    vaesenc TMP1, TMP1, XMMWORD PTR[12*16 + KS]
    vaesenc TMP1, TMP1, XMMWORD PTR[13*16 + KS]
    vmovdqu TMP2, XMMWORD PTR[14*16 + KS]
@@:
    ; keystream block is kept in xmm7: it must survive the GFMUL below, which
    ; uses TMP0..TMP5 (none of which may therefore be xmm7)
    vaesenclast xmm7, TMP1, TMP2

; copy as many bytes as needed
    ; KS is reused as a byte index; edx is parked in aluTMP while dl is used
    xor KS, KS
    mov aluTMP, edx
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[CT + KS]
        mov BYTE PTR[esp + KS], dl
        inc KS
        jmp @b
@@:
        ; zero-pad the partial ciphertext block up to 16 bytes
        cmp KS, 16
        je  @f
        mov BYTE PTR[esp + KS], 0
        inc KS
        jmp @b
@@:
    mov edx, aluTMP                         ; Gctx/Htbl valid again
    vmovdqa TMP1, XMMWORD PTR[esp]
    vpshufb TMP1, TMP1, XMMWORD PTR[Lbswap_mask]
    vpxor   TMP1, TMP1, T

    vmovdqu TMP0, XMMWORD PTR[Htbl]
    GFMUL   TMP1, TMP1, TMP0, TMP5, TMP2, TMP3, TMP4
    vmovdqu T, TMP1

    vpxor   xmm7, xmm7, XMMWORD PTR[esp]
    vmovdqa XMMWORD PTR[esp], xmm7
    xor KS, KS
    mov aluTMP, edx
@@:
        cmp len, KS
        je  @f
        mov dl, BYTE PTR[esp + KS]
        mov BYTE PTR[PT + KS], dl
        inc KS
        jmp @b
@@:
    mov edx, aluTMP

LDecDataEnd:

    bswap   aluCTR
    mov     [16*16 + 2*16 + 3*4 + Gctx], aluCTR

    mov esp, ebp
    pop edi
    pop esi
    pop ebx
    pop ebp

    vzeroupper

    ret
intel_aes_gcmDEC ENDP


END