summaryrefslogtreecommitdiffstats
path: root/security/nss/lib/freebl/intel-gcm.s
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 19:33:14 +0000
commit36d22d82aa202bb199967e9512281e9a53db42c9 (patch)
tree105e8c98ddea1c1e4784a60a5a6410fa416be2de /security/nss/lib/freebl/intel-gcm.s
parentInitial commit. (diff)
downloadfirefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.tar.xz
firefox-esr-36d22d82aa202bb199967e9512281e9a53db42c9.zip
Adding upstream version 115.7.0esr.upstream/115.7.0esr
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'security/nss/lib/freebl/intel-gcm.s')
-rw-r--r--security/nss/lib/freebl/intel-gcm.s1340
1 files changed, 1340 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/intel-gcm.s b/security/nss/lib/freebl/intel-gcm.s
new file mode 100644
index 0000000000..5b5cf5d4bb
--- /dev/null
+++ b/security/nss/lib/freebl/intel-gcm.s
@@ -0,0 +1,1340 @@
+# LICENSE:
+# This submission to NSS is to be made available under the terms of the
+# Mozilla Public License, v. 2.0. You can obtain one at
+# http://mozilla.org/MPL/2.0/.
+################################################################################
+# Copyright(c) 2012, Intel Corp.
+
+# Shared 16-byte constants. The block starts on a 16-byte boundary and every
+# entry is exactly 16 bytes long, so each label below is 16-byte aligned too.
+.align 16
+# Low qword = 1: added with vpaddd to advance a byte-swapped counter block by 1.
+.Lone:
+.quad 1,0
+# Low qword = 2: added with vpaddd to advance a byte-swapped counter block by 2.
+.Ltwo:
+.quad 2,0
+# vpshufb control that reverses the order of the 16 bytes of a register.
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+# vpshufb control with index 15 in every byte (copies source byte 15 to all
+# 16 bytes); used by intel_aes_gcmINIT.
+.Lshuff_mask:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+# Constant used by the vpclmulqdq $0x10 folding steps (its high qword,
+# 0xc200000000000000, is the multiplier) and by intel_aes_gcmINIT.
+.Lpoly:
+.quad 0x1, 0xc200000000000000
+
+
+################################################################################
+# Generates the final GCM tag
+# void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
+#
+# In:  Htbl (%rdi) - table; only its first 16 bytes are read here (GFMUL multiplier)
+#      Tp   (%rsi) - 16-byte running hash value (read only)
+#      Mlen (%rdx) - length; multiplied by 8 below
+#      Alen (%rcx) - length; multiplied by 8 below
+#                    (presumably both are byte counts turned into bit counts - confirm with caller)
+#      X0   (%r8)  - 16 bytes XORed into the result before it is stored
+#      TAG  (%r9)  - 16-byte output buffer
+# Clobbers: %rdx, %rcx, flags, %xmm0, %xmm1 and the scratch registers of GFMUL.
+.type intel_aes_gcmTAG,@function
+.globl intel_aes_gcmTAG
+.align 16
+intel_aes_gcmTAG:
+
+.set Htbl, %rdi
+.set Tp, %rsi
+.set Mlen, %rdx
+.set Alen, %rcx
+.set X0, %r8
+.set TAG, %r9
+
+.set T,%xmm0
+.set TMP0,%xmm1
+
+ vmovdqu (Tp), T
+ vpshufb .Lbswap_mask(%rip), T, T # byte-reverse the running hash
+ vpxor TMP0, TMP0, TMP0
+ shl $3, Mlen # Mlen *= 8
+ shl $3, Alen # Alen *= 8
+ vpinsrq $0, Mlen, TMP0, TMP0 # low qword = Mlen*8
+ vpinsrq $1, Alen, TMP0, TMP0 # high qword = Alen*8
+ vpxor TMP0, T, T # fold the length block into the hash
+ vmovdqu (Htbl), TMP0 # multiplier = first table entry
+ call GFMUL # T = T * TMP0
+ vpshufb .Lbswap_mask(%rip), T, T # back to memory byte order
+ vpxor (X0), T, T
+ vmovdqu T, (TAG)
+
+ret
+.size intel_aes_gcmTAG, .-intel_aes_gcmTAG
+################################################################################
+# Generates the H table
+# void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
+#
+# In:  Htbl (%rdi) - 256-byte output table
+#      KS   (%rsi) - AES round keys, 16 bytes each, round i at offset 16*i
+#      NR   (%edx) - number of AES rounds; 10 and 12 are tested explicitly,
+#                    any other value takes the 14-round path
+# Out: Htbl[0..127]   - eight 16-byte values produced by repeated GFMUL with the
+#                       first entry (annotated below as H*2, H^2*2, ... H^8*2)
+#      Htbl[128..255] - for each of those entries, the entry XORed with a copy
+#                       of itself whose 64-bit halves are swapped
+# Clobbers: %rax, flags, %xmm0-%xmm5, %xmm8-%xmm15.
+.type intel_aes_gcmINIT,@function
+.globl intel_aes_gcmINIT
+.align 16
+intel_aes_gcmINIT:
+
+.set Htbl, %rdi
+.set KS, %rsi
+.set NR, %edx
+
+.set T,%xmm0
+.set TMP0,%xmm1
+
+# T starts as round key 0 and is run through the AES rounds, i.e. this encrypts
+# an all-zero block (0 XOR round key 0 == round key 0).
+CALCULATE_POWERS_OF_H:
+ vmovdqu 16*0(KS), T
+ vaesenc 16*1(KS), T, T
+ vaesenc 16*2(KS), T, T
+ vaesenc 16*3(KS), T, T
+ vaesenc 16*4(KS), T, T
+ vaesenc 16*5(KS), T, T
+ vaesenc 16*6(KS), T, T
+ vaesenc 16*7(KS), T, T
+ vaesenc 16*8(KS), T, T
+ vaesenc 16*9(KS), T, T
+ vmovdqu 16*10(KS), TMP0 # last round key if NR == 10
+ cmp $10, NR
+ je .LH0done
+ vaesenc 16*10(KS), T, T
+ vaesenc 16*11(KS), T, T
+ vmovdqu 16*12(KS), TMP0 # last round key if NR == 12
+ cmp $12, NR
+ je .LH0done
+ vaesenc 16*12(KS), T, T
+ vaesenc 16*13(KS), T, T
+ vmovdqu 16*14(KS), TMP0 # last round key otherwise
+
+.LH0done:
+ vaesenclast TMP0, T, T
+
+ vpshufb .Lbswap_mask(%rip), T, T # byte-reverse H
+
+ vmovdqu T, TMP0
+ # Calculate H` = GFMUL(H, 2)
+ # xmm3 byte 15 becomes the top bit of T (0 or 1) ...
+ vpsrld $7 , T , %xmm3
+ vmovdqu .Lshuff_mask(%rip), %xmm4
+ # ... which is copied into every byte of xmm3
+ vpshufb %xmm4, %xmm3 , %xmm3
+ # xmm4 = {byte0 = 0x00, byte1 = 0xff}; indexing it with xmm3 (0 or 1) yields
+ # an all-zero or all-ones mask depending on the top bit of T
+ movq $0xff00 , %rax
+ vmovq %rax, %xmm4
+ vpshufb %xmm3, %xmm4 , %xmm4
+ vmovdqu .Lpoly(%rip), %xmm5
+ vpand %xmm4, %xmm5, %xmm5 # xmm5 = .Lpoly if top bit of T was set, else 0
+ # 128-bit shift left by one: shift each dword, then carry each dword's top
+ # bit into the next higher dword (the top dword's carry is dropped here)
+ vpsrld $31, T, %xmm3
+ vpslld $1, T, %xmm4
+ vpslldq $4, %xmm3, %xmm3
+ vpxor %xmm3, %xmm4, T # T (xmm0) now holds p(x)<<1
+
+ #adding p(x)<<1 to xmm5
+ vpxor %xmm5, T , T
+ vmovdqu T, TMP0 # TMP0 stays the multiplier for every GFMUL below
+ vmovdqu T, (Htbl) # H * 2
+ call GFMUL
+ vmovdqu T, 16(Htbl) # H^2 * 2
+ call GFMUL
+ vmovdqu T, 32(Htbl) # H^3 * 2
+ call GFMUL
+ vmovdqu T, 48(Htbl) # H^4 * 2
+ call GFMUL
+ vmovdqu T, 64(Htbl) # H^5 * 2
+ call GFMUL
+ vmovdqu T, 80(Htbl) # H^6 * 2
+ call GFMUL
+ vmovdqu T, 96(Htbl) # H^7 * 2
+ call GFMUL
+ vmovdqu T, 112(Htbl) # H^8 * 2
+
+ # Precalculations for the reduce 4 step
+ # vpshufd $78 swaps the two 64-bit halves of each table entry
+ vpshufd $78, (Htbl), %xmm8
+ vpshufd $78, 16(Htbl), %xmm9
+ vpshufd $78, 32(Htbl), %xmm10
+ vpshufd $78, 48(Htbl), %xmm11
+ vpshufd $78, 64(Htbl), %xmm12
+ vpshufd $78, 80(Htbl), %xmm13
+ vpshufd $78, 96(Htbl), %xmm14
+ vpshufd $78, 112(Htbl), %xmm15
+
+ # XOR with the entry itself: both halves of the result equal (hi ^ lo)
+ vpxor (Htbl), %xmm8, %xmm8
+ vpxor 16(Htbl), %xmm9, %xmm9
+ vpxor 32(Htbl), %xmm10, %xmm10
+ vpxor 48(Htbl), %xmm11, %xmm11
+ vpxor 64(Htbl), %xmm12, %xmm12
+ vpxor 80(Htbl), %xmm13, %xmm13
+ vpxor 96(Htbl), %xmm14, %xmm14
+ vpxor 112(Htbl), %xmm15, %xmm15
+
+ # Store them in the second half of the table (entry i at 128 + 16*i)
+ vmovdqu %xmm8, 128(Htbl)
+ vmovdqu %xmm9, 144(Htbl)
+ vmovdqu %xmm10, 160(Htbl)
+ vmovdqu %xmm11, 176(Htbl)
+ vmovdqu %xmm12, 192(Htbl)
+ vmovdqu %xmm13, 208(Htbl)
+ vmovdqu %xmm14, 224(Htbl)
+ vmovdqu %xmm15, 240(Htbl)
+
+ ret
+.size intel_aes_gcmINIT, .-intel_aes_gcmINIT
+################################################################################
+# Authenticate only
+# void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
+#
+# Folds Alen bytes of AAD into the 16-byte running hash at Tp.
+#
+# In:  Htbl (%rdi) - 256-byte table as written by intel_aes_gcmINIT
+#      AAD  (%rsi) - input data
+#      Alen (%rdx) - input length in bytes; must be a multiple of 16 (the
+#                    prefix loop steps a byte offset by 16 and stops at exactly 0)
+#      Tp   (%rcx) - running hash, read and written back (left untouched when
+#                    Alen == 0)
+# Clobbers: %rsi, %rdx, flags, %xmm0-%xmm7, %xmm9 (%r11 is saved and restored).
+#
+# Internal state: between blocks the hash is kept unreduced as the pair
+# (Xhi, T); the real value is Xhi XOR fold(fold(T)), where fold is the
+# vpclmulqdq/vpalignr step with .Lpoly. The two folds are interleaved with the
+# multiplications of the next 8-block group and finished at .Ldone.
+
+.globl intel_aes_gcmAAD
+.type intel_aes_gcmAAD,@function
+.align 16
+intel_aes_gcmAAD:
+
+.set DATA, %xmm0
+.set T, %xmm1
+.set BSWAP_MASK, %xmm2
+.set TMP0, %xmm3
+.set TMP1, %xmm4
+.set TMP2, %xmm5
+.set TMP3, %xmm6
+.set TMP4, %xmm7
+.set Xhi, %xmm9
+
+.set Htbl, %rdi
+.set inp, %rsi
+.set len, %rdx
+.set Tp, %rcx
+
+.set hlp0, %r11
+
+# Multiply DATA by table entry i and accumulate the three Karatsuba partial
+# products: TMP0 += lo*lo, TMP1 += hi*hi, TMP2 += (hi^lo)*(precomputed entry i+8).
+# Clobbers TMP3.
+.macro KARATSUBA_AAD i
+ vpclmulqdq $0x00, 16*\i(Htbl), DATA, TMP3
+ vpxor TMP3, TMP0, TMP0
+ vpclmulqdq $0x11, 16*\i(Htbl), DATA, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpshufd $78, DATA, TMP3
+ vpxor DATA, TMP3, TMP3
+ vpclmulqdq $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
+ vpxor TMP3, TMP2, TMP2
+.endm
+
+ test len, len
+ jnz .LbeginAAD
+ ret
+
+.LbeginAAD:
+
+ push hlp0
+ vzeroupper
+
+ vmovdqa .Lbswap_mask(%rip), BSWAP_MASK
+
+ vpxor Xhi, Xhi, Xhi
+
+ vmovdqu (Tp),T
+ vpshufb BSWAP_MASK,T,T
+
+ # we hash 8 block each iteration, if the total amount of blocks is not a multiple of 8, we hash the first n%8 blocks first
+ mov len, hlp0
+ and $~-128, hlp0 # hlp0 = len mod 128 (~-128 == 127)
+
+ jnz .LprefixAAD
+
+ # No prefix blocks. The main loop applies the two deferred folding steps to
+ # T and only then XORs in Xhi, so the incoming (already reduced) hash must
+ # enter the loop in Xhi with T = 0. Leaving it in T would fold a value that
+ # is not an unreduced product and corrupt any non-zero incoming hash.
+ vmovdqa T, Xhi
+ vpxor T, T, T
+ jmp .Lmod_loop
+
+.LprefixAAD:
+ sub hlp0, len # len = bytes left for the 8-block loop
+ sub $16, hlp0 # hlp0 = table offset for the first prefix block
+
+ #hash first prefix block
+ vmovdqu (inp), DATA
+ vpshufb BSWAP_MASK, DATA, DATA
+ vpxor T, DATA, DATA # incoming hash is folded into the first block
+
+ vpclmulqdq $0x00, (Htbl, hlp0), DATA, TMP0
+ vpclmulqdq $0x11, (Htbl, hlp0), DATA, TMP1
+ vpshufd $78, DATA, TMP2
+ vpxor DATA, TMP2, TMP2
+ vpclmulqdq $0x00, 16*8(Htbl, hlp0), TMP2, TMP2
+
+ lea 16(inp), inp
+ test hlp0, hlp0
+ jnz .Lpre_loop
+ jmp .Lred1
+
+ #hash remaining prefix blocks (up to 7 total prefix blocks)
+.align 64
+.Lpre_loop:
+
+ sub $16, hlp0 # step down to the next lower table entry
+
+ vmovdqu (inp),DATA # next data block
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ vpclmulqdq $0x00, (Htbl,hlp0), DATA, TMP3
+ vpxor TMP3, TMP0, TMP0
+ vpclmulqdq $0x11, (Htbl,hlp0), DATA, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpshufd $78, DATA, TMP3
+ vpxor DATA, TMP3, TMP3
+ vpclmulqdq $0x00, 16*8(Htbl,hlp0), TMP3, TMP3
+ vpxor TMP3, TMP2, TMP2
+
+ test hlp0, hlp0
+
+ lea 16(inp), inp # lea leaves the flags from test intact
+
+ jnz .Lpre_loop
+
+.Lred1:
+ # Karatsuba fixup: combine the partial products into the pair (Xhi, T)
+ vpxor TMP0, TMP2, TMP2
+ vpxor TMP1, TMP2, TMP2
+ vpsrldq $8, TMP2, TMP3
+ vpslldq $8, TMP2, TMP2
+
+ vpxor TMP3, TMP1, Xhi
+ vpxor TMP2, TMP0, T
+
+.align 64
+.Lmod_loop:
+ sub $0x80, len
+ jb .Ldone # fewer than 128 bytes left (len was 0 here)
+
+ vmovdqu 16*7(inp),DATA # Ii
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ vpclmulqdq $0x00, (Htbl), DATA, TMP0
+ vpclmulqdq $0x11, (Htbl), DATA, TMP1
+ vpshufd $78, DATA, TMP2
+ vpxor DATA, TMP2, TMP2
+ vpclmulqdq $0x00, 16*8(Htbl), TMP2, TMP2
+ #########################################################
+ vmovdqu 16*6(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+ KARATSUBA_AAD 1
+ #########################################################
+ vmovdqu 16*5(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 1a
+ vpalignr $8, T, T, T
+
+ KARATSUBA_AAD 2
+
+ vpxor TMP4, T, T #reduction stage 1b
+ #########################################################
+ vmovdqu 16*4(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ KARATSUBA_AAD 3
+ #########################################################
+ vmovdqu 16*3(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ vpclmulqdq $0x10, .Lpoly(%rip), T, TMP4 #reduction stage 2a
+ vpalignr $8, T, T, T
+
+ KARATSUBA_AAD 4
+
+ vpxor TMP4, T, T #reduction stage 2b
+ #########################################################
+ vmovdqu 16*2(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ KARATSUBA_AAD 5
+
+ vpxor Xhi, T, T #reduction finalize
+ #########################################################
+ vmovdqu 16*1(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+
+ KARATSUBA_AAD 6
+ #########################################################
+ vmovdqu 16*0(inp),DATA
+ vpshufb BSWAP_MASK,DATA,DATA
+ vpxor T,DATA,DATA # previous hash joins the first block of this group
+
+ KARATSUBA_AAD 7
+ #########################################################
+ vpxor TMP0, TMP2, TMP2 # karatsuba fixup
+ vpxor TMP1, TMP2, TMP2
+ vpsrldq $8, TMP2, TMP3
+ vpslldq $8, TMP2, TMP2
+
+ vpxor TMP3, TMP1, Xhi
+ vpxor TMP2, TMP0, T
+
+ lea 16*8(inp), inp
+ jmp .Lmod_loop
+ #########################################################
+
+.Ldone:
+ # Finish the deferred reduction of (Xhi, T): two folds, then add Xhi
+ vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
+ vpalignr $8, T, T, T
+ vpxor TMP3, T, T
+
+ vpclmulqdq $0x10, .Lpoly(%rip), T, TMP3
+ vpalignr $8, T, T, T
+ vpxor TMP3, T, T
+
+ vpxor Xhi, T, T
+
+.Lsave:
+ vpshufb BSWAP_MASK,T, T # back to memory byte order
+ vmovdqu T,(Tp)
+ vzeroupper
+
+ pop hlp0
+ ret
+.size intel_aes_gcmAAD,.-intel_aes_gcmAAD
+
+################################################################################
+# Encrypt and Authenticate
+# void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx,uint64_t len);
+#
+# In:  PT   (%rdi) - input (plaintext)
+#      CT   (%rsi) - output (ciphertext)
+#      Gctx (%rdx) - context; fields accessed by fixed offset:
+#                      0   : H table (Htbl is an alias of Gctx), 256 bytes
+#                      272 : 16-byte running hash T (read and written back)
+#                      288 : 16-byte counter block CTR (read and written back)
+#                      304 : pointer KS; round keys start at 0(KS) and the
+#                            32-bit round count is read from 244(KS)
+#      len  (%rcx) - number of bytes; any value, returns immediately when 0
+# Gctx does not need to be 16-byte aligned: every table access uses an
+# unaligned-tolerant instruction.
+# Clobbers: %rdi, %rsi, %rcx, %r8, %r9, %r10, flags, %xmm0-%xmm15.
+# Stack: 128 bytes of 16-byte aligned scratch below the saved %rbp/%rbx.
+.type intel_aes_gcmENC,@function
+.globl intel_aes_gcmENC
+.align 16
+intel_aes_gcmENC:
+
+.set PT,%rdi
+.set CT,%rsi
+.set Htbl, %rdx
+.set len, %rcx
+.set KS,%r9
+.set NR,%r10d
+
+.set Gctx, %rdx
+
+.set T,%xmm0
+.set TMP0,%xmm1
+.set TMP1,%xmm2
+.set TMP2,%xmm3
+.set TMP3,%xmm4
+.set TMP4,%xmm5
+.set TMP5,%xmm6
+.set CTR0,%xmm7
+.set CTR1,%xmm8
+.set CTR2,%xmm9
+.set CTR3,%xmm10
+.set CTR4,%xmm11
+.set CTR5,%xmm12
+.set CTR6,%xmm13
+.set CTR7,%xmm14
+.set CTR,%xmm15
+
+# One AES round (round key i) on all eight counter blocks. Clobbers TMP3.
+.macro ROUND i
+ vmovdqu \i*16(KS), TMP3
+ vaesenc TMP3, CTR0, CTR0
+ vaesenc TMP3, CTR1, CTR1
+ vaesenc TMP3, CTR2, CTR2
+ vaesenc TMP3, CTR3, CTR3
+ vaesenc TMP3, CTR4, CTR4
+ vaesenc TMP3, CTR5, CTR5
+ vaesenc TMP3, CTR6, CTR6
+ vaesenc TMP3, CTR7, CTR7
+.endm
+
+# One AES round (round key i) on all eight counter blocks, interleaved with the
+# multiplication of the saved block at i*16(%rsp) by table entry i:
+# TMP0 += (hi^lo)*(entry i+8), TMP1 += hi*hi, TMP2 += lo*lo.
+# Clobbers TMP3, TMP4, TMP5.
+.macro ROUNDMUL i
+
+ vmovdqu \i*16(%rsp), TMP5
+ vmovdqu \i*16(KS), TMP3
+
+ vaesenc TMP3, CTR0, CTR0
+ vaesenc TMP3, CTR1, CTR1
+ vaesenc TMP3, CTR2, CTR2
+ vaesenc TMP3, CTR3, CTR3
+
+ vpshufd $78, TMP5, TMP4
+ vpxor TMP5, TMP4, TMP4
+
+ vaesenc TMP3, CTR4, CTR4
+ vaesenc TMP3, CTR5, CTR5
+ vaesenc TMP3, CTR6, CTR6
+ vaesenc TMP3, CTR7, CTR7
+
+ vpclmulqdq $0x00, 128+\i*16(Htbl), TMP4, TMP3
+ vpxor TMP3, TMP0, TMP0
+ # Unaligned load: Gctx is a void* with no alignment contract, and every
+ # other access to the table in this file tolerates an unaligned address.
+ vmovdqu \i*16(Htbl), TMP4
+ vpclmulqdq $0x11, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+
+.endm
+
+# Multiply the saved block at i*16(%rsp) by table entry i and accumulate into
+# TMP0/TMP1/TMP2 as in ROUNDMUL, without the AES work. Clobbers TMP3, TMP5.
+.macro KARATSUBA i
+ vmovdqu \i*16(%rsp), TMP5
+
+ vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP0, TMP0
+.endm
+
+ test len, len
+ jnz .Lbegin
+ ret
+
+.Lbegin:
+
+ vzeroupper
+ push %rbp
+ push %rbx
+
+ # Reserve 128 bytes and align %rsp down to 16 so vmovdqa can be used on it;
+ # %rbp keeps the original %rsp for the epilogue.
+ movq %rsp, %rbp
+ sub $128, %rsp
+ andq $-16, %rsp
+
+ vmovdqu 288(Gctx), CTR
+ vmovdqu 272(Gctx), T
+ mov 304(Gctx), KS
+# AESContext->Nr
+ mov 244(KS), NR
+
+ # Work on byte-reversed copies so vpaddd can step the counter
+ vpshufb .Lbswap_mask(%rip), CTR, CTR
+ vpshufb .Lbswap_mask(%rip), T, T
+
+ cmp $128, len
+ jb .LDataSingles
+
+# Encrypt the first eight blocks
+ sub $128, len
+ vmovdqa CTR, CTR0
+ vpaddd .Lone(%rip), CTR0, CTR1
+ vpaddd .Ltwo(%rip), CTR0, CTR2
+ vpaddd .Lone(%rip), CTR2, CTR3
+ vpaddd .Ltwo(%rip), CTR2, CTR4
+ vpaddd .Lone(%rip), CTR4, CTR5
+ vpaddd .Ltwo(%rip), CTR4, CTR6
+ vpaddd .Lone(%rip), CTR6, CTR7
+ vpaddd .Ltwo(%rip), CTR6, CTR # CTR = counter for the next group (+8)
+
+ vpshufb .Lbswap_mask(%rip), CTR0, CTR0
+ vpshufb .Lbswap_mask(%rip), CTR1, CTR1
+ vpshufb .Lbswap_mask(%rip), CTR2, CTR2
+ vpshufb .Lbswap_mask(%rip), CTR3, CTR3
+ vpshufb .Lbswap_mask(%rip), CTR4, CTR4
+ vpshufb .Lbswap_mask(%rip), CTR5, CTR5
+ vpshufb .Lbswap_mask(%rip), CTR6, CTR6
+ vpshufb .Lbswap_mask(%rip), CTR7, CTR7
+
+ # Round 0: XOR with the first round key
+ vpxor (KS), CTR0, CTR0
+ vpxor (KS), CTR1, CTR1
+ vpxor (KS), CTR2, CTR2
+ vpxor (KS), CTR3, CTR3
+ vpxor (KS), CTR4, CTR4
+ vpxor (KS), CTR5, CTR5
+ vpxor (KS), CTR6, CTR6
+ vpxor (KS), CTR7, CTR7
+
+ ROUND 1
+ ROUND 2
+ ROUND 3
+ ROUND 4
+ ROUND 5
+ ROUND 6
+ ROUND 7
+ ROUND 8
+ ROUND 9
+
+ vmovdqu 160(KS), TMP5 # last round key when NR < 12
+ cmp $12, NR
+ jb .LLast1
+
+ ROUND 10
+ ROUND 11
+
+ vmovdqu 192(KS), TMP5 # last round key when NR < 14
+ cmp $14, NR
+ jb .LLast1
+
+ ROUND 12
+ ROUND 13
+
+ vmovdqu 224(KS), TMP5 # last round key otherwise
+
+.LLast1:
+
+ # Fold the plaintext into the last round key so vaesenclast produces the
+ # ciphertext directly.
+ vpxor (PT), TMP5, TMP3
+ vaesenclast TMP3, CTR0, CTR0
+ vpxor 16(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR1, CTR1
+ vpxor 32(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR2, CTR2
+ vpxor 48(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR3, CTR3
+ vpxor 64(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR4, CTR4
+ vpxor 80(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR5, CTR5
+ vpxor 96(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR6, CTR6
+ vpxor 112(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR7, CTR7
+
+ # TMP3 = byte-swap mask; it must stay live until the top of .LDataOctets
+ vmovdqu .Lbswap_mask(%rip), TMP3
+
+ # Store the ciphertext, then keep a byte-reversed copy in CTR0..CTR7 to be
+ # hashed during the next group (or at .LEndOctets).
+ vmovdqu CTR0, (CT)
+ vpshufb TMP3, CTR0, CTR0
+ vmovdqu CTR1, 16(CT)
+ vpshufb TMP3, CTR1, CTR1
+ vmovdqu CTR2, 32(CT)
+ vpshufb TMP3, CTR2, CTR2
+ vmovdqu CTR3, 48(CT)
+ vpshufb TMP3, CTR3, CTR3
+ vmovdqu CTR4, 64(CT)
+ vpshufb TMP3, CTR4, CTR4
+ vmovdqu CTR5, 80(CT)
+ vpshufb TMP3, CTR5, CTR5
+ vmovdqu CTR6, 96(CT)
+ vpshufb TMP3, CTR6, CTR6
+ vmovdqu CTR7, 112(CT)
+ vpshufb TMP3, CTR7, CTR7
+
+ lea 128(CT), CT
+ lea 128(PT), PT
+ jmp .LDataOctets
+
+# Encrypt 8 blocks each time while hashing previous 8 blocks
+.align 64
+.LDataOctets:
+ cmp $128, len
+ jb .LEndOctets
+ sub $128, len
+
+ # Park the previous group's byte-reversed ciphertext: block 7 in TMP5,
+ # blocks 6..0 on the stack at 1*16 .. 7*16.
+ vmovdqa CTR7, TMP5
+ vmovdqa CTR6, 1*16(%rsp)
+ vmovdqa CTR5, 2*16(%rsp)
+ vmovdqa CTR4, 3*16(%rsp)
+ vmovdqa CTR3, 4*16(%rsp)
+ vmovdqa CTR2, 5*16(%rsp)
+ vmovdqa CTR1, 6*16(%rsp)
+ vmovdqa CTR0, 7*16(%rsp)
+
+ vmovdqa CTR, CTR0
+ vpaddd .Lone(%rip), CTR0, CTR1
+ vpaddd .Ltwo(%rip), CTR0, CTR2
+ vpaddd .Lone(%rip), CTR2, CTR3
+ vpaddd .Ltwo(%rip), CTR2, CTR4
+ vpaddd .Lone(%rip), CTR4, CTR5
+ vpaddd .Ltwo(%rip), CTR4, CTR6
+ vpaddd .Lone(%rip), CTR6, CTR7
+ vpaddd .Ltwo(%rip), CTR6, CTR
+
+ # TMP3 still holds the byte-swap mask here; TMP4 = round key 0
+ vmovdqu (KS), TMP4
+ vpshufb TMP3, CTR0, CTR0
+ vpxor TMP4, CTR0, CTR0
+ vpshufb TMP3, CTR1, CTR1
+ vpxor TMP4, CTR1, CTR1
+ vpshufb TMP3, CTR2, CTR2
+ vpxor TMP4, CTR2, CTR2
+ vpshufb TMP3, CTR3, CTR3
+ vpxor TMP4, CTR3, CTR3
+ vpshufb TMP3, CTR4, CTR4
+ vpxor TMP4, CTR4, CTR4
+ vpshufb TMP3, CTR5, CTR5
+ vpxor TMP4, CTR5, CTR5
+ vpshufb TMP3, CTR6, CTR6
+ vpxor TMP4, CTR6, CTR6
+ vpshufb TMP3, CTR7, CTR7
+ vpxor TMP4, CTR7, CTR7
+
+ # Start the hash of the previous group: block 7 times table entry 0
+ vmovdqu 16*0(Htbl), TMP3
+ vpclmulqdq $0x11, TMP3, TMP5, TMP1
+ vpclmulqdq $0x00, TMP3, TMP5, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vmovdqu 128+0*16(Htbl), TMP3
+ vpclmulqdq $0x00, TMP3, TMP5, TMP0
+
+ ROUNDMUL 1
+
+ ROUNDMUL 2
+
+ ROUNDMUL 3
+
+ ROUNDMUL 4
+
+ ROUNDMUL 5
+
+ ROUNDMUL 6
+
+ # Round 7, with the running hash T folded into the oldest block (block 0)
+ vpxor 7*16(%rsp), T, TMP5
+ vmovdqu 7*16(KS), TMP3
+
+ vaesenc TMP3, CTR0, CTR0
+ vaesenc TMP3, CTR1, CTR1
+ vaesenc TMP3, CTR2, CTR2
+ vaesenc TMP3, CTR3, CTR3
+
+ vpshufd $78, TMP5, TMP4
+ vpxor TMP5, TMP4, TMP4
+
+ vaesenc TMP3, CTR4, CTR4
+ vaesenc TMP3, CTR5, CTR5
+ vaesenc TMP3, CTR6, CTR6
+ vaesenc TMP3, CTR7, CTR7
+
+ vpclmulqdq $0x11, 7*16(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, 7*16(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+ vpclmulqdq $0x00, 128+7*16(Htbl), TMP4, TMP3
+ vpxor TMP3, TMP0, TMP0
+
+ ROUND 8
+ vmovdqa .Lpoly(%rip), TMP5
+
+ # Karatsuba fixup: TMP4 = high half of the product, T = low half
+ vpxor TMP1, TMP0, TMP0
+ vpxor TMP2, TMP0, TMP0
+ vpsrldq $8, TMP0, TMP3
+ vpxor TMP3, TMP1, TMP4
+ vpslldq $8, TMP0, TMP3
+ vpxor TMP3, TMP2, T
+
+ # First folding step of the reduction
+ vpclmulqdq $0x10, TMP5, T, TMP1
+ vpalignr $8, T, T, T
+ vpxor T, TMP1, T
+
+ ROUND 9
+
+ # Second folding step; TMP4 is added after the stores below
+ vpclmulqdq $0x10, TMP5, T, TMP1
+ vpalignr $8, T, T, T
+ vpxor T, TMP1, T
+
+ vmovdqu 160(KS), TMP5 # last round key when NR <= 10
+ cmp $10, NR
+ jbe .LLast2
+
+ ROUND 10
+ ROUND 11
+
+ vmovdqu 192(KS), TMP5 # last round key when NR <= 12
+ cmp $12, NR
+ jbe .LLast2
+
+ ROUND 12
+ ROUND 13
+
+ vmovdqu 224(KS), TMP5 # last round key otherwise
+
+.LLast2:
+
+ vpxor (PT), TMP5, TMP3
+ vaesenclast TMP3, CTR0, CTR0
+ vpxor 16(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR1, CTR1
+ vpxor 32(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR2, CTR2
+ vpxor 48(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR3, CTR3
+ vpxor 64(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR4, CTR4
+ vpxor 80(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR5, CTR5
+ vpxor 96(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR6, CTR6
+ vpxor 112(PT), TMP5, TMP3
+ vaesenclast TMP3, CTR7, CTR7
+
+ # Reload the byte-swap mask; it is needed again at the top of the loop
+ vmovdqu .Lbswap_mask(%rip), TMP3
+
+ vmovdqu CTR0, (CT)
+ vpshufb TMP3, CTR0, CTR0
+ vmovdqu CTR1, 16(CT)
+ vpshufb TMP3, CTR1, CTR1
+ vmovdqu CTR2, 32(CT)
+ vpshufb TMP3, CTR2, CTR2
+ vmovdqu CTR3, 48(CT)
+ vpshufb TMP3, CTR3, CTR3
+ vmovdqu CTR4, 64(CT)
+ vpshufb TMP3, CTR4, CTR4
+ vmovdqu CTR5, 80(CT)
+ vpshufb TMP3, CTR5, CTR5
+ vmovdqu CTR6, 96(CT)
+ vpshufb TMP3, CTR6, CTR6
+ vmovdqu CTR7,112(CT)
+ vpshufb TMP3, CTR7, CTR7
+
+ vpxor TMP4, T, T # finish the reduction: add the high half
+
+ lea 128(CT), CT
+ lea 128(PT), PT
+ jmp .LDataOctets
+
+# Hash the last group of eight ciphertext blocks (no further encryption here)
+.LEndOctets:
+
+ vmovdqa CTR7, TMP5
+ vmovdqa CTR6, 1*16(%rsp)
+ vmovdqa CTR5, 2*16(%rsp)
+ vmovdqa CTR4, 3*16(%rsp)
+ vmovdqa CTR3, 4*16(%rsp)
+ vmovdqa CTR2, 5*16(%rsp)
+ vmovdqa CTR1, 6*16(%rsp)
+ vmovdqa CTR0, 7*16(%rsp)
+
+ vmovdqu 16*0(Htbl), TMP3
+ vpclmulqdq $0x11, TMP3, TMP5, TMP1
+ vpclmulqdq $0x00, TMP3, TMP5, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vmovdqu 128+0*16(Htbl), TMP3
+ vpclmulqdq $0x00, TMP3, TMP5, TMP0
+
+ KARATSUBA 1
+ KARATSUBA 2
+ KARATSUBA 3
+ KARATSUBA 4
+ KARATSUBA 5
+ KARATSUBA 6
+
+ # Oldest block (block 0) gets the running hash folded in
+ vmovdqu 7*16(%rsp), TMP5
+ vpxor T, TMP5, TMP5
+ vmovdqu 16*7(Htbl), TMP4
+ vpclmulqdq $0x11, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vmovdqu 128+7*16(Htbl), TMP4
+ vpclmulqdq $0x00, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP0, TMP0
+
+ vpxor TMP1, TMP0, TMP0
+ vpxor TMP2, TMP0, TMP0
+
+ vpsrldq $8, TMP0, TMP3
+ vpxor TMP3, TMP1, TMP4
+ vpslldq $8, TMP0, TMP3
+ vpxor TMP3, TMP2, T
+
+ vmovdqa .Lpoly(%rip), TMP2
+
+ vpalignr $8, T, T, TMP1
+ vpclmulqdq $0x10, TMP2, T, T
+ vpxor T, TMP1, T
+
+ vpalignr $8, T, T, TMP1
+ vpclmulqdq $0x10, TMP2, T, T
+ vpxor T, TMP1, T
+
+ vpxor TMP4, T, T
+
+#Here we encrypt any remaining whole block
+.LDataSingles:
+
+ cmp $16, len
+ jb .LDataTail
+ sub $16, len
+
+ vpshufb .Lbswap_mask(%rip), CTR, TMP1
+ vpaddd .Lone(%rip), CTR, CTR
+
+ vpxor (KS), TMP1, TMP1
+ vaesenc 16*1(KS), TMP1, TMP1
+ vaesenc 16*2(KS), TMP1, TMP1
+ vaesenc 16*3(KS), TMP1, TMP1
+ vaesenc 16*4(KS), TMP1, TMP1
+ vaesenc 16*5(KS), TMP1, TMP1
+ vaesenc 16*6(KS), TMP1, TMP1
+ vaesenc 16*7(KS), TMP1, TMP1
+ vaesenc 16*8(KS), TMP1, TMP1
+ vaesenc 16*9(KS), TMP1, TMP1
+ vmovdqu 16*10(KS), TMP2
+ cmp $10, NR
+ je .LLast3
+ vaesenc 16*10(KS), TMP1, TMP1
+ vaesenc 16*11(KS), TMP1, TMP1
+ vmovdqu 16*12(KS), TMP2
+ cmp $12, NR
+ je .LLast3
+ vaesenc 16*12(KS), TMP1, TMP1
+ vaesenc 16*13(KS), TMP1, TMP1
+ vmovdqu 16*14(KS), TMP2
+
+.LLast3:
+ vaesenclast TMP2, TMP1, TMP1
+
+ vpxor (PT), TMP1, TMP1
+ vmovdqu TMP1, (CT)
+ addq $16, CT
+ addq $16, PT
+
+ # Hash the ciphertext block just written: T = (T ^ block) * Htbl[0]
+ vpshufb .Lbswap_mask(%rip), TMP1, TMP1
+ vpxor TMP1, T, T
+ vmovdqu (Htbl), TMP0
+ call GFMUL
+
+ jmp .LDataSingles
+
+#Here we encrypt the final partial block, if there is one
+.LDataTail:
+
+ test len, len
+ jz DATA_END
+# First prepare the counter block
+ vpshufb .Lbswap_mask(%rip), CTR, TMP1
+ vpaddd .Lone(%rip), CTR, CTR
+
+ vpxor (KS), TMP1, TMP1
+ vaesenc 16*1(KS), TMP1, TMP1
+ vaesenc 16*2(KS), TMP1, TMP1
+ vaesenc 16*3(KS), TMP1, TMP1
+ vaesenc 16*4(KS), TMP1, TMP1
+ vaesenc 16*5(KS), TMP1, TMP1
+ vaesenc 16*6(KS), TMP1, TMP1
+ vaesenc 16*7(KS), TMP1, TMP1
+ vaesenc 16*8(KS), TMP1, TMP1
+ vaesenc 16*9(KS), TMP1, TMP1
+ vmovdqu 16*10(KS), TMP2
+ cmp $10, NR
+ je .LLast4
+ vaesenc 16*10(KS), TMP1, TMP1
+ vaesenc 16*11(KS), TMP1, TMP1
+ vmovdqu 16*12(KS), TMP2
+ cmp $12, NR
+ je .LLast4
+ vaesenc 16*12(KS), TMP1, TMP1
+ vaesenc 16*13(KS), TMP1, TMP1
+ vmovdqu 16*14(KS), TMP2
+
+.LLast4:
+ vaesenclast TMP2, TMP1, TMP1
+#Zero a temp location
+ vpxor TMP2, TMP2, TMP2
+ vmovdqa TMP2, (%rsp)
+
+# Copy the required bytes only (could probably use rep movsb)
+# From here on KS (%r9) is reused as a byte index; the key schedule pointer is
+# not needed again.
+ xor KS, KS
+.LEncCpy:
+ cmp KS, len
+ je .LEncCpyEnd
+ movb (PT, KS, 1), %r8b
+ movb %r8b, (%rsp, KS, 1)
+ inc KS
+ jmp .LEncCpy
+.LEncCpyEnd:
+# Xor with the counter block
+ vpxor (%rsp), TMP1, TMP0
+# Again, store at temp location
+ vmovdqa TMP0, (%rsp)
+# Copy only the required bytes to CT, and zero the rest for the hash
+ xor KS, KS
+.LEncCpy2:
+ cmp KS, len
+ je .LEncCpy3
+ movb (%rsp, KS, 1), %r8b
+ movb %r8b, (CT, KS, 1)
+ inc KS
+ jmp .LEncCpy2
+.LEncCpy3:
+ cmp $16, KS
+ je .LEndCpy3
+ movb $0, (%rsp, KS, 1)
+ inc KS
+ jmp .LEncCpy3
+.LEndCpy3:
+ vmovdqa (%rsp), TMP0 # zero-padded ciphertext tail
+
+ vpshufb .Lbswap_mask(%rip), TMP0, TMP0
+ vpxor TMP0, T, T
+ vmovdqu (Htbl), TMP0
+ call GFMUL
+
+DATA_END:
+
+ # Write the hash and counter back in memory byte order
+ vpshufb .Lbswap_mask(%rip), T, T
+ vpshufb .Lbswap_mask(%rip), CTR, CTR
+ vmovdqu T, 272(Gctx)
+ vmovdqu CTR, 288(Gctx)
+
+ movq %rbp, %rsp
+
+ popq %rbx
+ popq %rbp
+ ret
+ .size intel_aes_gcmENC, .-intel_aes_gcmENC
+
+#########################
+# Decrypt and Authenticate
+# void intel_aes_gcmDEC(uint8_t* CT, uint8_t* PT, void *Gctx,uint64_t len);
+# (argument names follow the register assignments below: the first argument,
+#  %rdi, is the ciphertext input and the second, %rsi, is the plaintext output)
+#
+# Gctx fields accessed by fixed offset:
+#   0   : H table (Htbl is an alias of Gctx)
+#   272 : 16-byte running hash T (read and written back)
+#   288 : 16-byte counter block CTR (read and written back)
+#   304 : pointer KS; round keys start at 0(KS), round count read from 244(KS)
+# Returns immediately when len == 0.
+# Clobbers: %rdi, %rsi, %rcx, %r8, %r9, %r10, flags, %xmm0-%xmm15.
+# Uses the ROUND macro defined in the intel_aes_gcmENC section above.
+.type intel_aes_gcmDEC,@function
+.globl intel_aes_gcmDEC
+.align 16
+intel_aes_gcmDEC:
+# parameter 1: CT # input
+# parameter 2: PT # output
+# parameter 3: %rdx # Gctx
+# parameter 4: %rcx # len
+
+# Multiply ciphertext block (7-i) of the current group by table entry i and
+# accumulate: TMP1 += hi*hi, TMP2 += lo*lo, TMP0 += (hi^lo)*(entry i+8).
+# Clobbers TMP3, TMP5.
+.macro DEC_KARATSUBA i
+ vmovdqu (7-\i)*16(CT), TMP5
+ vpshufb .Lbswap_mask(%rip), TMP5, TMP5
+
+ vpclmulqdq $0x11, 16*\i(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, 16*\i(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vpclmulqdq $0x00, 128+\i*16(Htbl), TMP5, TMP3
+ vpxor TMP3, TMP0, TMP0
+.endm
+
+.set PT,%rsi
+.set CT,%rdi
+.set Htbl, %rdx
+.set len, %rcx
+.set KS,%r9
+.set NR,%r10d
+
+.set Gctx, %rdx
+
+.set T,%xmm0
+.set TMP0,%xmm1
+.set TMP1,%xmm2
+.set TMP2,%xmm3
+.set TMP3,%xmm4
+.set TMP4,%xmm5
+.set TMP5,%xmm6
+.set CTR0,%xmm7
+.set CTR1,%xmm8
+.set CTR2,%xmm9
+.set CTR3,%xmm10
+.set CTR4,%xmm11
+.set CTR5,%xmm12
+.set CTR6,%xmm13
+.set CTR7,%xmm14
+.set CTR,%xmm15
+
+ test len, len
+ jnz .LbeginDec
+ ret
+
+.LbeginDec:
+
+ pushq %rbp
+ pushq %rbx
+ # Reserve 128 bytes and align %rsp down to 16; %rbp keeps the original %rsp
+ movq %rsp, %rbp
+ sub $128, %rsp
+ andq $-16, %rsp
+ vmovdqu 288(Gctx), CTR
+ vmovdqu 272(Gctx), T
+ mov 304(Gctx), KS
+# AESContext->Nr
+ mov 244(KS), NR
+
+ vpshufb .Lbswap_mask(%rip), CTR, CTR
+ vpshufb .Lbswap_mask(%rip), T, T
+
+ # TMP3 = byte-swap mask; it must be live at the top of .LDECOctets
+ vmovdqu .Lbswap_mask(%rip), TMP3
+ jmp .LDECOctets
+
+# Decrypt 8 blocks each time while hashing them at the same time
+.align 64
+.LDECOctets:
+
+ cmp $128, len
+ jb .LDECSingles
+ sub $128, len
+
+ vmovdqa CTR, CTR0
+ vpaddd .Lone(%rip), CTR0, CTR1
+ vpaddd .Ltwo(%rip), CTR0, CTR2
+ vpaddd .Lone(%rip), CTR2, CTR3
+ vpaddd .Ltwo(%rip), CTR2, CTR4
+ vpaddd .Lone(%rip), CTR4, CTR5
+ vpaddd .Ltwo(%rip), CTR4, CTR6
+ vpaddd .Lone(%rip), CTR6, CTR7
+ vpaddd .Ltwo(%rip), CTR6, CTR # CTR = counter for the next group (+8)
+
+ vpshufb TMP3, CTR0, CTR0
+ vpshufb TMP3, CTR1, CTR1
+ vpshufb TMP3, CTR2, CTR2
+ vpshufb TMP3, CTR3, CTR3
+ vpshufb TMP3, CTR4, CTR4
+ vpshufb TMP3, CTR5, CTR5
+ vpshufb TMP3, CTR6, CTR6
+ vpshufb TMP3, CTR7, CTR7
+
+ # Round 0: XOR with the first round key
+ vmovdqu (KS), TMP3
+ vpxor TMP3, CTR0, CTR0
+ vpxor TMP3, CTR1, CTR1
+ vpxor TMP3, CTR2, CTR2
+ vpxor TMP3, CTR3, CTR3
+ vpxor TMP3, CTR4, CTR4
+ vpxor TMP3, CTR5, CTR5
+ vpxor TMP3, CTR6, CTR6
+ vpxor TMP3, CTR7, CTR7
+
+ # Start the hash: ciphertext block 7 times table entry 0
+ vmovdqu 7*16(CT), TMP5
+ vpshufb .Lbswap_mask(%rip), TMP5, TMP5
+ vmovdqu 16*0(Htbl), TMP3
+ vpclmulqdq $0x11, TMP3, TMP5, TMP1
+ vpclmulqdq $0x00, TMP3, TMP5, TMP2
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vmovdqu 128+0*16(Htbl), TMP3
+ vpclmulqdq $0x00, TMP3, TMP5, TMP0
+
+ ROUND 1
+ DEC_KARATSUBA 1
+
+ ROUND 2
+ DEC_KARATSUBA 2
+
+ ROUND 3
+ DEC_KARATSUBA 3
+
+ ROUND 4
+ DEC_KARATSUBA 4
+
+ ROUND 5
+ DEC_KARATSUBA 5
+
+ ROUND 6
+ DEC_KARATSUBA 6
+
+ ROUND 7
+
+ # Ciphertext block 0 gets the running hash folded in, times table entry 7
+ vmovdqu 0*16(CT), TMP5
+ vpshufb .Lbswap_mask(%rip), TMP5, TMP5
+ vpxor T, TMP5, TMP5
+ vmovdqu 16*7(Htbl), TMP4
+
+ vpclmulqdq $0x11, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP1, TMP1
+ vpclmulqdq $0x00, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP2, TMP2
+
+ vpshufd $78, TMP5, TMP3
+ vpxor TMP5, TMP3, TMP5
+ vmovdqu 128+7*16(Htbl), TMP4
+
+ vpclmulqdq $0x00, TMP4, TMP5, TMP3
+ vpxor TMP3, TMP0, TMP0
+
+ ROUND 8
+
+ # Karatsuba fixup: TMP4 = high half of the product, T = low half
+ vpxor TMP1, TMP0, TMP0
+ vpxor TMP2, TMP0, TMP0
+
+ vpsrldq $8, TMP0, TMP3
+ vpxor TMP3, TMP1, TMP4
+ vpslldq $8, TMP0, TMP3
+ vpxor TMP3, TMP2, T
+ vmovdqa .Lpoly(%rip), TMP2
+
+ # First folding step of the reduction
+ vpalignr $8, T, T, TMP1
+ vpclmulqdq $0x10, TMP2, T, T
+ vpxor T, TMP1, T
+
+ ROUND 9
+
+ # Second folding step; TMP4 is added after the stores below
+ vpalignr $8, T, T, TMP1
+ vpclmulqdq $0x10, TMP2, T, T
+ vpxor T, TMP1, T
+
+ vmovdqu 160(KS), TMP5 # last round key when NR <= 10
+ cmp $10, NR
+
+ jbe .LDECLast1
+
+ ROUND 10
+ ROUND 11
+
+ vmovdqu 192(KS), TMP5 # last round key when NR <= 12
+ cmp $12, NR
+
+ jbe .LDECLast1
+
+ ROUND 12
+ ROUND 13
+
+ vmovdqu 224(KS), TMP5 # last round key otherwise
+
+.LDECLast1:
+
+ # Fold the ciphertext into the last round key so vaesenclast produces the
+ # plaintext directly. All eight input blocks are read before any output
+ # block is written.
+ vpxor (CT), TMP5, TMP3
+ vaesenclast TMP3, CTR0, CTR0
+ vpxor 16(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR1, CTR1
+ vpxor 32(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR2, CTR2
+ vpxor 48(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR3, CTR3
+ vpxor 64(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR4, CTR4
+ vpxor 80(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR5, CTR5
+ vpxor 96(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR6, CTR6
+ vpxor 112(CT), TMP5, TMP3
+ vaesenclast TMP3, CTR7, CTR7
+
+ # Reload the byte-swap mask; it is needed again at the top of the loop
+ vmovdqu .Lbswap_mask(%rip), TMP3
+
+ vmovdqu CTR0, (PT)
+ vmovdqu CTR1, 16(PT)
+ vmovdqu CTR2, 32(PT)
+ vmovdqu CTR3, 48(PT)
+ vmovdqu CTR4, 64(PT)
+ vmovdqu CTR5, 80(PT)
+ vmovdqu CTR6, 96(PT)
+ vmovdqu CTR7,112(PT)
+
+ vpxor TMP4, T, T # finish the reduction: add the high half
+
+ lea 128(CT), CT
+ lea 128(PT), PT
+ jmp .LDECOctets
+
+#Here we decrypt and hash any remaining whole block
+.LDECSingles:
+
+ cmp $16, len
+ jb .LDECTail
+ sub $16, len
+
+ # Hash the ciphertext block first: T = (T ^ block) * Htbl[0]
+ vmovdqu (CT), TMP1
+ vpshufb .Lbswap_mask(%rip), TMP1, TMP1
+ vpxor TMP1, T, T
+ vmovdqu (Htbl), TMP0
+ call GFMUL
+
+
+ vpshufb .Lbswap_mask(%rip), CTR, TMP1
+ vpaddd .Lone(%rip), CTR, CTR
+
+ vpxor (KS), TMP1, TMP1
+ vaesenc 16*1(KS), TMP1, TMP1
+ vaesenc 16*2(KS), TMP1, TMP1
+ vaesenc 16*3(KS), TMP1, TMP1
+ vaesenc 16*4(KS), TMP1, TMP1
+ vaesenc 16*5(KS), TMP1, TMP1
+ vaesenc 16*6(KS), TMP1, TMP1
+ vaesenc 16*7(KS), TMP1, TMP1
+ vaesenc 16*8(KS), TMP1, TMP1
+ vaesenc 16*9(KS), TMP1, TMP1
+ vmovdqu 16*10(KS), TMP2
+ cmp $10, NR
+ je .LDECLast2
+ vaesenc 16*10(KS), TMP1, TMP1
+ vaesenc 16*11(KS), TMP1, TMP1
+ vmovdqu 16*12(KS), TMP2
+ cmp $12, NR
+ je .LDECLast2
+ vaesenc 16*12(KS), TMP1, TMP1
+ vaesenc 16*13(KS), TMP1, TMP1
+ vmovdqu 16*14(KS), TMP2
+.LDECLast2:
+ vaesenclast TMP2, TMP1, TMP1
+
+ vpxor (CT), TMP1, TMP1
+ vmovdqu TMP1, (PT)
+ addq $16, CT
+ addq $16, PT
+ jmp .LDECSingles
+
+#Here we decrypt the final partial block, if there is one
+.LDECTail:
+ test len, len
+ jz .LDEC_END
+
+ vpshufb .Lbswap_mask(%rip), CTR, TMP1
+ vpaddd .Lone(%rip), CTR, CTR
+
+ vpxor (KS), TMP1, TMP1
+ vaesenc 16*1(KS), TMP1, TMP1
+ vaesenc 16*2(KS), TMP1, TMP1
+ vaesenc 16*3(KS), TMP1, TMP1
+ vaesenc 16*4(KS), TMP1, TMP1
+ vaesenc 16*5(KS), TMP1, TMP1
+ vaesenc 16*6(KS), TMP1, TMP1
+ vaesenc 16*7(KS), TMP1, TMP1
+ vaesenc 16*8(KS), TMP1, TMP1
+ vaesenc 16*9(KS), TMP1, TMP1
+ vmovdqu 16*10(KS), TMP2
+ cmp $10, NR
+ je .LDECLast3
+ vaesenc 16*10(KS), TMP1, TMP1
+ vaesenc 16*11(KS), TMP1, TMP1
+ vmovdqu 16*12(KS), TMP2
+ cmp $12, NR
+ je .LDECLast3
+ vaesenc 16*12(KS), TMP1, TMP1
+ vaesenc 16*13(KS), TMP1, TMP1
+ vmovdqu 16*14(KS), TMP2
+
+.LDECLast3:
+ vaesenclast TMP2, TMP1, TMP1
+
+ # Zero a 16-byte temp location on the stack
+ vpxor TMP2, TMP2, TMP2
+ vmovdqa TMP2, (%rsp)
+# Copy the required bytes only (could probably use rep movsb)
+# From here on KS (%r9) is reused as a byte index; the key schedule pointer is
+# not needed again.
+ xor KS, KS
+.LDecCpy:
+ cmp KS, len
+ je .LDecCpy2
+ movb (CT, KS, 1), %r8b
+ movb %r8b, (%rsp, KS, 1)
+ inc KS
+ jmp .LDecCpy
+.LDecCpy2:
+ # Zero bytes len..15 of the temp copy (they are already zero from above)
+ cmp $16, KS
+ je .LDecCpyEnd
+ movb $0, (%rsp, KS, 1)
+ inc KS
+ jmp .LDecCpy2
+.LDecCpyEnd:
+# Xor with the counter block
+ vmovdqa (%rsp), TMP0 # TMP0 = zero-padded ciphertext tail, hashed below
+ vpxor TMP0, TMP1, TMP1
+# Again, store at temp location
+ vmovdqa TMP1, (%rsp)
+# Copy only the required bytes to PT (the hash below uses the padded
+# ciphertext kept in TMP0, not this buffer)
+ xor KS, KS
+.LDecCpy3:
+ cmp KS, len
+ je .LDecCpyEnd3
+ movb (%rsp, KS, 1), %r8b
+ movb %r8b, (PT, KS, 1)
+ inc KS
+ jmp .LDecCpy3
+.LDecCpyEnd3:
+ vpshufb .Lbswap_mask(%rip), TMP0, TMP0
+ vpxor TMP0, T, T
+ vmovdqu (Htbl), TMP0
+ call GFMUL
+.LDEC_END:
+
+ # Write the hash and counter back in memory byte order
+ vpshufb .Lbswap_mask(%rip), T, T
+ vpshufb .Lbswap_mask(%rip), CTR, CTR
+ vmovdqu T, 272(Gctx)
+ vmovdqu CTR, 288(Gctx)
+
+ movq %rbp, %rsp
+
+ popq %rbx
+ popq %rbp
+ ret
+ .size intel_aes_gcmDEC, .-intel_aes_gcmDEC
+#########################
+# a = T
+# b = TMP0 - remains unchanged
+# res = T
+# uses also TMP1,TMP2,TMP3,TMP4
+# __m128i GFMUL(__m128i A, __m128i B);
+#
+# Register-based helper: operands and result are passed in the .set aliases
+# rather than through memory. The aliases in effect at this point are the ones
+# assigned last above: T = %xmm0, TMP0 = %xmm1, TMP1..TMP4 = %xmm2..%xmm5.
+# No general-purpose register, flag or stack slot is touched.
+.type GFMUL,@function
+.globl GFMUL
+GFMUL:
+ vpclmulqdq $0x00, TMP0, T, TMP1 # TMP1 = lo(T) * lo(TMP0)
+ vpclmulqdq $0x11, TMP0, T, TMP4 # TMP4 = hi(T) * hi(TMP0)
+
+ # Karatsuba middle term: (hi^lo of T) * (hi^lo of TMP0)
+ vpshufd $78, T, TMP2
+ vpshufd $78, TMP0, TMP3
+ vpxor T, TMP2, TMP2
+ vpxor TMP0, TMP3, TMP3
+
+ vpclmulqdq $0x00, TMP3, TMP2, TMP2
+ vpxor TMP1, TMP2, TMP2
+ vpxor TMP4, TMP2, TMP2
+
+ # Split the middle term across the two halves of the 256-bit product:
+ # TMP1 = low 128 bits, TMP4 = high 128 bits
+ vpslldq $8, TMP2, TMP3
+ vpsrldq $8, TMP2, TMP2
+
+ vpxor TMP3, TMP1, TMP1
+ vpxor TMP2, TMP4, TMP4
+
+ # Reduction: fold the low half twice with .Lpoly (vpshufd $78 swaps the
+ # 64-bit halves), then add the high half
+ vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
+ vpshufd $78, TMP1, TMP3
+ vpxor TMP3, TMP2, TMP1
+
+ vpclmulqdq $0x10, .Lpoly(%rip), TMP1, TMP2
+ vpshufd $78, TMP1, TMP3
+ vpxor TMP3, TMP2, TMP1
+
+ vpxor TMP4, TMP1, T
+ ret
+.size GFMUL, .-GFMUL
+