# LICENSE:
# This submission to NSS is to be made available under the terms of the
# Mozilla Public License, v. 2.0. You can obtain one at http:
# //mozilla.org/MPL/2.0/.
################################################################################
# Copyright(c) 2012, Intel Corp.

.align 16
.Lone:
    .quad 1,0
.Ltwo:
    .quad 2,0
.Lbswap_mask:
    .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lshuff_mask:
    .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.Lpoly:
    .quad 0x1, 0xc200000000000000

################################################################################
# Generates the final GCM tag
# void intel_aes_gcmTAG(uint8_t Htbl[16*16], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t *X0, uint8_t *TAG);
.type intel_aes_gcmTAG,@function
.globl intel_aes_gcmTAG
.align 16
intel_aes_gcmTAG:

.set Htbl, %rdi
.set Tp, %rsi
.set Mlen, %rdx
.set Alen, %rcx
.set X0, %r8
.set TAG, %r9

.set T, %xmm0
.set TMP0, %xmm1

    vmovdqu  (Tp), T
    vpshufb  .Lbswap_mask(%rip), T, T
    vpxor    TMP0, TMP0, TMP0

    shl      $3, Mlen
    shl      $3, Alen

    vpinsrq  $0, Mlen, TMP0, TMP0
    vpinsrq  $1, Alen, TMP0, TMP0

    vpxor    TMP0, T, T
    vmovdqu  (Htbl), TMP0
    call     GFMUL

    vpshufb  .Lbswap_mask(%rip), T, T
    vpxor    (X0), T, T
    vmovdqu  T, (TAG)

    ret
.size intel_aes_gcmTAG, .-intel_aes_gcmTAG

################################################################################
# Generates the H table
# void intel_aes_gcmINIT(uint8_t Htbl[16*16], uint8_t *KS, int NR);
.type intel_aes_gcmINIT,@function
.globl intel_aes_gcmINIT
.align 16
intel_aes_gcmINIT:

.set Htbl, %rdi
.set KS, %rsi
.set NR, %edx

.set T, %xmm0
.set TMP0, %xmm1

CALCULATE_POWERS_OF_H:
    # Encrypt an all-zero block with the key schedule to obtain the hash key H
    vmovdqu     16*0(KS), T
    vaesenc     16*1(KS), T, T
    vaesenc     16*2(KS), T, T
    vaesenc     16*3(KS), T, T
    vaesenc     16*4(KS), T, T
    vaesenc     16*5(KS), T, T
    vaesenc     16*6(KS), T, T
    vaesenc     16*7(KS), T, T
    vaesenc     16*8(KS), T, T
    vaesenc     16*9(KS), T, T
    vmovdqu     16*10(KS), TMP0
    cmp         $10, NR
    je          .LH0done
    vaesenc     16*10(KS), T, T
    vaesenc     16*11(KS), T, T
    vmovdqu     16*12(KS), TMP0
    cmp         $12, NR
    je          .LH0done
    vaesenc     16*12(KS), T, T
    vaesenc     16*13(KS), T, T
    vmovdqu     16*14(KS), TMP0
.LH0done:
    vaesenclast TMP0, T, T

    vpshufb     .Lbswap_mask(%rip), T, T
    vmovdqu     T, TMP0

    # Calculate H' = GFMUL(H, 2)
    vpsrld      $7, T, %xmm3
    vmovdqu     .Lshuff_mask(%rip), %xmm4
    vpshufb     %xmm4, %xmm3, %xmm3
    movq        $0xff00, %rax
    vmovq       %rax, %xmm4
    vpshufb     %xmm3, %xmm4, %xmm4
    vmovdqu     .Lpoly(%rip), %xmm5
    vpand       %xmm4, %xmm5, %xmm5
    vpsrld      $31, T, %xmm3
    vpslld      $1, T, %xmm4
    vpslldq     $4, %xmm3, %xmm3
    vpxor       %xmm3, %xmm4, T
    # T now holds p(x)<<1
    # add the masked reduction constant in %xmm5 to p(x)<<1
    vpxor       %xmm5, T, T
    vmovdqu     T, TMP0
    vmovdqu     T, (Htbl)       # H * 2

    call  GFMUL
    vmovdqu  T, 16(Htbl)        # H^2 * 2
    call  GFMUL
    vmovdqu  T, 32(Htbl)        # H^3 * 2
    call  GFMUL
    vmovdqu  T, 48(Htbl)        # H^4 * 2
    call  GFMUL
    vmovdqu  T, 64(Htbl)        # H^5 * 2
    call  GFMUL
    vmovdqu  T, 80(Htbl)        # H^6 * 2
    call  GFMUL
    vmovdqu  T, 96(Htbl)        # H^7 * 2
    call  GFMUL
    vmovdqu  T, 112(Htbl)       # H^8 * 2

    # Precalculations for the reduce 4 step
    vpshufd  $78, (Htbl), %xmm8
    vpshufd  $78, 16(Htbl), %xmm9
    vpshufd  $78, 32(Htbl), %xmm10
    vpshufd  $78, 48(Htbl), %xmm11
    vpshufd  $78, 64(Htbl), %xmm12
    vpshufd  $78, 80(Htbl), %xmm13
    vpshufd  $78, 96(Htbl), %xmm14
    vpshufd  $78, 112(Htbl), %xmm15

    vpxor    (Htbl), %xmm8, %xmm8
    vpxor    16(Htbl), %xmm9, %xmm9
    vpxor    32(Htbl), %xmm10, %xmm10
    vpxor    48(Htbl), %xmm11, %xmm11
    vpxor    64(Htbl), %xmm12, %xmm12
    vpxor    80(Htbl), %xmm13, %xmm13
    vpxor    96(Htbl), %xmm14, %xmm14
    vpxor    112(Htbl), %xmm15, %xmm15

    vmovdqu  %xmm8, 128(Htbl)
    vmovdqu  %xmm9, 144(Htbl)
    vmovdqu  %xmm10, 160(Htbl)
    vmovdqu  %xmm11, 176(Htbl)
    vmovdqu  %xmm12, 192(Htbl)
    vmovdqu  %xmm13, 208(Htbl)
    vmovdqu  %xmm14, 224(Htbl)
    vmovdqu  %xmm15, 240(Htbl)

    ret
.size intel_aes_gcmINIT, .-intel_aes_gcmINIT
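
################################################################################
# Reference note on the 256-byte table built by intel_aes_gcmINIT:
#   bytes   0..127 : H^1*2, H^2*2, ..., H^8*2 (bit-reflected, 16 bytes each)
#   bytes 128..255 : for each power above, the XOR of its high and low
#                    quadwords (VPSHUFD $78 + VPXOR), i.e. the precomputed
#                    middle operand for the Karatsuba multiplications in the
#                    8-block loops below.
# A minimal C sketch of that layout, assuming a hypothetical two-quadword
# "block128" type (illustrative only, not part of the NSS sources):
#
#   #include <stdint.h>
#   typedef struct { uint64_t lo, hi; } block128;
#   typedef struct {
#       block128 h_pow[8];  /* h_pow[i] = H^(i+1) * 2                      */
#       block128 h_kar[8];  /* both quadwords = h_pow[i].lo ^ h_pow[i].hi  */
#   } gcm_htbl;             /* sizeof(gcm_htbl) == 16*16 bytes             */
################################################################################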
################################################################################
# Authenticate only
# void intel_aes_gcmAAD(uint8_t Htbl[16*16], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
.globl intel_aes_gcmAAD
.type intel_aes_gcmAAD,@function
.align 16
intel_aes_gcmAAD:

.set DATA, %xmm0
.set T, %xmm1
.set BSWAP_MASK, %xmm2
.set TMP0, %xmm3
.set TMP1, %xmm4
.set TMP2, %xmm5
.set TMP3, %xmm6
.set TMP4, %xmm7
.set Xhi, %xmm9

.set Htbl, %rdi
.set inp, %rsi
.set len, %rdx
.set Tp, %rcx

.set hlp0, %r11

.macro KARATSUBA_AAD i
    vpclmulqdq  $0x00, 16*\i(Htbl), DATA, TMP3
    vpxor       TMP3, TMP0, TMP0
    vpclmulqdq  $0x11, 16*\i(Htbl), DATA, TMP3
    vpxor       TMP3, TMP1, TMP1
    vpshufd     $78, DATA, TMP3
    vpxor       DATA, TMP3, TMP3
    vpclmulqdq  $0x00, 16*(\i+8)(Htbl), TMP3, TMP3
    vpxor       TMP3, TMP2, TMP2
.endm

    test  len, len
    jnz   .LbeginAAD
    ret

.LbeginAAD:
    push  hlp0
    vzeroupper

    vmovdqa  .Lbswap_mask(%rip), BSWAP_MASK
    vpxor    Xhi, Xhi, Xhi

    vmovdqu  (Tp), T
    vpshufb  BSWAP_MASK, T, T

    # We hash 8 blocks each iteration; if the total number of blocks is not a
    # multiple of 8, we hash the first n%8 blocks first.
    mov      len, hlp0
    and      $~-128, hlp0
    jz       .Lmod_loop

    sub      hlp0, len
    sub      $16, hlp0

    # hash the first prefix block
    vmovdqu  (inp), DATA
    vpshufb  BSWAP_MASK, DATA, DATA
    vpxor    T, DATA, DATA

    vpclmulqdq  $0x00, (Htbl, hlp0), DATA, TMP0
    vpclmulqdq  $0x11, (Htbl, hlp0), DATA, TMP1
    vpshufd     $78, DATA, TMP2
    vpxor       DATA, TMP2, TMP2
    vpclmulqdq  $0x00, 16*8(Htbl, hlp0), TMP2, TMP2

    lea      16(inp), inp
    test     hlp0, hlp0
    jnz      .Lpre_loop
    jmp      .Lred1

    # hash the remaining prefix blocks (up to 7 total prefix blocks)
.align 64
.Lpre_loop:

    sub      $16, hlp0

    vmovdqu  (inp), DATA          # next data block
    vpshufb  BSWAP_MASK, DATA, DATA

    vpclmulqdq  $0x00, (Htbl,hlp0), DATA, TMP3
    vpxor       TMP3, TMP0, TMP0
    vpclmulqdq  $0x11, (Htbl,hlp0), DATA, TMP3
    vpxor       TMP3, TMP1, TMP1
    vpshufd     $78, DATA, TMP3
    vpxor       DATA, TMP3, TMP3
    vpclmulqdq  $0x00, 16*8(Htbl,hlp0), TMP3, TMP3
    vpxor       TMP3, TMP2, TMP2

    test     hlp0, hlp0
    lea      16(inp), inp
    jnz      .Lpre_loop

.Lred1:
    vpxor    TMP0, TMP2, TMP2
    vpxor    TMP1, TMP2, TMP2
    vpsrldq  $8, TMP2, TMP3
    vpslldq  $8, TMP2, TMP2

    vpxor    TMP3, TMP1, Xhi
    vpxor    TMP2, TMP0, T

.align 64
.Lmod_loop:
    sub      $0x80, len
    jb       .Ldone

    vmovdqu  16*7(inp), DATA      # Ii
    vpshufb  BSWAP_MASK, DATA, DATA

    vpclmulqdq  $0x00, (Htbl), DATA, TMP0
    vpclmulqdq  $0x11, (Htbl), DATA, TMP1
    vpshufd     $78, DATA, TMP2
    vpxor       DATA, TMP2, TMP2
    vpclmulqdq  $0x00, 16*8(Htbl), TMP2, TMP2
    #########################################################
    vmovdqu  16*6(inp), DATA
    vpshufb  BSWAP_MASK, DATA, DATA
    KARATSUBA_AAD 1
    #########################################################
    vmovdqu  16*5(inp), DATA
    vpshufb  BSWAP_MASK, DATA, DATA

    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP4   # reduction stage 1a
    vpalignr    $8, T, T, T

    KARATSUBA_AAD 2

    vpxor    TMP4, T, T                        # reduction stage 1b
    #########################################################
    vmovdqu  16*4(inp), DATA
    vpshufb  BSWAP_MASK, DATA, DATA

    KARATSUBA_AAD 3
    #########################################################
    vmovdqu  16*3(inp), DATA
    vpshufb  BSWAP_MASK, DATA, DATA

    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP4   # reduction stage 2a
    vpalignr    $8, T, T, T

    KARATSUBA_AAD 4

    vpxor    TMP4, T, T                        # reduction stage 2b
    #########################################################
    vmovdqu  16*2(inp), DATA
    vpshufb  BSWAP_MASK, DATA, DATA

    KARATSUBA_AAD 5

    vpxor    Xhi, T, T                         # reduction finalize
    #########################################################
    vmovdqu  16*1(inp), DATA
    vpshufb  BSWAP_MASK, DATA, DATA

    KARATSUBA_AAD 6
    #########################################################
    vmovdqu  16*0(inp), DATA
    vpshufb  BSWAP_MASK, DATA, DATA
    vpxor    T, DATA, DATA

    KARATSUBA_AAD 7
    #########################################################
    vpxor    TMP0, TMP2, TMP2                  # karatsuba fixup
    vpxor    TMP1, TMP2, TMP2
    vpsrldq  $8, TMP2, TMP3
    vpslldq  $8, TMP2, TMP2

    vpxor    TMP3, TMP1, Xhi
    vpxor    TMP2, TMP0, T

    lea      16*8(inp), inp
    jmp      .Lmod_loop
    #########################################################

.Ldone:
    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP3
    vpalignr    $8, T, T, T
    vpxor       TMP3, T, T

    vpclmulqdq  $0x10, .Lpoly(%rip), T, TMP3
    vpalignr    $8, T, T, T
    vpxor       TMP3, T, T

    vpxor       Xhi, T, T

.Lsave:
    vpshufb  BSWAP_MASK, T, T
    vmovdqu  T, (Tp)
    vzeroupper

    pop   hlp0
    ret
.size intel_aes_gcmAAD,.-intel_aes_gcmAAD
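
################################################################################
# Reference note on the 8-block AAD loop above: instead of eight dependent
# Horner steps T = (T ^ X_i) * H, each iteration computes the equivalent sum
#   T' = (T ^ X_1)*H^8 ^ X_2*H^7 ^ ... ^ X_8*H
# with three VPCLMULQDQ per block (Karatsuba) and one deferred reduction per
# eight blocks, performed in stages across the iteration. A C sketch of the
# aggregation, reusing block128/gcm_htbl from the note above and assuming a
# hypothetical gf128_mul() that multiplies in the same reflected
# representation as the table (illustrative only, not part of the build):
#
#   block128 gf128_mul(block128 a, block128 b);   /* assumed helper */
#
#   static void ghash_8_blocks(block128 *T, const block128 X[8],
#                              const gcm_htbl *tbl)
#   {
#       block128 acc = { 0, 0 };
#       for (int i = 0; i < 8; i++) {
#           block128 Xi = X[i];
#           if (i == 0) { Xi.lo ^= T->lo; Xi.hi ^= T->hi; }
#           /* the oldest block X[0] is scaled by the highest power, H^8 */
#           block128 p = gf128_mul(Xi, tbl->h_pow[7 - i]);
#           acc.lo ^= p.lo;
#           acc.hi ^= p.hi;
#       }
#       *T = acc;
#   }
################################################################################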
################################################################################
# Encrypt and Authenticate
# void intel_aes_gcmENC(uint8_t* PT, uint8_t* CT, void *Gctx, uint64_t len);
.type intel_aes_gcmENC,@function
.globl intel_aes_gcmENC
.align 16
intel_aes_gcmENC:

.set PT, %rdi
.set CT, %rsi
.set Htbl, %rdx
.set len, %rcx
.set KS, %r9
.set NR, %r10d

.set Gctx, %rdx

.set T, %xmm0
.set TMP0, %xmm1
.set TMP1, %xmm2
.set TMP2, %xmm3
.set TMP3, %xmm4
.set TMP4, %xmm5
.set TMP5, %xmm6
.set CTR0, %xmm7
.set CTR1, %xmm8
.set CTR2, %xmm9
.set CTR3, %xmm10
.set CTR4, %xmm11
.set CTR5, %xmm12
.set CTR6, %xmm13
.set CTR7, %xmm14
.set CTR, %xmm15

.macro ROUND i
    vmovdqu  \i*16(KS), TMP3
    vaesenc  TMP3, CTR0, CTR0
    vaesenc  TMP3, CTR1, CTR1
    vaesenc  TMP3, CTR2, CTR2
    vaesenc  TMP3, CTR3, CTR3
    vaesenc  TMP3, CTR4, CTR4
    vaesenc  TMP3, CTR5, CTR5
    vaesenc  TMP3, CTR6, CTR6
    vaesenc  TMP3, CTR7, CTR7
.endm

.macro ROUNDMUL i
    vmovdqu  \i*16(%rsp), TMP5
    vmovdqu  \i*16(KS), TMP3

    vaesenc  TMP3, CTR0, CTR0
    vaesenc  TMP3, CTR1, CTR1
    vaesenc  TMP3, CTR2, CTR2
    vaesenc  TMP3, CTR3, CTR3

    vpshufd  $78, TMP5, TMP4
    vpxor    TMP5, TMP4, TMP4

    vaesenc  TMP3, CTR4, CTR4
    vaesenc  TMP3, CTR5, CTR5
    vaesenc  TMP3, CTR6, CTR6
    vaesenc  TMP3, CTR7, CTR7

    vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP4, TMP3
    vpxor       TMP3, TMP0, TMP0
    vmovdqa     \i*16(Htbl), TMP4
    vpclmulqdq  $0x11, TMP4, TMP5, TMP3
    vpxor       TMP3, TMP1, TMP1
    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
    vpxor       TMP3, TMP2, TMP2
.endm

.macro KARATSUBA i
    vmovdqu     \i*16(%rsp), TMP5
    vpclmulqdq  $0x11, 16*\i(Htbl), TMP5, TMP3
    vpxor       TMP3, TMP1, TMP1
    vpclmulqdq  $0x00, 16*\i(Htbl), TMP5, TMP3
    vpxor       TMP3, TMP2, TMP2
    vpshufd     $78, TMP5, TMP3
    vpxor       TMP5, TMP3, TMP5
    vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP5, TMP3
    vpxor       TMP3, TMP0, TMP0
.endm

    test  len, len
    jnz   .Lbegin
    ret

.Lbegin:

    vzeroupper
    push  %rbp
    push  %rbx

    movq  %rsp, %rbp
    sub   $128, %rsp
    andq  $-16, %rsp

    vmovdqu  288(Gctx), CTR
    vmovdqu  272(Gctx), T
    mov      304(Gctx), KS
    # AESContext->Nr
    mov      244(KS), NR

    vpshufb  .Lbswap_mask(%rip), CTR, CTR
    vpshufb  .Lbswap_mask(%rip), T, T

    cmp      $128, len
    jb       .LDataSingles

    # Encrypt the first eight blocks
    sub      $128, len
    vmovdqa  CTR, CTR0
    vpaddd   .Lone(%rip), CTR0, CTR1
    vpaddd   .Ltwo(%rip), CTR0, CTR2
    vpaddd   .Lone(%rip), CTR2, CTR3
    vpaddd   .Ltwo(%rip), CTR2, CTR4
    vpaddd   .Lone(%rip), CTR4, CTR5
    vpaddd   .Ltwo(%rip), CTR4, CTR6
    vpaddd   .Lone(%rip), CTR6, CTR7
    vpaddd   .Ltwo(%rip), CTR6, CTR

    vpshufb  .Lbswap_mask(%rip), CTR0, CTR0
    vpshufb  .Lbswap_mask(%rip), CTR1, CTR1
    vpshufb  .Lbswap_mask(%rip), CTR2, CTR2
    vpshufb  .Lbswap_mask(%rip), CTR3, CTR3
    vpshufb  .Lbswap_mask(%rip), CTR4, CTR4
    vpshufb  .Lbswap_mask(%rip), CTR5, CTR5
    vpshufb  .Lbswap_mask(%rip), CTR6, CTR6
    vpshufb  .Lbswap_mask(%rip), CTR7, CTR7

    vpxor    (KS), CTR0, CTR0
    vpxor    (KS), CTR1, CTR1
    vpxor    (KS), CTR2, CTR2
    vpxor    (KS), CTR3, CTR3
    vpxor    (KS), CTR4, CTR4
    vpxor    (KS), CTR5, CTR5
    vpxor    (KS), CTR6, CTR6
    vpxor    (KS), CTR7, CTR7

    ROUND 1
    ROUND 2
    ROUND 3
    ROUND 4
    ROUND 5
    ROUND 6
    ROUND 7
    ROUND 8
    ROUND 9
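
    # The tail below depends on the key size: NR is 10, 12 or 14 for
    # AES-128/192/256, so rounds 10..NR-1 run conditionally and TMP5 is left
    # holding the last round key for the VAESENCLAST step at .LLast1.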
    vmovdqu  160(KS), TMP5
    cmp      $12, NR
    jb       .LLast1
    ROUND 10
    ROUND 11
    vmovdqu  192(KS), TMP5
    cmp      $14, NR
    jb       .LLast1
    ROUND 12
    ROUND 13
    vmovdqu  224(KS), TMP5

.LLast1:
    vpxor        (PT), TMP5, TMP3
    vaesenclast  TMP3, CTR0, CTR0
    vpxor        16(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR1, CTR1
    vpxor        32(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR2, CTR2
    vpxor        48(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR3, CTR3
    vpxor        64(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR4, CTR4
    vpxor        80(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR5, CTR5
    vpxor        96(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR6, CTR6
    vpxor        112(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR7, CTR7

    vmovdqu  .Lbswap_mask(%rip), TMP3

    vmovdqu  CTR0, (CT)
    vpshufb  TMP3, CTR0, CTR0
    vmovdqu  CTR1, 16(CT)
    vpshufb  TMP3, CTR1, CTR1
    vmovdqu  CTR2, 32(CT)
    vpshufb  TMP3, CTR2, CTR2
    vmovdqu  CTR3, 48(CT)
    vpshufb  TMP3, CTR3, CTR3
    vmovdqu  CTR4, 64(CT)
    vpshufb  TMP3, CTR4, CTR4
    vmovdqu  CTR5, 80(CT)
    vpshufb  TMP3, CTR5, CTR5
    vmovdqu  CTR6, 96(CT)
    vpshufb  TMP3, CTR6, CTR6
    vmovdqu  CTR7, 112(CT)
    vpshufb  TMP3, CTR7, CTR7

    lea      128(CT), CT
    lea      128(PT), PT
    jmp      .LDataOctets

# Encrypt 8 blocks each time while hashing previous 8 blocks
.align 64
.LDataOctets:
    cmp      $128, len
    jb       .LEndOctets
    sub      $128, len

    vmovdqa  CTR7, TMP5
    vmovdqa  CTR6, 1*16(%rsp)
    vmovdqa  CTR5, 2*16(%rsp)
    vmovdqa  CTR4, 3*16(%rsp)
    vmovdqa  CTR3, 4*16(%rsp)
    vmovdqa  CTR2, 5*16(%rsp)
    vmovdqa  CTR1, 6*16(%rsp)
    vmovdqa  CTR0, 7*16(%rsp)

    vmovdqa  CTR, CTR0
    vpaddd   .Lone(%rip), CTR0, CTR1
    vpaddd   .Ltwo(%rip), CTR0, CTR2
    vpaddd   .Lone(%rip), CTR2, CTR3
    vpaddd   .Ltwo(%rip), CTR2, CTR4
    vpaddd   .Lone(%rip), CTR4, CTR5
    vpaddd   .Ltwo(%rip), CTR4, CTR6
    vpaddd   .Lone(%rip), CTR6, CTR7
    vpaddd   .Ltwo(%rip), CTR6, CTR

    vmovdqu  (KS), TMP4
    vpshufb  TMP3, CTR0, CTR0
    vpxor    TMP4, CTR0, CTR0
    vpshufb  TMP3, CTR1, CTR1
    vpxor    TMP4, CTR1, CTR1
    vpshufb  TMP3, CTR2, CTR2
    vpxor    TMP4, CTR2, CTR2
    vpshufb  TMP3, CTR3, CTR3
    vpxor    TMP4, CTR3, CTR3
    vpshufb  TMP3, CTR4, CTR4
    vpxor    TMP4, CTR4, CTR4
    vpshufb  TMP3, CTR5, CTR5
    vpxor    TMP4, CTR5, CTR5
    vpshufb  TMP3, CTR6, CTR6
    vpxor    TMP4, CTR6, CTR6
    vpshufb  TMP3, CTR7, CTR7
    vpxor    TMP4, CTR7, CTR7

    vmovdqu     16*0(Htbl), TMP3
    vpclmulqdq  $0x11, TMP3, TMP5, TMP1
    vpclmulqdq  $0x00, TMP3, TMP5, TMP2
    vpshufd     $78, TMP5, TMP3
    vpxor       TMP5, TMP3, TMP5
    vmovdqu     128+0*16(Htbl), TMP3
    vpclmulqdq  $0x00, TMP3, TMP5, TMP0

    ROUNDMUL 1
    ROUNDMUL 2
    ROUNDMUL 3
    ROUNDMUL 4
    ROUNDMUL 5
    ROUNDMUL 6

    vpxor    7*16(%rsp), T, TMP5
    vmovdqu  7*16(KS), TMP3

    vaesenc  TMP3, CTR0, CTR0
    vaesenc  TMP3, CTR1, CTR1
    vaesenc  TMP3, CTR2, CTR2
    vaesenc  TMP3, CTR3, CTR3

    vpshufd  $78, TMP5, TMP4
    vpxor    TMP5, TMP4, TMP4

    vaesenc  TMP3, CTR4, CTR4
    vaesenc  TMP3, CTR5, CTR5
    vaesenc  TMP3, CTR6, CTR6
    vaesenc  TMP3, CTR7, CTR7

    vpclmulqdq  $0x11, 7*16(Htbl), TMP5, TMP3
    vpxor       TMP3, TMP1, TMP1
    vpclmulqdq  $0x00, 7*16(Htbl), TMP5, TMP3
    vpxor       TMP3, TMP2, TMP2
    vpclmulqdq  $0x00, 128+7*16(Htbl), TMP4, TMP3
    vpxor       TMP3, TMP0, TMP0

    ROUND 8

    vmovdqa  .Lpoly(%rip), TMP5

    vpxor    TMP1, TMP0, TMP0
    vpxor    TMP2, TMP0, TMP0
    vpsrldq  $8, TMP0, TMP3
    vpxor    TMP3, TMP1, TMP4
    vpslldq  $8, TMP0, TMP3
    vpxor    TMP3, TMP2, T

    vpclmulqdq  $0x10, TMP5, T, TMP1
    vpalignr    $8, T, T, T
    vpxor       T, TMP1, T

    ROUND 9

    vpclmulqdq  $0x10, TMP5, T, TMP1
    vpalignr    $8, T, T, T
    vpxor       T, TMP1, T

    vmovdqu  160(KS), TMP5
    cmp      $10, NR
    jbe      .LLast2
    ROUND 10
    ROUND 11
    vmovdqu  192(KS), TMP5
    cmp      $12, NR
    jbe      .LLast2
    ROUND 12
    ROUND 13
    vmovdqu  224(KS), TMP5
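
    # .LLast2 folds the CTR keystream XOR into the final AES round:
    # VAESENCLAST(state, last_key ^ PT) == VAESENCLAST(state, last_key) ^ PT,
    # since the last round key enters AESENCLAST by a plain XOR. TMP5 holds
    # the last round key selected above.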
.LLast2:
    vpxor        (PT), TMP5, TMP3
    vaesenclast  TMP3, CTR0, CTR0
    vpxor        16(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR1, CTR1
    vpxor        32(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR2, CTR2
    vpxor        48(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR3, CTR3
    vpxor        64(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR4, CTR4
    vpxor        80(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR5, CTR5
    vpxor        96(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR6, CTR6
    vpxor        112(PT), TMP5, TMP3
    vaesenclast  TMP3, CTR7, CTR7

    vmovdqu  .Lbswap_mask(%rip), TMP3

    vmovdqu  CTR0, (CT)
    vpshufb  TMP3, CTR0, CTR0
    vmovdqu  CTR1, 16(CT)
    vpshufb  TMP3, CTR1, CTR1
    vmovdqu  CTR2, 32(CT)
    vpshufb  TMP3, CTR2, CTR2
    vmovdqu  CTR3, 48(CT)
    vpshufb  TMP3, CTR3, CTR3
    vmovdqu  CTR4, 64(CT)
    vpshufb  TMP3, CTR4, CTR4
    vmovdqu  CTR5, 80(CT)
    vpshufb  TMP3, CTR5, CTR5
    vmovdqu  CTR6, 96(CT)
    vpshufb  TMP3, CTR6, CTR6
    vmovdqu  CTR7, 112(CT)
    vpshufb  TMP3, CTR7, CTR7

    vpxor    TMP4, T, T

    lea      128(CT), CT
    lea      128(PT), PT
    jmp      .LDataOctets

.LEndOctets:

    vmovdqa  CTR7, TMP5
    vmovdqa  CTR6, 1*16(%rsp)
    vmovdqa  CTR5, 2*16(%rsp)
    vmovdqa  CTR4, 3*16(%rsp)
    vmovdqa  CTR3, 4*16(%rsp)
    vmovdqa  CTR2, 5*16(%rsp)
    vmovdqa  CTR1, 6*16(%rsp)
    vmovdqa  CTR0, 7*16(%rsp)

    vmovdqu     16*0(Htbl), TMP3
    vpclmulqdq  $0x11, TMP3, TMP5, TMP1
    vpclmulqdq  $0x00, TMP3, TMP5, TMP2
    vpshufd     $78, TMP5, TMP3
    vpxor       TMP5, TMP3, TMP5
    vmovdqu     128+0*16(Htbl), TMP3
    vpclmulqdq  $0x00, TMP3, TMP5, TMP0

    KARATSUBA 1
    KARATSUBA 2
    KARATSUBA 3
    KARATSUBA 4
    KARATSUBA 5
    KARATSUBA 6

    vmovdqu     7*16(%rsp), TMP5
    vpxor       T, TMP5, TMP5
    vmovdqu     16*7(Htbl), TMP4
    vpclmulqdq  $0x11, TMP4, TMP5, TMP3
    vpxor       TMP3, TMP1, TMP1
    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
    vpxor       TMP3, TMP2, TMP2
    vpshufd     $78, TMP5, TMP3
    vpxor       TMP5, TMP3, TMP5
    vmovdqu     128+7*16(Htbl), TMP4
    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
    vpxor       TMP3, TMP0, TMP0

    vpxor    TMP1, TMP0, TMP0
    vpxor    TMP2, TMP0, TMP0
    vpsrldq  $8, TMP0, TMP3
    vpxor    TMP3, TMP1, TMP4
    vpslldq  $8, TMP0, TMP3
    vpxor    TMP3, TMP2, T

    vmovdqa  .Lpoly(%rip), TMP2

    vpalignr    $8, T, T, TMP1
    vpclmulqdq  $0x10, TMP2, T, T
    vpxor       T, TMP1, T

    vpalignr    $8, T, T, TMP1
    vpclmulqdq  $0x10, TMP2, T, T
    vpxor       T, TMP1, T

    vpxor    TMP4, T, T

# Here we encrypt any remaining whole blocks
.LDataSingles:

    cmp      $16, len
    jb       .LDataTail
    sub      $16, len

    vpshufb  .Lbswap_mask(%rip), CTR, TMP1
    vpaddd   .Lone(%rip), CTR, CTR

    vpxor    (KS), TMP1, TMP1
    vaesenc  16*1(KS), TMP1, TMP1
    vaesenc  16*2(KS), TMP1, TMP1
    vaesenc  16*3(KS), TMP1, TMP1
    vaesenc  16*4(KS), TMP1, TMP1
    vaesenc  16*5(KS), TMP1, TMP1
    vaesenc  16*6(KS), TMP1, TMP1
    vaesenc  16*7(KS), TMP1, TMP1
    vaesenc  16*8(KS), TMP1, TMP1
    vaesenc  16*9(KS), TMP1, TMP1
    vmovdqu  16*10(KS), TMP2
    cmp      $10, NR
    je       .LLast3
    vaesenc  16*10(KS), TMP1, TMP1
    vaesenc  16*11(KS), TMP1, TMP1
    vmovdqu  16*12(KS), TMP2
    cmp      $12, NR
    je       .LLast3
    vaesenc  16*12(KS), TMP1, TMP1
    vaesenc  16*13(KS), TMP1, TMP1
    vmovdqu  16*14(KS), TMP2

.LLast3:
    vaesenclast  TMP2, TMP1, TMP1

    vpxor    (PT), TMP1, TMP1
    vmovdqu  TMP1, (CT)
    addq     $16, CT
    addq     $16, PT

    vpshufb  .Lbswap_mask(%rip), TMP1, TMP1
    vpxor    TMP1, T, T
    vmovdqu  (Htbl), TMP0
    call     GFMUL

    jmp      .LDataSingles

# Here we encrypt the final partial block, if there is one
.LDataTail:

    test     len, len
    jz       DATA_END
    # First prepare the counter block
    vpshufb  .Lbswap_mask(%rip), CTR, TMP1
    vpaddd   .Lone(%rip), CTR, CTR

    vpxor    (KS), TMP1, TMP1
    vaesenc  16*1(KS), TMP1, TMP1
    vaesenc  16*2(KS), TMP1, TMP1
    vaesenc  16*3(KS), TMP1, TMP1
    vaesenc  16*4(KS), TMP1, TMP1
    vaesenc  16*5(KS), TMP1, TMP1
    vaesenc  16*6(KS), TMP1, TMP1
    vaesenc  16*7(KS), TMP1, TMP1
    vaesenc  16*8(KS), TMP1, TMP1
    vaesenc  16*9(KS), TMP1, TMP1
    vmovdqu  16*10(KS), TMP2
    cmp      $10, NR
    je       .LLast4
    vaesenc  16*10(KS), TMP1, TMP1
    vaesenc  16*11(KS), TMP1, TMP1
    vmovdqu  16*12(KS), TMP2
    cmp      $12, NR
    je       .LLast4
    vaesenc  16*12(KS), TMP1, TMP1
    vaesenc  16*13(KS), TMP1, TMP1
    vmovdqu  16*14(KS), TMP2

.LLast4:
    vaesenclast  TMP2, TMP1, TMP1
    # TMP1 now holds the encrypted counter block (the keystream)

    # Zero a temp location
    vpxor    TMP2, TMP2, TMP2
    vmovdqa  TMP2, (%rsp)

    # Copy the required bytes only (could probably use rep movsb)
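
    ############################################################################
    # Partial-block handling: the tail bytes are staged in the zeroed 16-byte
    # slot at (%rsp), XORed with the keystream block in TMP1, the first len
    # bytes are written to CT, and the unused bytes are zeroed again so the
    # block fed to GFMUL is the zero-padded ciphertext. A rough, self-contained
    # C sketch of the byte staging (illustrative only, not part of the build):
    #
    #   #include <stdint.h>
    #   #include <string.h>
    #   static void enc_tail(uint8_t *ct, const uint8_t *pt, size_t len,
    #                        const uint8_t keystream[16])
    #   {
    #       uint8_t buf[16] = { 0 };
    #       memcpy(buf, pt, len);             /* stage the plaintext tail   */
    #       for (size_t i = 0; i < 16; i++)   /* XOR with E_K(counter)      */
    #           buf[i] ^= keystream[i];
    #       memcpy(ct, buf, len);             /* emit only len bytes        */
    #       memset(buf + len, 0, 16 - len);   /* zero-pad for the hash      */
    #       /* buf now holds the zero-padded ciphertext block for GHASH */
    #   }
    ############################################################################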
    xor      KS, KS
.LEncCpy:
    cmp      KS, len
    je       .LEncCpyEnd
    movb     (PT, KS, 1), %r8b
    movb     %r8b, (%rsp, KS, 1)
    inc      KS
    jmp      .LEncCpy
.LEncCpyEnd:
    # Xor with the counter block
    vpxor    (%rsp), TMP1, TMP0
    # Again, store at temp location
    vmovdqa  TMP0, (%rsp)
    # Copy only the required bytes to CT, and zero the rest for the hash
    xor      KS, KS
.LEncCpy2:
    cmp      KS, len
    je       .LEncCpy3
    movb     (%rsp, KS, 1), %r8b
    movb     %r8b, (CT, KS, 1)
    inc      KS
    jmp      .LEncCpy2
.LEncCpy3:
    cmp      $16, KS
    je       .LEndCpy3
    movb     $0, (%rsp, KS, 1)
    inc      KS
    jmp      .LEncCpy3
.LEndCpy3:
    vmovdqa  (%rsp), TMP0

    vpshufb  .Lbswap_mask(%rip), TMP0, TMP0
    vpxor    TMP0, T, T
    vmovdqu  (Htbl), TMP0
    call     GFMUL

DATA_END:

    vpshufb  .Lbswap_mask(%rip), T, T
    vpshufb  .Lbswap_mask(%rip), CTR, CTR
    vmovdqu  T, 272(Gctx)
    vmovdqu  CTR, 288(Gctx)

    movq  %rbp, %rsp
    popq  %rbx
    popq  %rbp
    ret
.size intel_aes_gcmENC, .-intel_aes_gcmENC

#########################
# Decrypt and Authenticate
# void intel_aes_gcmDEC(uint8_t* PT, uint8_t* CT, void *Gctx, uint64_t len);
.type intel_aes_gcmDEC,@function
.globl intel_aes_gcmDEC
.align 16
intel_aes_gcmDEC:
# parameter 1: CT    # input
# parameter 2: PT    # output
# parameter 3: %rdx  # Gctx
# parameter 4: %rcx  # len

.macro DEC_KARATSUBA i
    vmovdqu     (7-\i)*16(CT), TMP5
    vpshufb     .Lbswap_mask(%rip), TMP5, TMP5

    vpclmulqdq  $0x11, 16*\i(Htbl), TMP5, TMP3
    vpxor       TMP3, TMP1, TMP1
    vpclmulqdq  $0x00, 16*\i(Htbl), TMP5, TMP3
    vpxor       TMP3, TMP2, TMP2
    vpshufd     $78, TMP5, TMP3
    vpxor       TMP5, TMP3, TMP5
    vpclmulqdq  $0x00, 128+\i*16(Htbl), TMP5, TMP3
    vpxor       TMP3, TMP0, TMP0
.endm

.set PT, %rsi
.set CT, %rdi
.set Htbl, %rdx
.set len, %rcx
.set KS, %r9
.set NR, %r10d

.set Gctx, %rdx

.set T, %xmm0
.set TMP0, %xmm1
.set TMP1, %xmm2
.set TMP2, %xmm3
.set TMP3, %xmm4
.set TMP4, %xmm5
.set TMP5, %xmm6
.set CTR0, %xmm7
.set CTR1, %xmm8
.set CTR2, %xmm9
.set CTR3, %xmm10
.set CTR4, %xmm11
.set CTR5, %xmm12
.set CTR6, %xmm13
.set CTR7, %xmm14
.set CTR, %xmm15

    test  len, len
    jnz   .LbeginDec
    ret

.LbeginDec:

    pushq  %rbp
    pushq  %rbx

    movq   %rsp, %rbp
    sub    $128, %rsp
    andq   $-16, %rsp

    vmovdqu  288(Gctx), CTR
    vmovdqu  272(Gctx), T
    mov      304(Gctx), KS
    # AESContext->Nr
    mov      244(KS), NR

    vpshufb  .Lbswap_mask(%rip), CTR, CTR
    vpshufb  .Lbswap_mask(%rip), T, T

    vmovdqu  .Lbswap_mask(%rip), TMP3
    jmp      .LDECOctets

# Decrypt 8 blocks each time while hashing them at the same time
.align 64
.LDECOctets:

    cmp      $128, len
    jb       .LDECSingles
    sub      $128, len

    vmovdqa  CTR, CTR0
    vpaddd   .Lone(%rip), CTR0, CTR1
    vpaddd   .Ltwo(%rip), CTR0, CTR2
    vpaddd   .Lone(%rip), CTR2, CTR3
    vpaddd   .Ltwo(%rip), CTR2, CTR4
    vpaddd   .Lone(%rip), CTR4, CTR5
    vpaddd   .Ltwo(%rip), CTR4, CTR6
    vpaddd   .Lone(%rip), CTR6, CTR7
    vpaddd   .Ltwo(%rip), CTR6, CTR

    vpshufb  TMP3, CTR0, CTR0
    vpshufb  TMP3, CTR1, CTR1
    vpshufb  TMP3, CTR2, CTR2
    vpshufb  TMP3, CTR3, CTR3
    vpshufb  TMP3, CTR4, CTR4
    vpshufb  TMP3, CTR5, CTR5
    vpshufb  TMP3, CTR6, CTR6
    vpshufb  TMP3, CTR7, CTR7

    vmovdqu  (KS), TMP3
    vpxor    TMP3, CTR0, CTR0
    vpxor    TMP3, CTR1, CTR1
    vpxor    TMP3, CTR2, CTR2
    vpxor    TMP3, CTR3, CTR3
    vpxor    TMP3, CTR4, CTR4
    vpxor    TMP3, CTR5, CTR5
    vpxor    TMP3, CTR6, CTR6
    vpxor    TMP3, CTR7, CTR7

    vmovdqu     7*16(CT), TMP5
    vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
    vmovdqu     16*0(Htbl), TMP3
    vpclmulqdq  $0x11, TMP3, TMP5, TMP1
    vpclmulqdq  $0x00, TMP3, TMP5, TMP2
    vpshufd     $78, TMP5, TMP3
    vpxor       TMP5, TMP3, TMP5
    vmovdqu     128+0*16(Htbl), TMP3
    vpclmulqdq  $0x00, TMP3, TMP5, TMP0

    ROUND 1
    DEC_KARATSUBA 1
    ROUND 2
    DEC_KARATSUBA 2
    ROUND 3
    DEC_KARATSUBA 3
    ROUND 4
    DEC_KARATSUBA 4
    ROUND 5
    DEC_KARATSUBA 5
    ROUND 6
    DEC_KARATSUBA 6
    ROUND 7

    vmovdqu     0*16(CT), TMP5
    vpshufb     .Lbswap_mask(%rip), TMP5, TMP5
    vpxor       T, TMP5, TMP5
    vmovdqu     16*7(Htbl), TMP4

    vpclmulqdq  $0x11, TMP4, TMP5, TMP3
    vpxor       TMP3, TMP1, TMP1

    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
    vpxor       TMP3, TMP2, TMP2

    vpshufd     $78, TMP5, TMP3
    vpxor       TMP5, TMP3, TMP5
    vmovdqu     128+7*16(Htbl), TMP4

    vpclmulqdq  $0x00, TMP4, TMP5, TMP3
    vpxor       TMP3, TMP0, TMP0
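
    # What follows: TMP1/TMP2/TMP0 hold the high, low and middle Karatsuba
    # sums for the eight hashed ciphertext blocks. They are recombined into a
    # 256-bit product and reduced modulo the GHASH polynomial
    # x^128 + x^7 + x^2 + x + 1 via two VPCLMULQDQ folds against the reflected
    # constant in .Lpoly, interleaved with the remaining AES rounds to hide
    # the multiplier latency.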
    ROUND 8

    vpxor    TMP1, TMP0, TMP0
    vpxor    TMP2, TMP0, TMP0
    vpsrldq  $8, TMP0, TMP3
    vpxor    TMP3, TMP1, TMP4
    vpslldq  $8, TMP0, TMP3
    vpxor    TMP3, TMP2, T

    vmovdqa  .Lpoly(%rip), TMP2

    vpalignr    $8, T, T, TMP1
    vpclmulqdq  $0x10, TMP2, T, T
    vpxor       T, TMP1, T

    ROUND 9

    vpalignr    $8, T, T, TMP1
    vpclmulqdq  $0x10, TMP2, T, T
    vpxor       T, TMP1, T

    vmovdqu  160(KS), TMP5
    cmp      $10, NR
    jbe      .LDECLast1
    ROUND 10
    ROUND 11
    vmovdqu  192(KS), TMP5
    cmp      $12, NR
    jbe      .LDECLast1
    ROUND 12
    ROUND 13
    vmovdqu  224(KS), TMP5

.LDECLast1:
    vpxor        (CT), TMP5, TMP3
    vaesenclast  TMP3, CTR0, CTR0
    vpxor        16(CT), TMP5, TMP3
    vaesenclast  TMP3, CTR1, CTR1
    vpxor        32(CT), TMP5, TMP3
    vaesenclast  TMP3, CTR2, CTR2
    vpxor        48(CT), TMP5, TMP3
    vaesenclast  TMP3, CTR3, CTR3
    vpxor        64(CT), TMP5, TMP3
    vaesenclast  TMP3, CTR4, CTR4
    vpxor        80(CT), TMP5, TMP3
    vaesenclast  TMP3, CTR5, CTR5
    vpxor        96(CT), TMP5, TMP3
    vaesenclast  TMP3, CTR6, CTR6
    vpxor        112(CT), TMP5, TMP3
    vaesenclast  TMP3, CTR7, CTR7

    vmovdqu  .Lbswap_mask(%rip), TMP3

    vmovdqu  CTR0, (PT)
    vmovdqu  CTR1, 16(PT)
    vmovdqu  CTR2, 32(PT)
    vmovdqu  CTR3, 48(PT)
    vmovdqu  CTR4, 64(PT)
    vmovdqu  CTR5, 80(PT)
    vmovdqu  CTR6, 96(PT)
    vmovdqu  CTR7, 112(PT)

    vpxor    TMP4, T, T

    lea      128(CT), CT
    lea      128(PT), PT
    jmp      .LDECOctets

# Here we decrypt and hash any remaining whole blocks
.LDECSingles:

    cmp      $16, len
    jb       .LDECTail
    sub      $16, len

    vmovdqu  (CT), TMP1
    vpshufb  .Lbswap_mask(%rip), TMP1, TMP1
    vpxor    TMP1, T, T
    vmovdqu  (Htbl), TMP0
    call     GFMUL

    vpshufb  .Lbswap_mask(%rip), CTR, TMP1
    vpaddd   .Lone(%rip), CTR, CTR

    vpxor    (KS), TMP1, TMP1
    vaesenc  16*1(KS), TMP1, TMP1
    vaesenc  16*2(KS), TMP1, TMP1
    vaesenc  16*3(KS), TMP1, TMP1
    vaesenc  16*4(KS), TMP1, TMP1
    vaesenc  16*5(KS), TMP1, TMP1
    vaesenc  16*6(KS), TMP1, TMP1
    vaesenc  16*7(KS), TMP1, TMP1
    vaesenc  16*8(KS), TMP1, TMP1
    vaesenc  16*9(KS), TMP1, TMP1
    vmovdqu  16*10(KS), TMP2
    cmp      $10, NR
    je       .LDECLast2
    vaesenc  16*10(KS), TMP1, TMP1
    vaesenc  16*11(KS), TMP1, TMP1
    vmovdqu  16*12(KS), TMP2
    cmp      $12, NR
    je       .LDECLast2
    vaesenc  16*12(KS), TMP1, TMP1
    vaesenc  16*13(KS), TMP1, TMP1
    vmovdqu  16*14(KS), TMP2

.LDECLast2:
    vaesenclast  TMP2, TMP1, TMP1

    vpxor    (CT), TMP1, TMP1
    vmovdqu  TMP1, (PT)
    addq     $16, CT
    addq     $16, PT
    jmp      .LDECSingles

# Here we decrypt the final partial block, if there is one
.LDECTail:

    test     len, len
    jz       .LDEC_END

    vpshufb  .Lbswap_mask(%rip), CTR, TMP1
    vpaddd   .Lone(%rip), CTR, CTR

    vpxor    (KS), TMP1, TMP1
    vaesenc  16*1(KS), TMP1, TMP1
    vaesenc  16*2(KS), TMP1, TMP1
    vaesenc  16*3(KS), TMP1, TMP1
    vaesenc  16*4(KS), TMP1, TMP1
    vaesenc  16*5(KS), TMP1, TMP1
    vaesenc  16*6(KS), TMP1, TMP1
    vaesenc  16*7(KS), TMP1, TMP1
    vaesenc  16*8(KS), TMP1, TMP1
    vaesenc  16*9(KS), TMP1, TMP1
    vmovdqu  16*10(KS), TMP2
    cmp      $10, NR
    je       .LDECLast3
    vaesenc  16*10(KS), TMP1, TMP1
    vaesenc  16*11(KS), TMP1, TMP1
    vmovdqu  16*12(KS), TMP2
    cmp      $12, NR
    je       .LDECLast3
    vaesenc  16*12(KS), TMP1, TMP1
    vaesenc  16*13(KS), TMP1, TMP1
    vmovdqu  16*14(KS), TMP2

.LDECLast3:
    vaesenclast  TMP2, TMP1, TMP1

    # Zero a temp location
    vpxor    TMP2, TMP2, TMP2
    vmovdqa  TMP2, (%rsp)

    # Copy the required bytes only (could probably use rep movsb)
    xor      KS, KS
.LDecCpy:
    cmp      KS, len
    je       .LDecCpy2
    movb     (CT, KS, 1), %r8b
    movb     %r8b, (%rsp, KS, 1)
    inc      KS
    jmp      .LDecCpy
.LDecCpy2:
    cmp      $16, KS
    je       .LDecCpyEnd
    movb     $0, (%rsp, KS, 1)
    inc      KS
    jmp      .LDecCpy2
.LDecCpyEnd:
    # Xor with the counter block
    vmovdqa  (%rsp), TMP0
    vpxor    TMP0, TMP1, TMP1
    # Again, store at temp location
    vmovdqa  TMP1, (%rsp)
    # Copy only the required bytes to PT (the buffer for the hash was already
    # zero-padded above)
    xor      KS, KS
.LDecCpy3:
    cmp      KS, len
    je       .LDecCpyEnd3
    movb     (%rsp, KS, 1), %r8b
    movb     %r8b, (PT, KS, 1)
    inc      KS
    jmp      .LDecCpy3
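
    # TMP0 still holds the zero-padded ciphertext block staged above;
    # decryption hashes the ciphertext, so that is the value folded into T
    # by GFMUL below.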
.LDecCpyEnd3:
    vpshufb  .Lbswap_mask(%rip), TMP0, TMP0
    vpxor    TMP0, T, T
    vmovdqu  (Htbl), TMP0
    call     GFMUL

.LDEC_END:

    vpshufb  .Lbswap_mask(%rip), T, T
    vpshufb  .Lbswap_mask(%rip), CTR, CTR
    vmovdqu  T, 272(Gctx)
    vmovdqu  CTR, 288(Gctx)

    movq  %rbp, %rsp
    popq  %rbx
    popq  %rbp
    ret
.size intel_aes_gcmDEC, .-intel_aes_gcmDEC

#########################
# a = T
# b = TMP0 - remains unchanged
# res = T
# uses also TMP1,TMP2,TMP3,TMP4
# __m128i GFMUL(__m128i A, __m128i B);
.type GFMUL,@function
.globl GFMUL
GFMUL:
    vpclmulqdq  $0x00, TMP0, T, TMP1
    vpclmulqdq  $0x11, TMP0, T, TMP4

    vpshufd     $78, T, TMP2
    vpshufd     $78, TMP0, TMP3
    vpxor       T, TMP2, TMP2
    vpxor       TMP0, TMP3, TMP3

    vpclmulqdq  $0x00, TMP3, TMP2, TMP2
    vpxor       TMP1, TMP2, TMP2
    vpxor       TMP4, TMP2, TMP2

    vpslldq     $8, TMP2, TMP3
    vpsrldq     $8, TMP2, TMP2

    vpxor       TMP3, TMP1, TMP1
    vpxor       TMP2, TMP4, TMP4

    vpclmulqdq  $0x10, .Lpoly(%rip), TMP1, TMP2
    vpshufd     $78, TMP1, TMP3
    vpxor       TMP3, TMP2, TMP1

    vpclmulqdq  $0x10, .Lpoly(%rip), TMP1, TMP2
    vpshufd     $78, TMP1, TMP3
    vpxor       TMP3, TMP2, TMP1

    vpxor       TMP4, TMP1, T
    ret
.size GFMUL, .-GFMUL
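
################################################################################
# Reference: a C intrinsics rendering of GFMUL, mirroring the sequence above
# instruction for instruction (illustrative only, not part of the build; it
# would need <immintrin.h> and PCLMUL support, e.g. -mpclmul):
#
#   #include <immintrin.h>
#   static __m128i gfmul_ref(__m128i a, __m128i b)   /* a = T, b = TMP0 */
#   {
#       const __m128i poly =
#           _mm_set_epi64x((long long)0xc200000000000000ULL, 1);
#       __m128i t1 = _mm_clmulepi64_si128(a, b, 0x00);   /* a.lo * b.lo     */
#       __m128i t4 = _mm_clmulepi64_si128(a, b, 0x11);   /* a.hi * b.hi     */
#       __m128i t2 = _mm_xor_si128(_mm_shuffle_epi32(a, 78), a);
#       __m128i t3 = _mm_xor_si128(_mm_shuffle_epi32(b, 78), b);
#       t2 = _mm_clmulepi64_si128(t2, t3, 0x00);         /* middle product  */
#       t2 = _mm_xor_si128(_mm_xor_si128(t2, t1), t4);   /* Karatsuba fixup */
#       t1 = _mm_xor_si128(t1, _mm_slli_si128(t2, 8));   /* low 128 bits    */
#       t4 = _mm_xor_si128(t4, _mm_srli_si128(t2, 8));   /* high 128 bits   */
#       /* two folds against the reflected reduction constant */
#       t1 = _mm_xor_si128(_mm_shuffle_epi32(t1, 78),
#                          _mm_clmulepi64_si128(t1, poly, 0x10));
#       t1 = _mm_xor_si128(_mm_shuffle_epi32(t1, 78),
#                          _mm_clmulepi64_si128(t1, poly, 0x10));
#       return _mm_xor_si128(t4, t1);                    /* the new T */
#   }
################################################################################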