diff options
Diffstat (limited to '')
-rw-r--r-- | security/nss/lib/freebl/intel-aes-x86-masm.asm | 942 |
1 files changed, 942 insertions, 0 deletions
diff --git a/security/nss/lib/freebl/intel-aes-x86-masm.asm b/security/nss/lib/freebl/intel-aes-x86-masm.asm
new file mode 100644
index 0000000000..790c951e7c
--- /dev/null
+++ b/security/nss/lib/freebl/intel-aes-x86-masm.asm
@@ -0,0 +1,942 @@
+; LICENSE:
+; This submission to NSS is to be made available under the terms of the
+; Mozilla Public License, v. 2.0. You can obtain one at http:
+; //mozilla.org/MPL/2.0/.
+;###############################################################################
+; Copyright(c) 2014, Intel Corp.
+; Developers and authors:
+; Shay Gueron and Vlad Krasnov
+; Intel Corporation, Israel Development Centre, Haifa, Israel
+; Please send feedback directly to crypto.feedback.alias@intel.com
+
+
+; Assembler setup: 32-bit flat model with the C language convention, XMM
+; (SSE) instructions enabled. None of the PROCs below declare parameters, so
+; on entry [esp] is the return address and the stack arguments start at
+; [esp + 4]; every routine ends in a plain `ret` (caller cleans the stack).
+.MODEL FLAT, C
+.XMM
+
+; Constants used by the key-expansion routines.
+;   Lmask, Lmask256 : pshufb control words. Every result dword takes source
+;                     bytes 13,14,15,12, i.e. the top dword rotated by one
+;                     byte and replicated into all four lanes. The two tables
+;                     hold the same value.
+;   Lmask192        : pshufb control word. Every result dword takes source
+;                     bytes 5,6,7,4 (dword 1 rotated by one byte).
+;   Lcon1           : 1 in every dword; passed to aesenclast as its round-key
+;                     operand and doubled with pslld after each use.
+;   Lcon2           : 1bh in every dword; used the same way for the last two
+;                     round keys of the 128-bit schedule (1bh, then 36h).
+; intel_aes_encrypt_init_128 reads Lcon1/Lmask/Lcon2 with movdqa, so it
+; relies on the ALIGN 16 below (each table is exactly 16 bytes long).
+.DATA
+ALIGN 16
+Lmask dd 0c0f0e0dh,0c0f0e0dh,0c0f0e0dh,0c0f0e0dh
+Lmask192 dd 004070605h, 004070605h, 004070605h, 004070605h
+Lmask256 dd 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh, 00c0f0e0dh
+Lcon1 dd 1,1,1,1
+Lcon2 dd 1bh,1bh,1bh,1bh
+
+.CODE
+
+; Register roles for the ECB / CBC / CTR generators.
+;   ctx      : base address of the round keys; key i is read from [i*16 + ctx]
+;   output   : current write position
+;   input    : current read position
+;   inputLen : bytes still to process (edi is callee-saved, so it is pushed
+;              on entry and popped before ret)
+ctx textequ <ecx>
+output textequ <edx>
+input textequ <eax>
+inputLen textequ <edi>
+
+
+; aes_rnd i
+; One aesenc round on the seven blocks in xmm0..xmm6 with the round key at
+; [i*16 + ctx]. Clobbers xmm7.
+aes_rnd MACRO i
+        movdqu xmm7, [i*16 + ctx]
+        aesenc xmm0, xmm7
+        aesenc xmm1, xmm7
+        aesenc xmm2, xmm7
+        aesenc xmm3, xmm7
+        aesenc xmm4, xmm7
+        aesenc xmm5, xmm7
+        aesenc xmm6, xmm7
+        ENDM
+
+; aes_last_rnd i
+; Final encryption round (aesenclast) on xmm0..xmm6 with the round key at
+; [i*16 + ctx]. Clobbers xmm7.
+aes_last_rnd MACRO i
+        movdqu xmm7, [i*16 + ctx]
+        aesenclast xmm0, xmm7
+        aesenclast xmm1, xmm7
+        aesenclast xmm2, xmm7
+        aesenclast xmm3, xmm7
+        aesenclast xmm4, xmm7
+        aesenclast xmm5, xmm7
+        aesenclast xmm6, xmm7
+        ENDM
+
+; aes_dec_rnd i
+; One aesdec round on xmm0..xmm6 with the round key at [i*16 + ctx].
+; Clobbers xmm7.
+aes_dec_rnd MACRO i
+        movdqu xmm7, [i*16 + ctx]
+        aesdec xmm0, xmm7
+        aesdec xmm1, xmm7
+        aesdec xmm2, xmm7
+        aesdec xmm3, xmm7
+        aesdec xmm4, xmm7
+        aesdec xmm5, xmm7
+        aesdec xmm6, xmm7
+        ENDM
+
+; aes_dec_last_rnd i
+; Final decryption round (aesdeclast) on xmm0..xmm6 with the round key at
+; [i*16 + ctx]. Clobbers xmm7.
+aes_dec_last_rnd MACRO i
+        movdqu xmm7, [i*16 + ctx]
+        aesdeclast xmm0, xmm7
+        aesdeclast xmm1, xmm7
+        aesdeclast xmm2, xmm7
+        aesdeclast xmm3, xmm7
+        aesdeclast xmm4, xmm7
+        aesdeclast xmm5, xmm7
+        aesdeclast xmm6, xmm7
+        ENDM
+
+
+; gen_aes_ecb_func enc, rnds
+; Expands to the complete body of an ECB procedure.
+;   enc  : 1 selects aesenc/aesenclast; any other value selects
+;          aesdec/aesdeclast.
+;   rnds : number of AES rounds; round keys 0..rnds are read from ctx.
+; Stack arguments read (0-based, 4 bytes each): arg0 -> ctx, arg1 -> output,
+; arg4 -> input, arg5 -> inputLen. arg2 and arg3 are never accessed.
+; Whole 16-byte blocks are processed seven at a time while at least 7*16
+; bytes remain, then one at a time; a trailing remainder shorter than 16
+; bytes is left untouched.
+; Returns 0 in eax. Preserves edi. Clobbers ecx, edx, xmm0-xmm7, flags.
+; The decrypt variants walk the schedule in the same 0..rnds order as the
+; encrypt variants, so they expect a schedule that is already reversed and
+; aesimc-transformed (what the intel_aes_decrypt_init_* routines in this
+; file produce).
+gen_aes_ecb_func MACRO enc, rnds
+
+LOCAL loop7
+LOCAL loop1
+LOCAL bail
+
+        push inputLen
+
+        ; One register pushed + return address = 2*4 bytes below arg0.
+        mov ctx, [esp + 2*4 + 0*4]
+        mov output, [esp + 2*4 + 1*4]
+        mov input, [esp + 2*4 + 4*4]
+        mov inputLen, [esp + 2*4 + 5*4]
+
+; --- seven blocks per iteration ---
+loop7:
+        cmp inputLen, 7*16
+        jb loop1
+
+        movdqu xmm0, [0*16 + input]
+        movdqu xmm1, [1*16 + input]
+        movdqu xmm2, [2*16 + input]
+        movdqu xmm3, [3*16 + input]
+        movdqu xmm4, [4*16 + input]
+        movdqu xmm5, [5*16 + input]
+        movdqu xmm6, [6*16 + input]
+
+        ; Initial AddRoundKey with round key 0.
+        movdqu xmm7, [0*16 + ctx]
+        pxor xmm0, xmm7
+        pxor xmm1, xmm7
+        pxor xmm2, xmm7
+        pxor xmm3, xmm7
+        pxor xmm4, xmm7
+        pxor xmm5, xmm7
+        pxor xmm6, xmm7
+
+; Select the round macros / instructions for this expansion. These text
+; equates are also used by the single-block loop further down.
+IF enc eq 1
+        rnd textequ <aes_rnd>
+        lastrnd textequ <aes_last_rnd>
+        aesinst textequ <aesenc>
+        aeslastinst textequ <aesenclast>
+ELSE
+        rnd textequ <aes_dec_rnd>
+        lastrnd textequ <aes_dec_last_rnd>
+        aesinst textequ <aesdec>
+        aeslastinst textequ <aesdeclast>
+ENDIF
+
+        ; Assembly-time unrolled rounds 1..rnds-1, then the final round.
+        i = 1
+        WHILE i LT rnds
+            rnd i
+            i = i+1
+            ENDM
+        lastrnd rnds
+
+        movdqu [0*16 + output], xmm0
+        movdqu [1*16 + output], xmm1
+        movdqu [2*16 + output], xmm2
+        movdqu [3*16 + output], xmm3
+        movdqu [4*16 + output], xmm4
+        movdqu [5*16 + output], xmm5
+        movdqu [6*16 + output], xmm6
+
+        lea input, [7*16 + input]
+        lea output, [7*16 + output]
+        sub inputLen, 7*16
+        jmp loop7
+
+; --- one block per iteration ---
+loop1:
+        cmp inputLen, 1*16
+        jb bail
+
+        movdqu xmm0, [input]
+        movdqu xmm7, [0*16 + ctx]
+        pxor xmm0, xmm7
+
+        i = 1
+    WHILE i LT rnds
+            movdqu xmm7, [i*16 + ctx]
+            aesinst xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu xmm7, [rnds*16 + ctx]
+        aeslastinst xmm0, xmm7
+
+        movdqu [output], xmm0
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+        xor eax, eax            ; return value 0
+        pop inputLen
+        ret
+
+ENDM
+
+; ECB entry points: encrypt (enc = 1) and decrypt (enc = 0) with 10, 12 and
+; 14 rounds respectively.
+ALIGN 16
+intel_aes_encrypt_ecb_128 PROC
+gen_aes_ecb_func 1, 10
+intel_aes_encrypt_ecb_128 ENDP
+
+ALIGN 16
+intel_aes_encrypt_ecb_192 PROC
+gen_aes_ecb_func 1, 12
+intel_aes_encrypt_ecb_192 ENDP
+
+ALIGN 16
+intel_aes_encrypt_ecb_256 PROC
+gen_aes_ecb_func 1, 14
+intel_aes_encrypt_ecb_256 ENDP
+
+ALIGN 16
+intel_aes_decrypt_ecb_128 PROC
+gen_aes_ecb_func 0, 10
+intel_aes_decrypt_ecb_128 ENDP
+
+ALIGN 16
+intel_aes_decrypt_ecb_192 PROC
+gen_aes_ecb_func 0, 12
+intel_aes_decrypt_ecb_192 ENDP
+
+ALIGN 16
+intel_aes_decrypt_ecb_256 PROC
+gen_aes_ecb_func 0, 14
+intel_aes_decrypt_ecb_256 ENDP
+
+
+; Register roles for the key-schedule routines.
+;   KEY : pointer to the raw key bytes (arg0)
+;   KS  : pointer to the key-schedule output buffer (arg1)
+;   ITR : loop counter, also used as scratch to hold constant addresses
+; Note that KEY/KS/ITR alias the same registers as ctx/output/input.
+KEY textequ <ecx>
+KS textequ <edx>
+ITR textequ <eax>
+
+; intel_aes_encrypt_init_128
+; arg0 = pointer to a 16-byte key, arg1 = pointer to the schedule buffer.
+; Writes 11 round keys (176 bytes) to the buffer: the key itself, eight keys
+; from the loop, and two more from the straight-line tail.
+; No status value is placed in eax; on return it holds the address of Lcon2.
+; Clobbers ecx, edx, eax, xmm0-xmm4, flags.
+ALIGN 16
+intel_aes_encrypt_init_128 PROC
+
+        mov KEY, [esp + 1*4 + 0*4]
+        mov KS, [esp + 1*4 + 1*4]
+
+
+        ; Round key 0 is the key itself.
+        movdqu xmm1, [KEY]
+        movdqu [KS], xmm1
+        movdqa xmm2, xmm1
+
+        lea ITR, Lcon1
+        movdqa xmm0, [ITR]      ; xmm0 = round constant (starts at 1)
+        lea ITR, Lmask
+        movdqa xmm4, [ITR]      ; xmm4 = pshufb control
+
+        mov ITR, 8
+
+Lenc_128_ks_loop:
+        lea KS, [16 + KS]
+        ; ZF from this dec is consumed by the jne at the bottom of the loop;
+        ; lea and the SSE/AES instructions in between leave EFLAGS alone.
+        dec ITR
+
+        ; xmm2 = rotated top dword of the previous key in all four lanes,
+        ; then aesenclast substitutes its bytes and XORs the round constant.
+        pshufb xmm2, xmm4
+        aesenclast xmm2, xmm0
+        pslld xmm0, 1           ; next round constant
+        ; xmm1 ^= xmm1<<32 ^ xmm1<<64 ^ xmm1<<96 : dword k becomes the XOR
+        ; of dwords 0..k of the previous key.
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu [KS], xmm1
+        movdqa xmm2, xmm1
+
+        jne Lenc_128_ks_loop
+
+        ; Round keys 9 and 10: same step with constants 1bh and 36h.
+        ; KS still points at round key 8 here.
+        lea ITR, Lcon2
+        movdqa xmm0, [ITR]
+
+        pshufb xmm2, xmm4
+        aesenclast xmm2, xmm0
+        pslld xmm0, 1
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu [16 + KS], xmm1
+        movdqa xmm2, xmm1
+
+        pshufb xmm2, xmm4
+        aesenclast xmm2, xmm0
+        movdqa xmm3, xmm1
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pslldq xmm3, 4
+        pxor xmm1, xmm3
+        pxor xmm1, xmm2
+        movdqu [32 + KS], xmm1
+        movdqa xmm2, xmm1
+
+        ret
+intel_aes_encrypt_init_128 ENDP
+
+
+; intel_aes_decrypt_init_128
+; Same arguments as intel_aes_encrypt_init_128. Builds the encryption
+; schedule by calling that routine, then converts it in place: the 11 round
+; keys are reversed in order, and keys 1..9 are additionally passed through
+; aesimc (keys 0 and 10 are only swapped).
+; eax is whatever intel_aes_encrypt_init_128 left in it.
+ALIGN 16
+intel_aes_decrypt_init_128 PROC
+
+        mov KEY, [esp + 1*4 + 0*4]
+        mov KS, [esp + 1*4 + 1*4]
+
+        ; Re-push the two arguments for the callee (KS first so that KEY is
+        ; its arg0).
+        push KS
+        push KEY
+
+        call intel_aes_encrypt_init_128
+
+        ; The callee advanced KS; these pops restore both original pointers
+        ; and remove the pushed arguments.
+        pop KEY
+        pop KS
+
+        ; Swap the first and last round keys unchanged.
+        movdqu xmm0, [0*16 + KS]
+        movdqu xmm1, [10*16 + KS]
+        movdqu [10*16 + KS], xmm0
+        movdqu [0*16 + KS], xmm1
+
+        ; Swap keys i and 10-i for i = 1..4, applying aesimc to both.
+        i = 1
+        WHILE i LT 5
+            movdqu xmm0, [i*16 + KS]
+            movdqu xmm1, [(10-i)*16 + KS]
+
+            aesimc xmm0, xmm0
+            aesimc xmm1, xmm1
+
+            movdqu [(10-i)*16 + KS], xmm0
+            movdqu [i*16 + KS], xmm1
+
+            i = i+1
+        ENDM
+
+        ; The middle key stays in place and only gets aesimc.
+        movdqu xmm0, [5*16 + KS]
+        aesimc xmm0, xmm0
+        movdqu [5*16 + KS], xmm0
+        ret
+intel_aes_decrypt_init_128 ENDP
+
+
+; intel_aes_encrypt_init_192
+; arg0 = pointer to a 24-byte key, arg1 = pointer to the schedule buffer.
+; Stores the first 16 key bytes at KS, then runs four loop passes that each
+; store three 16-byte round keys (KS+16, KS+32, KS+48) and advance KS by 48,
+; giving round keys 0..12.
+; NOTE: the store after the loop writes 16 more bytes at offset 13*16 from
+; the original KS, so the buffer must be at least 14*16 bytes long.
+; No status value is set explicitly; eax ends as the exhausted loop counter.
+; Clobbers ecx, edx, eax, xmm0-xmm7, flags.
+ALIGN 16
+intel_aes_encrypt_init_192 PROC
+
+        mov KEY, [esp + 1*4 + 0*4]
+        mov KS, [esp + 1*4 + 1*4]
+
+        ; xmm1 = key bytes 0..15, xmm3 = key bytes 16..23 in its low half
+        ; (high half zero).
+        pxor xmm3, xmm3
+        movdqu xmm1, [KEY]
+        pinsrd xmm3, DWORD PTR [16 + KEY], 0
+        pinsrd xmm3, DWORD PTR [20 + KEY], 1
+
+        movdqu [KS], xmm1
+        movdqa xmm5, xmm3       ; xmm5 = pending low 8 bytes of the next key
+
+        lea ITR, Lcon1
+        movdqu xmm0, [ITR]      ; round constant, starts at 1
+        lea ITR, Lmask192
+        movdqu xmm4, [ITR]      ; pshufb control
+
+        mov ITR, 4
+
+Lenc_192_ks_loop:
+        ; --- first expansion step of this pass ---
+        movdqa xmm2, xmm3
+        pshufb xmm2, xmm4
+        aesenclast xmm2, xmm0
+        pslld xmm0, 1
+
+        ; Running XOR across the four dwords of xmm1 and the two dwords of
+        ; xmm3.
+        movdqa xmm6, xmm1
+        movdqa xmm7, xmm3
+        pslldq xmm6, 4
+        pslldq xmm7, 4
+        pxor xmm1, xmm6
+        pxor xmm3, xmm7
+        pslldq xmm6, 4
+        pxor xmm1, xmm6
+        pslldq xmm6, 4
+        pxor xmm1, xmm6
+        pxor xmm1, xmm2
+        pshufd xmm2, xmm1, 0ffh ; broadcast top dword of the new xmm1
+        pxor xmm3, xmm2
+
+        ; Repack the 24-byte state into 16-byte round keys:
+        ;   xmm5 = [pending 8 bytes | low 8 bytes of xmm1]
+        ;   xmm6 = [high 8 bytes of xmm1 | low 8 bytes of xmm3]
+        movdqa xmm6, xmm1
+        shufpd xmm5, xmm1, 00h
+        shufpd xmm6, xmm3, 01h
+
+        movdqu [16 + KS], xmm5
+        movdqu [32 + KS], xmm6
+
+        ; --- second expansion step of this pass ---
+        movdqa xmm2, xmm3
+        pshufb xmm2, xmm4
+        aesenclast xmm2, xmm0
+        pslld xmm0, 1
+
+        movdqa xmm6, xmm1
+        movdqa xmm7, xmm3
+        pslldq xmm6, 4
+        pslldq xmm7, 4
+        pxor xmm1, xmm6
+        pxor xmm3, xmm7
+        pslldq xmm6, 4
+        pxor xmm1, xmm6
+        pslldq xmm6, 4
+        pxor xmm1, xmm6
+        pxor xmm1, xmm2
+        pshufd xmm2, xmm1, 0ffh
+        pxor xmm3, xmm2
+
+        ; This step's xmm1 is a complete round key; xmm3 is carried over as
+        ; the pending half for the next pass.
+        movdqu [48 + KS], xmm1
+        movdqa xmm5, xmm3
+
+        lea KS, [48 + KS]
+
+        dec ITR
+        jnz Lenc_192_ks_loop
+
+        ; Extra 16-byte store past round key 12 (see the NOTE in the header).
+        movdqu [16 + KS], xmm5
+ret
+intel_aes_encrypt_init_192 ENDP
+
+; intel_aes_decrypt_init_192
+; Same arguments as intel_aes_encrypt_init_192. Calls that routine, then
+; reverses the order of round keys 0..12 in place and applies aesimc to keys
+; 1..11 (keys 0 and 12 are only swapped).
+ALIGN 16
+intel_aes_decrypt_init_192 PROC
+        mov KEY, [esp + 1*4 + 0*4]
+        mov KS, [esp + 1*4 + 1*4]
+
+        push KS
+        push KEY
+
+        call intel_aes_encrypt_init_192
+
+        ; Restore the original pointers (the callee advanced KS) and drop
+        ; the pushed arguments.
+        pop KEY
+        pop KS
+
+        movdqu xmm0, [0*16 + KS]
+        movdqu xmm1, [12*16 + KS]
+        movdqu [12*16 + KS], xmm0
+        movdqu [0*16 + KS], xmm1
+
+        ; Swap keys i and 12-i for i = 1..5, applying aesimc to both.
+        i = 1
+        WHILE i LT 6
+            movdqu xmm0, [i*16 + KS]
+            movdqu xmm1, [(12-i)*16 + KS]
+
+            aesimc xmm0, xmm0
+            aesimc xmm1, xmm1
+
+            movdqu [(12-i)*16 + KS], xmm0
+            movdqu [i*16 + KS], xmm1
+
+            i = i+1
+        ENDM
+
+        ; Middle key: aesimc in place.
+        movdqu xmm0, [6*16 + KS]
+        aesimc xmm0, xmm0
+        movdqu [6*16 + KS], xmm0
+        ret
+intel_aes_decrypt_init_192 ENDP
+
+; intel_aes_encrypt_init_256
+; arg0 = pointer to a 32-byte key, arg1 = pointer to the schedule buffer.
+; Stores the two key halves as round keys 0 and 1, runs six loop passes that
+; each store two more round keys and advance KS by 32, then stores one final
+; key: 15 round keys (240 bytes) in total.
+; No status value is set explicitly; eax ends as the exhausted loop counter.
+; Clobbers ecx, edx, eax, xmm0-xmm6, flags.
+ALIGN 16
+intel_aes_encrypt_init_256 PROC
+
+        mov KEY, [esp + 1*4 + 0*4]
+        mov KS, [esp + 1*4 + 1*4]
+        movdqu xmm1, [16*0 + KEY]
+        movdqu xmm3, [16*1 + KEY]
+
+        movdqu [16*0 + KS], xmm1
+        movdqu [16*1 + KS], xmm3
+
+        lea ITR, Lcon1
+        movdqu xmm0, [ITR]      ; round constant, starts at 1
+        lea ITR, Lmask256
+        movdqu xmm5, [ITR]      ; pshufb control
+
+        ; All-zero "round key" for the aesenclast in the odd-key step, so
+        ; that step adds no round constant.
+        pxor xmm6, xmm6
+
+        mov ITR, 6
+
+Lenc_256_ks_loop:
+
+        ; Even key: rotate + substitute the top dword of xmm3, XOR the round
+        ; constant, fold into the running XOR of xmm1.
+        movdqa xmm2, xmm3
+        pshufb xmm2, xmm5
+        aesenclast xmm2, xmm0
+        pslld xmm0, 1
+        movdqa xmm4, xmm1
+        pslldq xmm4, 4
+        pxor xmm1, xmm4
+        pslldq xmm4, 4
+        pxor xmm1, xmm4
+        pslldq xmm4, 4
+        pxor xmm1, xmm4
+        pxor xmm1, xmm2
+        movdqu [16*2 + KS], xmm1
+
+        ; Odd key: broadcast the top dword of the new xmm1 (no rotation),
+        ; substitute it, fold into the running XOR of xmm3.
+        pshufd xmm2, xmm1, 0ffh
+        aesenclast xmm2, xmm6
+        movdqa xmm4, xmm3
+        pslldq xmm4, 4
+        pxor xmm3, xmm4
+        pslldq xmm4, 4
+        pxor xmm3, xmm4
+        pslldq xmm4, 4
+        pxor xmm3, xmm4
+        pxor xmm3, xmm2
+        movdqu [16*3 + KS], xmm3
+
+        lea KS, [32 + KS]
+        dec ITR
+        jnz Lenc_256_ks_loop
+
+        ; Final (15th) round key: one more even-key step.
+        movdqa xmm2, xmm3
+        pshufb xmm2, xmm5
+        aesenclast xmm2, xmm0
+        movdqa xmm4, xmm1
+        pslldq xmm4, 4
+        pxor xmm1, xmm4
+        pslldq xmm4, 4
+        pxor xmm1, xmm4
+        pslldq xmm4, 4
+        pxor xmm1, xmm4
+        pxor xmm1, xmm2
+        movdqu [16*2 + KS], xmm1
+
+        ret
+intel_aes_encrypt_init_256 ENDP
+
+; intel_aes_decrypt_init_256
+; Same arguments as intel_aes_encrypt_init_256. Calls that routine, then
+; reverses the order of round keys 0..14 in place and applies aesimc to keys
+; 1..13 (keys 0 and 14 are only swapped).
+ALIGN 16
+intel_aes_decrypt_init_256 PROC
+        mov KEY, [esp + 1*4 + 0*4]
+        mov KS, [esp + 1*4 + 1*4]
+
+        push KS
+        push KEY
+
+        call intel_aes_encrypt_init_256
+
+        ; Restore the original pointers (the callee advanced KS) and drop
+        ; the pushed arguments.
+        pop KEY
+        pop KS
+
+        movdqu xmm0, [0*16 + KS]
+        movdqu xmm1, [14*16 + KS]
+        movdqu [14*16 + KS], xmm0
+        movdqu [0*16 + KS], xmm1
+
+        ; Swap keys i and 14-i for i = 1..6, applying aesimc to both.
+        i = 1
+        WHILE i LT 7
+            movdqu xmm0, [i*16 + KS]
+            movdqu xmm1, [(14-i)*16 + KS]
+
+            aesimc xmm0, xmm0
+            aesimc xmm1, xmm1
+
+            movdqu [(14-i)*16 + KS], xmm0
+            movdqu [i*16 + KS], xmm1
+
+            i = i+1
+        ENDM
+
+        ; Middle key: aesimc in place.
+        movdqu xmm0, [7*16 + KS]
+        aesimc xmm0, xmm0
+        movdqu [7*16 + KS], xmm0
+        ret
+intel_aes_decrypt_init_256 ENDP
+
+
+
+; gen_aes_cbc_enc_func rnds
+; Expands to the complete body of a CBC encryption procedure with `rnds`
+; rounds. Stack arguments are read exactly as in gen_aes_ecb_func (arg0 ->
+; ctx, arg1 -> output, arg4 -> input, arg5 -> inputLen).
+; The 16-byte chaining value is loaded from [252+ctx] on entry and the last
+; ciphertext block is written back there on exit.
+; NOTE(review): 252 is presumably the offset of the IV field in the C
+; context structure - confirm against its declaration.
+; Blocks are processed strictly one at a time; a trailing remainder shorter
+; than 16 bytes is ignored.
+; Returns 0 in eax. Preserves edi. Clobbers ecx, edx, xmm0-xmm7, flags.
+gen_aes_cbc_enc_func MACRO rnds
+
+LOCAL loop1
+LOCAL bail
+
+        push inputLen
+
+        mov ctx, [esp + 2*4 + 0*4]
+        mov output, [esp + 2*4 + 1*4]
+        mov input, [esp + 2*4 + 4*4]
+        mov inputLen, [esp + 2*4 + 5*4]
+
+        movdqu xmm0, [252+ctx]          ; xmm0 = chaining value
+
+        ; Round keys 0..4 stay resident in xmm2..xmm6 for the whole call;
+        ; keys 5..rnds are reloaded from memory for every block.
+        movdqu xmm2, [0*16 + ctx]
+        movdqu xmm3, [1*16 + ctx]
+        movdqu xmm4, [2*16 + ctx]
+        movdqu xmm5, [3*16 + ctx]
+        movdqu xmm6, [4*16 + ctx]
+
+loop1:
+        cmp inputLen, 1*16
+        jb bail
+
+        ; xmm0 = previous ciphertext ^ plaintext ^ round key 0
+        movdqu xmm1, [input]
+        pxor xmm1, xmm2
+        pxor xmm0, xmm1
+
+        aesenc xmm0, xmm3
+        aesenc xmm0, xmm4
+        aesenc xmm0, xmm5
+        aesenc xmm0, xmm6
+
+        i = 5
+    WHILE i LT rnds
+            movdqu xmm7, [i*16 + ctx]
+            aesenc xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu xmm7, [rnds*16 + ctx]
+        aesenclast xmm0, xmm7
+
+        movdqu [output], xmm0
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+        movdqu [252+ctx], xmm0          ; save chaining value for next call
+
+        xor eax, eax
+        pop inputLen
+        ret
+
+ENDM
+
+; gen_aes_cbc_dec_func rnds
+; Expands to the complete body of a CBC decryption procedure with `rnds`
+; rounds. Stack arguments are read exactly as in gen_aes_ecb_func.
+; The chaining value lives at [252+ctx] (same location the CBC encrypt body
+; uses) and is updated to the last ciphertext block consumed.
+; Seven blocks are decrypted per iteration while at least 7*16 bytes remain,
+; then single blocks; a trailing remainder shorter than 16 bytes is ignored.
+; Round keys are applied in 0..rnds order with aesdec, so a schedule as
+; produced by intel_aes_decrypt_init_* is expected.
+; Returns 0 in eax. Preserves edi. Clobbers ecx, edx, xmm0-xmm7, flags.
+gen_aes_cbc_dec_func MACRO rnds
+
+LOCAL loop7
+LOCAL loop1
+LOCAL dec1
+LOCAL bail
+
+        push inputLen
+
+        mov ctx, [esp + 2*4 + 0*4]
+        mov output, [esp + 2*4 + 1*4]
+        mov input, [esp + 2*4 + 4*4]
+        mov inputLen, [esp + 2*4 + 5*4]
+
+; --- seven blocks per iteration ---
+loop7:
+        cmp inputLen, 7*16
+        jb dec1
+
+        movdqu xmm0, [0*16 + input]
+        movdqu xmm1, [1*16 + input]
+        movdqu xmm2, [2*16 + input]
+        movdqu xmm3, [3*16 + input]
+        movdqu xmm4, [4*16 + input]
+        movdqu xmm5, [5*16 + input]
+        movdqu xmm6, [6*16 + input]
+
+        movdqu xmm7, [0*16 + ctx]
+        pxor xmm0, xmm7
+        pxor xmm1, xmm7
+        pxor xmm2, xmm7
+        pxor xmm3, xmm7
+        pxor xmm4, xmm7
+        pxor xmm5, xmm7
+        pxor xmm6, xmm7
+
+        i = 1
+        WHILE i LT rnds
+            aes_dec_rnd i
+            i = i+1
+            ENDM
+        aes_dec_last_rnd rnds
+
+        ; XOR each decrypted block with the ciphertext block before it: the
+        ; stored chaining value for block 0, input blocks 0..5 (re-read from
+        ; memory) for blocks 1..6. xmm7 ends up holding input block 6, the
+        ; next chaining value. All of these reads happen before any store to
+        ; output.
+        ; NOTE(review): presumably ordered this way so that input == output
+        ; (in-place operation) works - confirm with callers.
+        movdqu xmm7, [252 + ctx]
+        pxor xmm0, xmm7
+        movdqu xmm7, [0*16 + input]
+        pxor xmm1, xmm7
+        movdqu xmm7, [1*16 + input]
+        pxor xmm2, xmm7
+        movdqu xmm7, [2*16 + input]
+        pxor xmm3, xmm7
+        movdqu xmm7, [3*16 + input]
+        pxor xmm4, xmm7
+        movdqu xmm7, [4*16 + input]
+        pxor xmm5, xmm7
+        movdqu xmm7, [5*16 + input]
+        pxor xmm6, xmm7
+        movdqu xmm7, [6*16 + input]
+
+        movdqu [0*16 + output], xmm0
+        movdqu [1*16 + output], xmm1
+        movdqu [2*16 + output], xmm2
+        movdqu [3*16 + output], xmm3
+        movdqu [4*16 + output], xmm4
+        movdqu [5*16 + output], xmm5
+        movdqu [6*16 + output], xmm6
+        movdqu [252 + ctx], xmm7
+
+        lea input, [7*16 + input]
+        lea output, [7*16 + output]
+        sub inputLen, 7*16
+        jmp loop7
+dec1:
+
+        ; Single-block tail: xmm3 carries the chaining value in a register.
+        movdqu xmm3, [252 + ctx]
+
+loop1:
+        cmp inputLen, 1*16
+        jb bail
+
+        movdqu xmm0, [input]
+        movdqa xmm4, xmm0               ; keep ciphertext: next chaining value
+        movdqu xmm7, [0*16 + ctx]
+        pxor xmm0, xmm7
+
+        i = 1
+    WHILE i LT rnds
+            movdqu xmm7, [i*16 + ctx]
+            aesdec xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu xmm7, [rnds*16 + ctx]
+        aesdeclast xmm0, xmm7
+        pxor xmm3, xmm0                 ; plaintext = decrypted ^ chaining value
+
+        movdqu [output], xmm3
+        movdqa xmm3, xmm4
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+        movdqu [252 + ctx], xmm3        ; save chaining value for next call
+        xor eax, eax
+        pop inputLen
+        ret
+ENDM
+
+; CBC entry points for 10, 12 and 14 rounds.
+ALIGN 16
+intel_aes_encrypt_cbc_128 PROC
+gen_aes_cbc_enc_func 10
+intel_aes_encrypt_cbc_128 ENDP
+
+ALIGN 16
+intel_aes_encrypt_cbc_192 PROC
+gen_aes_cbc_enc_func 12
+intel_aes_encrypt_cbc_192 ENDP
+
+ALIGN 16
+intel_aes_encrypt_cbc_256 PROC
+gen_aes_cbc_enc_func 14
+intel_aes_encrypt_cbc_256 ENDP
+
+ALIGN 16
+intel_aes_decrypt_cbc_128 PROC
+gen_aes_cbc_dec_func 10
+intel_aes_decrypt_cbc_128 ENDP
+
+ALIGN 16
+intel_aes_decrypt_cbc_192 PROC
+gen_aes_cbc_dec_func 12
+intel_aes_decrypt_cbc_192 ENDP
+
+ALIGN 16
+intel_aes_decrypt_cbc_256 PROC
+gen_aes_cbc_dec_func 14
+intel_aes_decrypt_cbc_256 ENDP
+
+
+
+; Extra register roles for the CTR generator.
+;   ctrCtx : first the pointer passed as arg0, later reused to hold the
+;            running 32-bit counter value in host byte order
+;   CTR    : scratch for the byte-swapped, key-masked counter dword
+ctrCtx textequ <esi>
+CTR textequ <ebx>
+
+; gen_aes_ctr_func rnds
+; Expands to the complete body of a CTR-mode procedure with `rnds` rounds.
+; Stack arguments read: arg0 -> ctrCtx, arg1 -> output, arg4 -> input,
+; arg5 -> inputLen; arg2 and arg3 are never accessed.
+; Layout read through arg0: dword at offset 4 = pointer used as ctx (round
+; keys at [i*16 + ctx]); 16 bytes at offset 8 = counter block.
+; NOTE(review): those offsets presumably mirror the C counter-context
+; structure - confirm against its declaration.
+; Only the last dword of the counter block is incremented, as a big-endian
+; 32-bit value (bswap / inc / bswap); there is no carry into the other
+; twelve bytes.
+; A 16-byte-aligned scratch area of seven slots on the stack holds the next
+; seven counter blocks, each already XORed with round key 0. The updated
+; counter block is written back to offset 8 on exit.
+; Seven blocks per iteration while at least 7*16 bytes remain, then single
+; blocks; a trailing remainder shorter than 16 bytes is ignored.
+; Returns 0 in eax. Preserves edi, esi, ebx, ebp. Clobbers ecx, edx,
+; xmm0-xmm7, flags.
+; The label enc1 is declared LOCAL but never defined or referenced.
+; The `WHILE i LE 7` unrolling below assumes rnds is greater than 7.
+gen_aes_ctr_func MACRO rnds
+
+LOCAL loop7
+LOCAL loop1
+LOCAL enc1
+LOCAL bail
+
+        push inputLen
+        push ctrCtx
+        push CTR
+        push ebp
+
+        ; Four registers pushed + return address = 4*5 bytes below arg0.
+        mov ctrCtx, [esp + 4*5 + 0*4]
+        mov output, [esp + 4*5 + 1*4]
+        mov input, [esp + 4*5 + 4*4]
+        mov inputLen, [esp + 4*5 + 5*4]
+
+        mov ctx, [4+ctrCtx]
+
+        ; ebp remembers the original esp; esp is lowered and aligned to 16
+        ; so the scratch slots can be written with movdqa.
+        mov ebp, esp
+        sub esp, 7*16
+        and esp, -16
+
+        ; xmm0 = counter block ^ round key 0; ctrCtx = counter dword in host
+        ; byte order. From here on ctrCtx no longer holds the arg0 pointer.
+        movdqu xmm0, [8+ctrCtx]
+        mov ctrCtx, [ctrCtx + 8 + 3*4]
+        bswap ctrCtx
+        movdqu xmm1, [ctx + 0*16]
+
+        pxor xmm0, xmm1
+
+        ; Fill all seven slots with the masked block; slot 0 is final, slots
+        ; 1..6 get their last dword patched below.
+        movdqa [esp + 0*16], xmm0
+        movdqa [esp + 1*16], xmm0
+        movdqa [esp + 2*16], xmm0
+        movdqa [esp + 3*16], xmm0
+        movdqa [esp + 4*16], xmm0
+        movdqa [esp + 5*16], xmm0
+        movdqa [esp + 6*16], xmm0
+
+        ; Slot k (k = 1..6): last dword = bswap(counter + k) ^ last dword of
+        ; round key 0.
+        inc ctrCtx
+        mov CTR, ctrCtx
+        bswap CTR
+        xor CTR, [ctx + 3*4]
+        mov [esp + 1*16 + 3*4], CTR
+
+        inc ctrCtx
+        mov CTR, ctrCtx
+        bswap CTR
+        xor CTR, [ctx + 3*4]
+        mov [esp + 2*16 + 3*4], CTR
+
+        inc ctrCtx
+        mov CTR, ctrCtx
+        bswap CTR
+        xor CTR, [ctx + 3*4]
+        mov [esp + 3*16 + 3*4], CTR
+
+        inc ctrCtx
+        mov CTR, ctrCtx
+        bswap CTR
+        xor CTR, [ctx + 3*4]
+        mov [esp + 4*16 + 3*4], CTR
+
+        inc ctrCtx
+        mov CTR, ctrCtx
+        bswap CTR
+        xor CTR, [ctx + 3*4]
+        mov [esp + 5*16 + 3*4], CTR
+
+        inc ctrCtx
+        mov CTR, ctrCtx
+        bswap CTR
+        xor CTR, [ctx + 3*4]
+        mov [esp + 6*16 + 3*4], CTR
+
+
+; --- seven blocks per iteration ---
+loop7:
+        cmp inputLen, 7*16
+        jb loop1
+
+        ; The slots already include round key 0, so no initial pxor here.
+        movdqu xmm0, [0*16 + esp]
+        movdqu xmm1, [1*16 + esp]
+        movdqu xmm2, [2*16 + esp]
+        movdqu xmm3, [3*16 + esp]
+        movdqu xmm4, [4*16 + esp]
+        movdqu xmm5, [5*16 + esp]
+        movdqu xmm6, [6*16 + esp]
+
+        ; Rounds 1..7, each followed by the integer work that refreshes slot
+        ; i-1 with the next counter value for the following batch.
+        i = 1
+        WHILE i LE 7
+            aes_rnd i
+
+            inc ctrCtx
+            mov CTR, ctrCtx
+            bswap CTR
+            xor CTR, [ctx + 3*4]
+            mov [esp + (i-1)*16 + 3*4], CTR
+
+            i = i+1
+        ENDM
+        ; Remaining rounds 8..rnds-1 (i continues from 8), then the last one.
+        WHILE i LT rnds
+            aes_rnd i
+            i = i+1
+            ENDM
+        aes_last_rnd rnds
+
+        ; output = keystream ^ input
+        movdqu xmm7, [0*16 + input]
+        pxor xmm0, xmm7
+        movdqu xmm7, [1*16 + input]
+        pxor xmm1, xmm7
+        movdqu xmm7, [2*16 + input]
+        pxor xmm2, xmm7
+        movdqu xmm7, [3*16 + input]
+        pxor xmm3, xmm7
+        movdqu xmm7, [4*16 + input]
+        pxor xmm4, xmm7
+        movdqu xmm7, [5*16 + input]
+        pxor xmm5, xmm7
+        movdqu xmm7, [6*16 + input]
+        pxor xmm6, xmm7
+
+        movdqu [0*16 + output], xmm0
+        movdqu [1*16 + output], xmm1
+        movdqu [2*16 + output], xmm2
+        movdqu [3*16 + output], xmm3
+        movdqu [4*16 + output], xmm4
+        movdqu [5*16 + output], xmm5
+        movdqu [6*16 + output], xmm6
+
+        lea input, [7*16 + input]
+        lea output, [7*16 + output]
+        sub inputLen, 7*16
+        jmp loop7
+
+
+; --- one block per iteration (at most six, since fewer than 7*16 bytes
+; remain when this loop is entered) ---
+loop1:
+        cmp inputLen, 1*16
+        jb bail
+
+        ; Consume the next prepared slot by advancing esp past it.
+        movdqu xmm0, [esp]
+        add esp, 16
+
+        i = 1
+    WHILE i LT rnds
+            movdqu xmm7, [i*16 + ctx]
+            aesenc xmm0, xmm7
+            i = i+1
+        ENDM
+        movdqu xmm7, [rnds*16 + ctx]
+        aesenclast xmm0, xmm7
+
+        movdqu xmm7, [input]
+        pxor xmm0, xmm7
+        movdqu [output], xmm0
+
+        lea input, [1*16 + input]
+        lea output, [1*16 + output]
+        sub inputLen, 1*16
+        jmp loop1
+
+bail:
+
+        ; Reload the arg0 pointer via ebp (esp has moved), take the first
+        ; unused slot, undo the round-key-0 mask and store it back as the
+        ; counter block for the next call.
+        mov ctrCtx, [ebp + 4*5 + 0*4]
+        movdqu xmm0, [esp]
+        movdqu xmm1, [ctx + 0*16]
+        pxor xmm0, xmm1
+        movdqu [8+ctrCtx], xmm0
+
+
+        xor eax, eax
+        mov esp, ebp
+        pop ebp
+        pop CTR
+        pop ctrCtx
+        pop inputLen
+        ret
+ENDM
+
+
+; CTR entry points for 10, 12 and 14 rounds.
+ALIGN 16
+intel_aes_encrypt_ctr_128 PROC
+gen_aes_ctr_func 10
+intel_aes_encrypt_ctr_128 ENDP
+
+ALIGN 16
+intel_aes_encrypt_ctr_192 PROC
+gen_aes_ctr_func 12
+intel_aes_encrypt_ctr_192 ENDP
+
+ALIGN 16
+intel_aes_encrypt_ctr_256 PROC
+gen_aes_ctr_func 14
+intel_aes_encrypt_ctr_256 ENDP
+
+
+END
|