diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
commit | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch) | |
tree | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/spdk/intel-ipsec-mb/avx2/gcm_avx_gen4.asm | |
parent | Initial commit. (diff) | |
download | ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip |
Adding upstream version 14.2.21. (tags: upstream/14.2.21, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx2/gcm_avx_gen4.asm')
-rw-r--r-- | src/spdk/intel-ipsec-mb/avx2/gcm_avx_gen4.asm | 3270 |
1 file changed, 3270 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx2/gcm_avx_gen4.asm b/src/spdk/intel-ipsec-mb/avx2/gcm_avx_gen4.asm new file mode 100644 index 00000000..48f2609a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx2/gcm_avx_gen4.asm @@ -0,0 +1,3270 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; The details of the implementation is explained in: +; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012. +; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; 
+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "os.asm" +%include "reg_sizes.asm" +%include "gcm_defines.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_avx_gen4.asm!" +%endif +%endif +%endif + +;; Decide on AES-GCM key size to compile for +%ifdef GCM128_MODE +%define NROUNDS 9 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4 +%endif + +%ifdef GCM192_MODE +%define NROUNDS 11 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4 +%endif + +%ifdef GCM256_MODE +%define NROUNDS 13 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4 +%endif + +section .text +default rel + +; need to push 4 registers into stack to maintain +%define STACK_OFFSET 8*4 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define 
VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. >>1 ) +; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 + vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0 + vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0 + vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1 + vpxor %%GH, %%GH, %%T3 + + + vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs + vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs + + vpxor %%T1, %%T1, %%T3 + vpxor %%GH, %%GH, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%GH, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs + + vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%GH, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq %%GH, %%T3, %%GH, 0x10 + vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor %%GH, %%GH, %%T2 ; second phase of the reduction 
complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vpxor %%GH, %%GH, %%T1 ; the result is in %%GH + +%endmacro + + +; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4 +; functions, but are kept to allow users to switch cpu architectures between calls +; of pre, init, update, and finalize. +%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + + ; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa %%T5, %%HK + + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly + vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_2_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly + vmovdqu [%%GDATA + HashKey_3], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_3_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly + vmovdqu [%%GDATA + HashKey_4], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_4_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly + vmovdqu [%%GDATA + HashKey_5], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_5_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly + vmovdqu [%%GDATA + HashKey_6], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_6_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly + vmovdqu [%%GDATA + HashKey_7], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor 
%%T1, %%T5 + vmovdqu [%%GDATA + HashKey_7_k], %%T1 + + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly + vmovdqu [%%GDATA + HashKey_8], %%T5 + vpshufd %%T1, %%T5, 01001110b + vpxor %%T1, %%T5 + vmovdqu [%%GDATA + HashKey_8_k], %%T1 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. +; Returns 0 if data has length 0. +; Input: The input data (INPUT), that data's length (LENGTH). +; Output: The packed xmm register (OUTPUT). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 6 +%define %%OUTPUT %1 ; %%OUTPUT is an xmm register +%define %%INPUT %2 +%define %%LENGTH %3 +%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers +%define %%COUNTER %5 +%define %%TMP1 %6 + + vpxor %%OUTPUT, %%OUTPUT + mov %%COUNTER, %%LENGTH + mov %%END_READ_LOCATION, %%INPUT + add %%END_READ_LOCATION, %%LENGTH + xor %%TMP1, %%TMP1 + + + cmp %%COUNTER, 8 + jl %%_byte_loop_2 + vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists + je %%_done + + sub %%COUNTER, 8 + +%%_byte_loop_1: ;Read in data 1 byte at a time while data is left + shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_1 + vpinsrq %%OUTPUT, %%TMP1, 1 + jmp %%_done + +%%_byte_loop_2: ;Read in data 1 byte at a time while data is left + ;; NOTE: in current implementation check for zero length is obsolete here. + ;; The adequate checks are done by callers of this macro. 
+ ;; cmp %%COUNTER, 0 + ;; je %%_done + shl %%TMP1, 8 ;This loop handles when no bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_2 + vpinsrq %%OUTPUT, %%TMP1, 0 +%%_done: + +%endmacro ; READ_SMALL_DATA_INPUT + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 14 +%define %%A_IN %1 +%define %%A_LEN %2 +%define %%AAD_HASH %3 +%define %%HASH_KEY %4 +%define %%XTMP1 %5 ; xmm temp reg 5 +%define %%XTMP2 %6 +%define %%XTMP3 %7 +%define %%XTMP4 %8 +%define %%XTMP5 %9 ; xmm temp reg 5 +%define %%T1 %10 ; temp reg 1 +%define %%T2 %11 +%define %%T3 %12 +%define %%T4 %13 +%define %%T5 %14 ; temp reg 5 + + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + vpxor %%AAD_HASH, %%AAD_HASH + + cmp %%T2, 16 + jl %%_get_small_AAD_block + +%%_get_AAD_loop16: + + vmovdqu %%XTMP1, [%%T1] + ;byte-reflect the AAD data + vpshufb %%XTMP1, [SHUF_MASK] + vpxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + + sub %%T2, 16 + je %%_CALC_AAD_done + + add %%T1, 16 + cmp %%T2, 16 + jge %%_get_AAD_loop16 + +%%_get_small_AAD_block: + READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 + ;byte-reflect the AAD data + vpshufb %%XTMP1, [SHUF_MASK] + vpxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + +%%_CALC_AAD_done: + +%endmacro ; CALC_AAD_HASH + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between 
update calls. +; Requires the input data be at least 1 byte long. +; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), +; and whether encoding or decoding (ENC_DEC) +; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 8 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%DATA_OFFSET %6 +%define %%AAD_HASH %7 +%define %%ENC_DEC %8 + + mov r13, [%%GDATA_CTX + PBlockLen] + cmp r13, 0 + je %%_partial_block_done ;Leave Macro if no partial blocks + + cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading + jl %%_fewer_than_16_bytes + VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register + jmp %%_data_read + +%%_fewer_than_16_bytes: + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 + +%%_data_read: ;Finished reading in data + + + vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + + lea r12, [SHIFT_MASK] + + add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + vmovdqu xmm2, [r12] ; get the appropriate shuffle mask + vpshufb xmm9, xmm2 ;shift right r13 bytes + +%ifidn %%ENC_DEC, DEC + vmovdqa xmm3, xmm1 + vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and 
shift mask accordingly + sub r12, r15 +%%_no_extra_mask_1: + + vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + vpand xmm3, xmm1 + vpshufb xmm3, [SHUF_MASK] + vpshufb xmm3, xmm2 + vpxor %%AAD_HASH, xmm3 + + + cmp r15,0 + jl %%_partial_incomplete_1 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_dec_done +%%_partial_incomplete_1: +%ifidn __OUTPUT_FORMAT__, win64 + mov rax, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], rax +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif +%%_dec_done: + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + +%else + vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_2: + + vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + vpshufb xmm9, [SHUF_MASK] + vpshufb xmm9, xmm2 + vpxor %%AAD_HASH, xmm9 + + cmp r15,0 + jl %%_partial_incomplete_2 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_encode_done +%%_partial_incomplete_2: +%ifidn __OUTPUT_FORMAT__, win64 + mov rax, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], rax +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif +%%_encode_done: + vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + + vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + vpshufb xmm9, xmm2 +%endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output encrypted 
Bytes + cmp r15,0 + jl %%_partial_fill + mov r12, r13 + mov r13, 16 + sub r13, r12 ; Set r13 to be the number of bytes to write out + jmp %%_count_set +%%_partial_fill: + mov r13, %%PLAIN_CYPH_LEN +%%_count_set: + vmovq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + vpsrldq xmm9, xmm9, 8 + vmovq rax, xmm9 + sub r13, 8 +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +%macro GHASH_SINGLE_MUL 9 +%define %%GDATA %1 +%define %%HASHKEY %2 +%define %%CIPHER %3 +%define %%STATE_11 %4 +%define %%STATE_00 %5 +%define %%STATE_MID %6 +%define %%T1 %7 +%define %%T2 %8 +%define %%FIRST %9 + + vmovdqu %%T1, [%%GDATA + %%HASHKEY] +%ifidn %%FIRST, first + vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0 + vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1 + vpxor %%STATE_MID, %%STATE_MID, %%T2 +%else + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11 + vpxor %%STATE_11, %%STATE_11, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00 + vpxor %%STATE_00, %%STATE_00, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01 + vpxor %%STATE_MID, %%STATE_MID, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 + vpxor %%STATE_MID, %%STATE_MID, %%T2 +%endif + +%endmacro + +; if a = number of total plaintext bytes +; b = floor(a/16) +; %%num_initial_blocks = b mod 8; +; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext +; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified. 
+; Updated AAD_HASH is returned in %%T3 + +%macro INITIAL_BLOCKS 23 +%define %%GDATA_KEY %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%LENGTH %4 +%define %%DATA_OFFSET %5 +%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %7 +%define %%T2 %8 +%define %%T3 %9 +%define %%T4 %10 +%define %%T5 %11 +%define %%CTR %12 +%define %%XMM1 %13 +%define %%XMM2 %14 +%define %%XMM3 %15 +%define %%XMM4 %16 +%define %%XMM5 %17 +%define %%XMM6 %18 +%define %%XMM7 %19 +%define %%XMM8 %20 +%define %%T6 %21 +%define %%T_key %22 +%define %%ENC_DEC %23 + +%assign i (8-%%num_initial_blocks) + ;; Move AAD_HASH to temp reg + vmovdqu %%T2, %%XMM8 + ;; Start AES for %%num_initial_blocks blocks + ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0 + vmovdqa reg(i), %%CTR + vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +%if(%%num_initial_blocks>0) +vmovdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vpxor reg(i),reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep + + +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%endif ; %if(%%num_initial_blocks>0) + + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), reg(i), %%T1 + ;; Write back ciphertext for %%num_initial_blocks blocks + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + ;; Prepare ciphertext for GHASH computations + vpshufb reg(i), 
[SHUF_MASK] +%assign i (i+1) +%endrep + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%assign i (9-%%num_initial_blocks) +%if(%%num_initial_blocks>0) + vmovdqa %%T3, reg(i) +%assign i (i+1) +%endif +%rep %%num_initial_blocks-1 + vmovdqu [rsp + TMP %+ i], reg(i) +%assign i (i+1) +%endrep + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Haskey_i_k holds XORed values of the low and high parts of + ;; the Haskey_i + vpaddd %%XMM1, %%CTR, [ONE] ; INCR Y0 + vpaddd %%XMM2, %%CTR, [TWO] ; INCR Y0 + vpaddd %%XMM3, %%XMM1, [TWO] ; INCR Y0 + vpaddd %%XMM4, %%XMM2, [TWO] ; INCR Y0 + vpaddd %%XMM5, %%XMM3, [TWO] ; INCR Y0 + vpaddd %%XMM6, %%XMM4, [TWO] ; INCR Y0 + vpaddd %%XMM7, %%XMM5, [TWO] ; INCR Y0 + vpaddd %%XMM8, %%XMM6, [TWO] ; INCR Y0 + vmovdqa %%CTR, %%XMM8 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + vmovdqu %%T_key, [%%GDATA_KEY+16*0] + vpxor %%XMM1, %%XMM1, %%T_key + vpxor %%XMM2, %%XMM2, %%T_key + vpxor %%XMM3, %%XMM3, %%T_key + vpxor %%XMM4, %%XMM4, %%T_key + vpxor %%XMM5, %%XMM5, %%T_key + vpxor %%XMM6, %%XMM6, %%T_key + vpxor %%XMM7, %%XMM7, %%T_key + vpxor %%XMM8, %%XMM8, %%T_key + +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks) + +%define %%T4_2 %%T4 +%if(%%num_initial_blocks>0) + ;; Hash in AES state + ;; T2 - incoming AAD hash + vpxor %%T2, %%T3 + + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*1] + 
vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*2] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>1) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*3] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*4] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>2) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>3) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*5] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + 
vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*6] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>4) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*7] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*8] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>5) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + + vmovdqu %%T_key, [%%GDATA_KEY+16*9] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + +%ifndef GCM128_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*10] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) 
+%if(%%num_initial_blocks>6) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + +%ifdef GCM128_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*10] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key +%endif + +%ifdef GCM192_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*11] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*12] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key +%endif +%ifdef GCM256_MODE + vmovdqu %%T_key, [%%GDATA_KEY+16*11] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*12] + vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%if(%%num_initial_blocks>7) + ;; GDATA, HASHKEY, CIPHER, + ;; STATE_11, STATE_00, STATE_MID, T1, T2 + vmovdqu %%T2, [rsp + TMP %+ j] + GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \ + %%T1, %%T4, %%T6, %%T5, %%T3, not_first +%endif + +%ifdef GCM256_MODE ; GCM256 + vmovdqu %%T_key, [%%GDATA_KEY+16*13] + 
vaesenc %%XMM1, %%T_key + vaesenc %%XMM2, %%T_key + vaesenc %%XMM3, %%T_key + vaesenc %%XMM4, %%T_key + vaesenc %%XMM5, %%T_key + vaesenc %%XMM6, %%T_key + vaesenc %%XMM7, %%T_key + vaesenc %%XMM8, %%T_key + + vmovdqu %%T_key, [%%GDATA_KEY+16*14] + vaesenclast %%XMM1, %%T_key + vaesenclast %%XMM2, %%T_key + vaesenclast %%XMM3, %%T_key + vaesenclast %%XMM4, %%T_key + vaesenclast %%XMM5, %%T_key + vaesenclast %%XMM6, %%T_key + vaesenclast %%XMM7, %%T_key + vaesenclast %%XMM8, %%T_key +%endif ; GCM256 mode + +%if(%%num_initial_blocks>0) + vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs + vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs + vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 + vpxor %%T4, %%T6, %%T4 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; First phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T4, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs + + ;; First phase of the reduction complete + vpxor %%T4, %%T4, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Second phase of the reduction + vpclmulqdq %%T2, %%T3, %%T4, 0x00 + ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + vpsrldq %%T2, %%T2, 4 + + vpclmulqdq %%T4, %%T3, %%T4, 0x10 + ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) + vpslldq %%T4, %%T4, 4 + ;; Second phase of the reduction complete + vpxor %%T4, %%T4, %%T2 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; The result is in %%T3 + vpxor %%T3, %%T1, %%T4 +%else + ;; The hash should end up in T3 + vmovdqa %%T3, %%T2 +%endif + + ;; Final hash is now in T3 +%if %%num_initial_blocks > 0 + ;; NOTE: obsolete in case %%num_initial_blocks = 0 + sub %%LENGTH, 16*%%num_initial_blocks +%endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + vpxor %%XMM1, %%XMM1, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM1, %%T1 + %endif 
+ + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + vpxor %%XMM2, %%XMM2, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM2, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + vpxor %%XMM3, %%XMM3, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM3, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + vpxor %%XMM4, %%XMM4, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM4, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + vpxor %%XMM5, %%XMM5, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM5, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] + vpxor %%XMM6, %%XMM6, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM6, %%T1 + %endif + + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + vpxor %%XMM7, %%XMM7, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM7, %%T1 + %endif + +%if %%num_initial_blocks > 0 + ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0 + ;; This macro is executed for lenght 128 and up, + ;; zero length is checked in GCM_ENC_DEC. + ;; If the last block is partial then the xor will be done later + ;; in ENCRYPT_FINAL_PARTIAL_BLOCK. 
+ ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128 + cmp %%LENGTH, 128 + jl %%_initial_skip_last_word_write +%endif + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + vpxor %%XMM8, %%XMM8, %%T1 + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + vmovdqa %%XMM8, %%T1 + %endif + + ;; Update %%LENGTH with the number of blocks processed + sub %%LENGTH, 16 + add %%DATA_OFFSET, 16 +%%_initial_skip_last_word_write: + sub %%LENGTH, 128-16 + add %%DATA_OFFSET, 128-16 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + ;; Combine GHASHed value with the corresponding ciphertext + vpxor %%XMM1, %%XMM1, %%T3 + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + +;;; INITIAL_BLOCKS macro with support for a partial final block. +;;; num_initial_blocks is expected to include the partial final block +;;; in the count. 
+%macro INITIAL_BLOCKS_PARTIAL 25 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0) +%define %%T1 %8 +%define %%T2 %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 +%define %%INSTANCE_TYPE %25 + +%assign i (8-%%num_initial_blocks) + ;; Move AAD_HASH to temp reg + vmovdqu %%T2, %%XMM8 + ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + ;; Compute AES counters + vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0 + vmovdqa reg(i), %%CTR + vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +vmovdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + ; Start AES for %%num_initial_blocks blocks + vpxor reg(i),reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep + + +vmovdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + vaesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Hash all but the last block of data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks-1 + ;; Encrypt the message for all but the last block + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), reg(i), %%T1 + ;; write back ciphertext for 
%%num_initial_blocks blocks + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + ;; Prepare ciphertext for GHASH computations + vpshufb reg(i), [rel SHUF_MASK] +%assign i (i+1) +%endrep + + ;; The final block of data may be <16B + sub %%LENGTH, 16*(%%num_initial_blocks-1) + +%if %%num_initial_blocks < 8 + ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8. + ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128. + cmp %%LENGTH, 16 + jl %%_small_initial_partial_block + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Handle a full length final block - encrypt and hash all blocks +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + sub %%LENGTH, 16 + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + + ;; Encrypt the message + VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vpxor reg(i), reg(i), %%T1 + ;; write back ciphertext for %%num_initial_blocks blocks + VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + vmovdqa reg(i), %%T1 + %endif + ;; Prepare ciphertext for GHASH computations + vpshufb reg(i), [rel SHUF_MASK] + + ;; Hash all of the data +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks) +%assign last_block_to_hash 0 + +%if(%%num_initial_blocks>last_block_to_hash) + ;; Hash in AES state + vpxor %%T2, reg(j) + + ;; T2 - incoming AAD hash + ;; reg(i) holds ciphertext + ;; T5 - hash key + ;; T6 - updated xor + ;; reg(1)/xmm1 should now be available for tmp use + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 + vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 + vpxor %%T6, %%T6, %%T5 +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%assign rep_count 
(%%num_initial_blocks-1) +%rep rep_count + + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T3, reg(j), %%T5, 0x11 + vpxor %%T1, %%T1, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x00 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%endrep + + ;; Record that a reduction is needed + mov r12, 1 + + jmp %%_small_initial_compute_hash + + +%endif ; %if %%num_initial_blocks < 8 + +%%_small_initial_partial_block: + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Handle ghash for a <16B final block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;; In this case if it's a single call to encrypt we can + ;; hash all of the data but if it's an init / update / finalize + ;; series of call we need to leave the last block if it's + ;; less than a full block of data. + + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i) + ;; Handle a partial final block + ;; GDATA, KEY, T1, T2 + ;; r13 - length + ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long + ;; NOTE: could be replaced with %%LENGTH but at this point + ;; %%LENGTH is always less than 16. + ;; No PLAIN_CYPH_LEN argument available in this macro. 
+ ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET + vpshufb reg(i), [SHUF_MASK] + +%ifidn %%INSTANCE_TYPE, multi_call +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks-1) +%assign last_block_to_hash 1 +%else +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) +%assign k (%%num_initial_blocks) +%assign last_block_to_hash 0 +%endif + +%if(%%num_initial_blocks>last_block_to_hash) + ;; Record that a reduction is needed + mov r12, 1 + ;; Hash in AES state + vpxor %%T2, reg(j) + + ;; T2 - incoming AAD hash + ;; reg(i) holds ciphertext + ;; T5 - hash key + ;; T6 - updated xor + ;; reg(1)/xmm1 should now be available for tmp use + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0 + vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 + vpxor %%T6, %%T6, %%T5 +%else + ;; Record that a reduction is not needed - + ;; In this case no hashes are computed because there + ;; is only one initial block and it is < 16B in length. 
+ mov r12, 0 +%endif + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%ifidn %%INSTANCE_TYPE, multi_call +%assign rep_count (%%num_initial_blocks-2) +%%_multi_call_hash: +%else +%assign rep_count (%%num_initial_blocks-1) +%endif +%rep rep_count + + vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k] + vpclmulqdq %%T3, reg(j), %%T5, 0x11 + vpxor %%T1, %%T1, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x00 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, reg(j), %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + +%assign i (i+1) +%assign j (j+1) +%assign k (k-1) +%endrep + +%%_small_initial_compute_hash: + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Ghash reduction +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%if(%%num_initial_blocks=1) +%ifidn %%INSTANCE_TYPE, multi_call + ;; We only need to check if a reduction is needed if + ;; initial_blocks == 1 and init/update/final is being used. + ;; In this case we may just have a partial block, and that + ;; gets hashed in finalize. 
+ cmp r12, 0 + je %%_no_reduction_needed +%endif +%endif + + vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs + vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs + vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4 + vpxor %%T4, %%T6, %%T4 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; First phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T4, 0x01 + ;; shift-L xmm2 2 DWs + vpslldq %%T2, %%T2, 8 + vpxor %%T4, %%T4, %%T2 + + ;; First phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Second phase of the reduction + + vpclmulqdq %%T2, %%T3, %%T4, 0x00 + ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + vpsrldq %%T2, %%T2, 4 + + vpclmulqdq %%T4, %%T3, %%T4, 0x10 + ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) + vpslldq %%T4, %%T4, 4 + + vpxor %%T4, %%T4, %%T2 + ;; Second phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vpxor %%T3, %%T1, %%T4 + +%ifidn %%INSTANCE_TYPE, multi_call + ;; If using init/update/finalize, we need to xor any partial block data + ;; into the hash. +%if %%num_initial_blocks > 1 + ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place +%if %%num_initial_blocks != 8 + ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero + cmp qword [%%GDATA_CTX + PBlockLen], 0 + je %%_no_partial_block_xor +%endif ; %%num_initial_blocks != 8 + vpxor %%T3, %%T3, reg(8) +%%_no_partial_block_xor: +%endif ; %%num_initial_blocks > 1 +%endif ; %%INSTANCE_TYPE, multi_call + +%if(%%num_initial_blocks=1) +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: %%_no_reduction_needed case only valid for + ;; multi_call with initial_blocks = 1. + ;; Look for comment above around '_no_reduction_needed' + ;; The jmp below is obsolete as the code will fall through. 
+ + ;; The result is in %%T3 + jmp %%_after_reduction + +%%_no_reduction_needed: + ;; The hash should end up in T3. The only way we should get here is if + ;; there is a partial block of data, so xor that into the hash. + vpxor %%T3, %%T2, reg(8) +%endif ; %%INSTANCE_TYPE = multi_call +%endif ; %%num_initial_blocks=1 + +%%_after_reduction: + ;; Final hash is now in T3 + +%endmacro ; INITIAL_BLOCKS_PARTIAL + + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; %%DATA_OFFSET is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 23 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 +%define %%FULL_PARTIAL %23 + + vmovdqa %%T2, %%XMM1 + vmovdqu [rsp + TMP2], %%XMM2 + vmovdqu [rsp + TMP3], %%XMM3 + vmovdqu [rsp + TMP4], %%XMM4 + vmovdqu [rsp + TMP5], %%XMM5 + vmovdqu [rsp + TMP6], %%XMM6 + vmovdqu [rsp + TMP7], %%XMM7 + vmovdqu [rsp + TMP8], %%XMM8 + +%ifidn %%loop_idx, in_order + vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT + vmovdqu %%T5, [TWO] + vpaddd %%XMM2, %%CTR, %%T5 + vpaddd %%XMM3, %%XMM1, %%T5 + vpaddd %%XMM4, %%XMM2, %%T5 + vpaddd %%XMM5, %%XMM3, %%T5 + vpaddd %%XMM6, %%XMM4, %%T5 + vpaddd %%XMM7, %%XMM5, %%T5 + vpaddd %%XMM8, %%XMM6, %%T5 + vmovdqa %%CTR, %%XMM8 + + vmovdqu %%T5, [SHUF_MASK] + vpshufb %%XMM1, %%T5 ; perform a 16Byte swap + vpshufb %%XMM2, %%T5 ; perform a 16Byte swap + vpshufb %%XMM3, %%T5 ; perform a 16Byte swap + vpshufb %%XMM4, %%T5 ; perform a 16Byte swap + vpshufb %%XMM5, %%T5 ; perform a 16Byte swap + vpshufb 
%%XMM6, %%T5 ; perform a 16Byte swap + vpshufb %%XMM7, %%T5 ; perform a 16Byte swap + vpshufb %%XMM8, %%T5 ; perform a 16Byte swap +%else + vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT + vmovdqu %%T5, [TWOf] + vpaddd %%XMM2, %%CTR, %%T5 + vpaddd %%XMM3, %%XMM1, %%T5 + vpaddd %%XMM4, %%XMM2, %%T5 + vpaddd %%XMM5, %%XMM3, %%T5 + vpaddd %%XMM6, %%XMM4, %%T5 + vpaddd %%XMM7, %%XMM5, %%T5 + vpaddd %%XMM8, %%XMM6, %%T5 + vmovdqa %%CTR, %%XMM8 +%endif + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*0] + vpxor %%XMM1, %%XMM1, %%T1 + vpxor %%XMM2, %%XMM2, %%T1 + vpxor %%XMM3, %%XMM3, %%T1 + vpxor %%XMM4, %%XMM4, %%T1 + vpxor %%XMM5, %%XMM5, %%T1 + vpxor %%XMM6, %%XMM6, %%T1 + vpxor %%XMM7, %%XMM7, %%T1 + vpxor %%XMM8, %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T1, [%%GDATA + 16*1] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [%%GDATA + 16*2] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_8] + vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0 + vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1 + vpxor %%T6, %%T6, %%T5 + + vmovdqu %%T1, [%%GDATA + 16*3] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP2] + vmovdqu 
%%T5, [%%GDATA + HashKey_7] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*4] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu %%T1, [rsp + TMP3] + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*5] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + + vmovdqu %%T1, [rsp + TMP4] + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*6] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP5] + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, 
[%%GDATA + 16*7] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP6] + vmovdqu %%T5, [%%GDATA + HashKey_3] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vmovdqu %%T1, [%%GDATA + 16*8] + vaesenc %%XMM1, %%T1 + vaesenc %%XMM2, %%T1 + vaesenc %%XMM3, %%T1 + vaesenc %%XMM4, %%T1 + vaesenc %%XMM5, %%T1 + vaesenc %%XMM6, %%T1 + vaesenc %%XMM7, %%T1 + vaesenc %%XMM8, %%T1 + + vmovdqu %%T1, [rsp + TMP7] + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T4, %%T4, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + 16*9] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T1, [rsp + TMP8] + vmovdqu %%T5, [%%GDATA + HashKey] + + + vpclmulqdq %%T3, %%T1, %%T5, 0x00 + vpxor %%T7, %%T7, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x01 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x10 + vpxor %%T6, %%T6, %%T3 + + vpclmulqdq %%T3, %%T1, %%T5, 0x11 + vpxor %%T1, %%T4, %%T3 + + + vmovdqu %%T5, [%%GDATA + 16*10] + %ifndef GCM128_MODE ; GCM192 or GCM256 + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*11] + vaesenc 
%%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*12] +%endif +%ifdef GCM256_MODE + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*13] + vaesenc %%XMM1, %%T5 + vaesenc %%XMM2, %%T5 + vaesenc %%XMM3, %%T5 + vaesenc %%XMM4, %%T5 + vaesenc %%XMM5, %%T5 + vaesenc %%XMM6, %%T5 + vaesenc %%XMM7, %%T5 + vaesenc %%XMM8, %%T5 + + vmovdqu %%T5, [%%GDATA + 16*14] +%endif ; GCM256 + +%assign i 0 +%assign j 1 +%rep 8 + + ;; SNP TBD: This is pretty ugly - consider whether just XORing the + ;; data in after vaesenclast is simpler and performant. Would + ;; also have to ripple it through partial block and ghash_mul_8. +%ifidn %%FULL_PARTIAL, full + %ifdef NT_LD + VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + vpxor %%T2, %%T2, %%T5 + %else + vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + %endif + + %ifidn %%ENC_DEC, ENC + vaesenclast reg(j), reg(j), %%T2 + %else + vaesenclast %%T3, reg(j), %%T2 + vpxor reg(j), %%T2, %%T5 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 + %endif + +%else + ; Don't read the final data during partial block processing + %ifdef NT_LD + %if (i<7) + VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + vpxor %%T2, %%T2, %%T5 + %else + ;; Stage the key directly in T2 rather than hash it with plaintext + vmovdqu %%T2, %%T5 + %endif + %else + %if (i<7) + vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + %else + ;; Stage the key directly in T2 rather than hash it with plaintext + vmovdqu %%T2, %%T5 + %endif + %endif + + %ifidn %%ENC_DEC, ENC + vaesenclast reg(j), reg(j), %%T2 + %else + %if (i<7) + vaesenclast %%T3, reg(j), %%T2 + vpxor reg(j), %%T2, %%T5 + ;; Do not read the data since it could fault + VXSTR 
[%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3 + %else + vaesenclast reg(j), reg(j), %%T2 + %endif + %endif +%endif + +%assign i (i+1) +%assign j (j+1) +%endrep + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs + vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs + vpxor %%T7, %%T7, %%T3 + vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7 + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T7, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs + + vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + %ifidn %%ENC_DEC, ENC + ; Write to the Ciphertext buffer + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 + %ifidn %%FULL_PARTIAL, full + ;; Avoid writing past the buffer if handling a partial block + VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 + %endif + %endif + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%T7, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq %%T4, %%T3, %%T7, 0x10 + vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete + 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vpxor %%T1, %%T1, %%T4 ; the result is in %%T1 + + vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + + vpxor %%XMM1, %%T1 + + +%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL + + +; GHASH the last 4 ciphertext blocks. +%macro GHASH_LAST_8 16 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 +%define %%XMM8 %16 + + ;; Karatsuba Method + + vmovdqu %%T5, [%%GDATA + HashKey_8] + + vpshufd %%T2, %%XMM1, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM1 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T6, %%XMM1, %%T5, 0x11 + vpclmulqdq %%T7, %%XMM1, %%T5, 0x00 + + vpclmulqdq %%XMM1, %%T2, %%T3, 0x00 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_7] + vpshufd %%T2, %%XMM2, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM2 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM2, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM2, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_6] + vpshufd %%T2, %%XMM3, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM3 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM3, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM3, %%T5, 0x00 + vpxor %%T7, %%T7, 
%%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_5] + vpshufd %%T2, %%XMM4, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM4 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM4, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM4, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_4] + vpshufd %%T2, %%XMM5, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM5 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM5, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM5, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_3] + vpshufd %%T2, %%XMM6, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM6 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM6, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey_2] + vpshufd %%T2, %%XMM7, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM7 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM7, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM7, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;; + + vmovdqu %%T5, [%%GDATA + HashKey] + vpshufd %%T2, %%XMM8, 01001110b + vpshufd %%T3, %%T5, 01001110b + vpxor %%T2, %%T2, %%XMM8 + vpxor %%T3, %%T3, %%T5 + + vpclmulqdq %%T4, %%XMM8, %%T5, 0x11 + vpxor %%T6, %%T6, %%T4 + + vpclmulqdq %%T4, %%XMM8, %%T5, 0x00 + vpxor %%T7, %%T7, %%T4 + + vpclmulqdq %%T2, %%T2, %%T3, 0x00 + + vpxor %%XMM1, %%XMM1, %%T2 + vpxor 
%%XMM1, %%XMM1, %%T6 + vpxor %%T2, %%XMM1, %%T7 + + + + + vpslldq %%T4, %%T2, 8 + vpsrldq %%T2, %%T2, 8 + + vpxor %%T7, %%T7, %%T4 + vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu %%T3, [POLY2] + + vpclmulqdq %%T2, %%T3, %%T7, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs + + vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%T7, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq %%T4, %%T3, %%T7, 0x10 + vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vpxor %%T6, %%T6, %%T4 ; the result is in %%T6 +%endmacro + + +; GHASH the last 4 ciphertext blocks. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; GHASH_LAST_7 - GHASH the last 7 ciphertext blocks held in XMM1..XMM7.
;; Used on the multi_call path when a pending partial block occupies the
;; eighth slot.  Performs 7 Karatsuba carry-less multiplications against
;; HashKey_7..HashKey_1, accumulates, then reduces modulo the GCM
;; polynomial (via POLY2).  Result is left in %%T6.
;; All %%T* registers are clobbered.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GHASH_LAST_7 15
%define %%GDATA %1      ; [in] key structure pointer (holds HashKey powers)
%define %%T1    %2      ; [clobbered] temporary
%define %%T2    %3      ; [clobbered] temporary
%define %%T3    %4      ; [clobbered] temporary
%define %%T4    %5      ; [clobbered] temporary
%define %%T5    %6      ; [clobbered] temporary (holds current HashKey power)
%define %%T6    %7      ; [out] GHASH result / high-product accumulator
%define %%T7    %8      ; [clobbered] low-product accumulator
%define %%XMM1  %9      ; [in/clobbered] ghash input block 1 (oldest)
%define %%XMM2  %10     ; [in] ghash input block 2
%define %%XMM3  %11     ; [in] ghash input block 3
%define %%XMM4  %12     ; [in] ghash input block 4
%define %%XMM5  %13     ; [in] ghash input block 5
%define %%XMM6  %14     ; [in] ghash input block 6
%define %%XMM7  %15     ; [in] ghash input block 7 (newest)

        ;; Karatsuba Method: each 128x128 multiply is done as three
        ;; 64x64 vpclmulqdq ops (hi, lo, and middle via the folded sums).
        ;; %%T6 accumulates the high products, %%T7 the low products and
        ;; %%XMM1 the middle products.

        vmovdqu         %%T5, [%%GDATA + HashKey_7]

        vpshufd         %%T2, %%XMM1, 01001110b     ; swap qwords for middle term
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM1          ; a_hi ^ a_lo
        vpxor           %%T3, %%T3, %%T5            ; b_hi ^ b_lo

        vpclmulqdq      %%T6, %%XMM1, %%T5, 0x11    ; high product
        vpclmulqdq      %%T7, %%XMM1, %%T5, 0x00    ; low product

        vpclmulqdq      %%XMM1, %%T2, %%T3, 0x00    ; middle product

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_6]
        vpshufd         %%T2, %%XMM2, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM2
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM2, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_5]
        vpshufd         %%T2, %%XMM3, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM3
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM3, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_4]
        vpshufd         %%T2, %%XMM4, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM4
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM4, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_3]
        vpshufd         %%T2, %%XMM5, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM5
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM5, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_2]
        vpshufd         %%T2, %%XMM6, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM6
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM6, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        vmovdqu         %%T5, [%%GDATA + HashKey_1]
        vpshufd         %%T2, %%XMM7, 01001110b
        vpshufd         %%T3, %%T5, 01001110b
        vpxor           %%T2, %%T2, %%XMM7
        vpxor           %%T3, %%T3, %%T5

        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x11
        vpxor           %%T6, %%T6, %%T4

        vpclmulqdq      %%T4, %%XMM7, %%T5, 0x00
        vpxor           %%T7, %%T7, %%T4

        vpclmulqdq      %%T2, %%T2, %%T3, 0x00

        vpxor           %%XMM1, %%XMM1, %%T2

        ;;;;;;;;;;;;;;;;;;;;;;

        ;; Fold the accumulated middle products into the hi:lo pair.
        vpxor           %%XMM1, %%XMM1, %%T6
        vpxor           %%T2, %%XMM1, %%T7

        vpslldq         %%T4, %%T2, 8
        vpsrldq         %%T2, %%T2, 8

        vpxor           %%T7, %%T7, %%T4
        vpxor           %%T6, %%T6, %%T2     ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;first phase of the reduction
        vmovdqu         %%T3, [POLY2]

        vpclmulqdq      %%T2, %%T3, %%T7, 0x01
        vpslldq         %%T2, %%T2, 8        ; shift-L xmm2 2 DWs

        vpxor           %%T7, %%T7, %%T2     ; first phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ;second phase of the reduction
        vpclmulqdq      %%T2, %%T3, %%T7, 0x00
        vpsrldq         %%T2, %%T2, 4        ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      %%T4, %%T3, %%T7, 0x10
        vpslldq         %%T4, %%T4, 4        ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           %%T4, %%T4, %%T2     ; second phase of the reduction complete
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vpxor           %%T6, %%T6, %%T4     ; the result is in %%T6
%endmacro


;;; Handle encryption of the final partial block
;;; IN:
;;;   r13  - Number of bytes to read
;;; MODIFIES:
;;;   KEY  - Key for encrypting the partial block
;;;   HASH - Current hash value
;;; SMASHES:
;;;   r10, r12, r15, rax
;;;   T1, T2
;;; Note:
;;;   PLAIN_CYPH_LEN, %7, is passed only to determine
;;;   if buffer is big enough to do a 16 byte read & shift.
;;;     'LT16' is passed here only if buffer is known to be smaller
;;;     than 16 bytes.
;;;     Any other value passed here will result in 16 byte read
;;;     code path.
;;; TBD: Remove HASH from the instantiation
%macro  ENCRYPT_FINAL_PARTIAL_BLOCK 8
%define %%KEY             %1    ; [in/out] xmm with E(K,Yn); becomes the partial-block keystream/ciphertext
%define %%T1              %2    ; [clobbered] temporary xmm
%define %%T2              %3    ; [clobbered] temporary xmm
%define %%CYPH_PLAIN_OUT  %4    ; [in] output buffer pointer
%define %%PLAIN_CYPH_IN   %5    ; [in] input buffer pointer
%define %%PLAIN_CYPH_LEN  %6    ; [in] total length or literal 'LT16' (see note above)
%define %%ENC_DEC         %7    ; [in] ENC or DEC selector
%define %%DATA_OFFSET     %8    ; [in/out] current offset into the buffers

        ;; NOTE: type of read tuned based %%PLAIN_CYPH_LEN setting
%ifidn %%PLAIN_CYPH_LEN, LT16
        ;; Handle the case where the message is < 16 bytes
        lea     r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]

        ;; T1   - packed output
        ;; r10  - input data address
        ;; r13  - input data length
        ;; r12, r15, rax - temp registers
        READ_SMALL_DATA_INPUT   %%T1, r10, r13, r12, r15, rax

        lea     r12, [SHIFT_MASK + 16]
        sub     r12, r13
%else
        ;; Handle the case where the message is >= 16 bytes
        ;; (safe to read 16 bytes ending at the last input byte)
        sub     %%DATA_OFFSET, 16
        add     %%DATA_OFFSET, r13
        ;; Receive the last <16 Byte block
        vmovdqu %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]
        sub     %%DATA_OFFSET, r13
        add     %%DATA_OFFSET, 16

        lea     r12, [SHIFT_MASK + 16]
        ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        ;; (r13 is the number of bytes in plaintext mod 16)
        sub     r12, r13
        ;; Get the appropriate shuffle mask
        vmovdqu %%T2, [r12]
        ;; shift right 16-r13 bytes
        vpshufb %%T1, %%T2
%endif                          ; %%PLAIN_CYPH_LEN, LT16

        ;; At this point T1 contains the partial block data
%ifidn  %%ENC_DEC, DEC
        ;; Plaintext XOR E(K, Yn)
        ;; Set aside the ciphertext
        vmovdqa %%T2, %%T1
        vpxor   %%KEY, %%KEY, %%T1
        ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext
        vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
        ;; Mask out top 16-r13 bytes of ciphertext
        vpand   %%KEY, %%KEY, %%T1

        ;; Prepare the ciphertext for the hash
        ;; mask out top 16-r13 bytes of the plaintext
        vpand   %%T2, %%T2, %%T1
%else
        ;; Plaintext XOR E(K, Yn)
        vpxor   %%KEY, %%KEY, %%T1
        ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY
        vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
        ;; Mask out top 16-r13 bytes of %%KEY
        vpand   %%KEY, %%KEY, %%T1
%endif

        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; Output r13 Bytes (8-byte store first if possible, then
        ;; byte-by-byte for the remainder)
        vmovq   rax, %%KEY
        cmp     r13, 8
        jle     %%_less_than_8_bytes_left

        mov     [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
        add     %%DATA_OFFSET, 8
        vpsrldq %%T1, %%KEY, 8
        vmovq   rax, %%T1
        sub     r13, 8

%%_less_than_8_bytes_left:
        mov     BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
        add     %%DATA_OFFSET, 1
        shr     rax, 8
        sub     r13, 1
        jne     %%_less_than_8_bytes_left
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%ifidn  %%ENC_DEC, DEC
        ;; If decrypt, restore the ciphertext into %%KEY
        vmovdqu %%KEY, %%T2
%endif
%endmacro                       ; ENCRYPT_FINAL_PARTIAL_BLOCK



; Encryption of a single block: full AES rounds on %%XMM0 using the
; expanded key schedule at %%GDATA.  NROUNDS selects 128/192/256 variant.
%macro  ENCRYPT_SINGLE_BLOCK 2
%define %%GDATA %1
%define %%XMM0  %2

        vpxor           %%XMM0, %%XMM0, [%%GDATA+16*0]  ; whitening round
%assign i 1
%rep NROUNDS
        vaesenc         %%XMM0, [%%GDATA+16*i]
%assign i (i+1)
%endrep
        vaesenclast     %%XMM0, [%%GDATA+16*i]
%endmacro


;; Start of Stack Setup

;; Save non-volatile GPRs (and, on Windows, xmm6-xmm15) and carve out an
;; aligned local stack frame.  r14 keeps the original rsp for FUNC_RESTORE.
%macro FUNC_SAVE 0
        ;; Required for Update/GMC_ENC
        ;the number of pushes must equal STACK_OFFSET
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r14, rsp

        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63                ; align local frame to 64 bytes

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
        vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
        vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
        vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
        vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
        vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
        vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
        vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
        vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
        vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
%endif
%endmacro


;; Mirror image of FUNC_SAVE: restore xmm regs (Windows), rsp and GPRs.
%macro FUNC_RESTORE 0

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
        vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
        vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
        vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
        vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
        vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
        vmovdqu xmm9,  [rsp + LOCAL_STORAGE + 3*16]
        vmovdqu xmm8,  [rsp + LOCAL_STORAGE + 2*16]
        vmovdqu xmm7,  [rsp + LOCAL_STORAGE + 1*16]
        vmovdqu xmm6,  [rsp + LOCAL_STORAGE + 0*16]
%endif

;; Required for Update/GMC_ENC
        mov     rsp, r14
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
; Additional Authentication data (A_IN), Additional Data length (A_LEN).
; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX.
; Clobbers rax, r10-r13, and xmm0-xmm6
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_INIT        5
%define %%GDATA_KEY     %1
%define %%GDATA_CTX     %2
%define %%IV            %3
%define %%A_IN          %4
%define %%A_LEN         %5
%define %%AAD_HASH      xmm14
%define %%SUBHASH       xmm1


        vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey]

        mov     r10, %%A_LEN
        cmp     r10, 0
        je      %%_aad_is_zero

        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
        jmp     %%_after_aad

%%_aad_is_zero:
        vpxor   %%AAD_HASH, %%AAD_HASH

%%_after_aad:
        mov     r10, %%A_LEN

        ;; NOTE(review): xmm2/xmm3 hold leftovers of CALC_AAD_HASH here, so
        ;; the value written to PBlockEncKey below is not literally zero as
        ;; the trailing comment claims.  Harmless because PBlockLen is set
        ;; to 0, but verify against upstream before relying on it.
        vpxor   xmm2, xmm3

        vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH     ; ctx_data.aad hash = aad_hash
        mov     [%%GDATA_CTX + AadLen], r10             ; ctx_data.aad_length = aad_length
        xor     r10, r10
        mov     [%%GDATA_CTX + InLen], r10              ; ctx_data.in_length = 0
        mov     [%%GDATA_CTX + PBlockLen], r10          ; ctx_data.partial_block_length = 0
        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2      ; ctx_data.partial_block_enc_key = 0
        mov     r10, %%IV
        vmovdqa xmm2, [rel ONEf]                        ; read 12 IV bytes and pad with 0x00000001
        vpinsrq xmm2, [r10], 0
        vpinsrd xmm2, [r10+8], 2
        vmovdqu [%%GDATA_CTX + OrigIV], xmm2            ; ctx_data.orig_IV = iv

        vpshufb xmm2, [SHUF_MASK]                       ; byte-swap counter to big-endian

        vmovdqu [%%GDATA_CTX + CurCount], xmm2          ; ctx_data.current_counter = iv
%endmacro


;; GCM_ENC_DEC_SMALL - dispatch for messages needing 1..8 INITIAL blocks
;; (small-message path, < 128 bytes).  Tail-calls INITIAL_BLOCKS_PARTIAL
;; with the block count baked in; zero blocks is not allowed (checked by
;; the caller, GCM_ENC_DEC).
%macro GCM_ENC_DEC_SMALL   12
%define %%GDATA_KEY         %1
%define %%GDATA_CTX         %2
%define %%CYPH_PLAIN_OUT    %3
%define %%PLAIN_CYPH_IN     %4
%define %%PLAIN_CYPH_LEN    %5
%define %%ENC_DEC           %6
%define %%DATA_OFFSET       %7
%define %%LENGTH            %8
%define %%NUM_BLOCKS        %9
%define %%CTR               %10
%define %%HASH              %11
%define %%INSTANCE_TYPE     %12

        ;; NOTE: the check below is obsolete in current implementation. The check is already done in GCM_ENC_DEC.
        ;; cmp     %%NUM_BLOCKS, 0
        ;; je      %%_small_initial_blocks_encrypted
        cmp     %%NUM_BLOCKS, 8
        je      %%_small_initial_num_blocks_is_8
        cmp     %%NUM_BLOCKS, 7
        je      %%_small_initial_num_blocks_is_7
        cmp     %%NUM_BLOCKS, 6
        je      %%_small_initial_num_blocks_is_6
        cmp     %%NUM_BLOCKS, 5
        je      %%_small_initial_num_blocks_is_5
        cmp     %%NUM_BLOCKS, 4
        je      %%_small_initial_num_blocks_is_4
        cmp     %%NUM_BLOCKS, 3
        je      %%_small_initial_num_blocks_is_3
        cmp     %%NUM_BLOCKS, 2
        je      %%_small_initial_num_blocks_is_2

        jmp     %%_small_initial_num_blocks_is_1


%%_small_initial_num_blocks_is_8:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_7:
        ;; Register map (same for every count below):
        ;; r13   - %%LENGTH
        ;; xmm12 - T1
        ;; xmm13 - T2
        ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
        ;; xmm15 - T4
        ;; xmm11 - T5
        ;; xmm9  - CTR
        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8 - AAD HASH IN
        ;; xmm10 - T6
        ;; xmm0  - T_key
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_6:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_5:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_4:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_3:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_2:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
        jmp     %%_small_initial_blocks_encrypted

%%_small_initial_num_blocks_is_1:
        INITIAL_BLOCKS_PARTIAL  %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE

        ;; Note: zero initial blocks not allowed.

%%_small_initial_blocks_encrypted:

%endmacro                       ; GCM_ENC_DEC_SMALL

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
; has been initialized by GCM_INIT
; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
; Clobbers rax, r10-r15, and xmm0-xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_ENC_DEC         7
%define %%GDATA_KEY         %1
%define %%GDATA_CTX         %2
%define %%CYPH_PLAIN_OUT    %3
%define %%PLAIN_CYPH_IN     %4
%define %%PLAIN_CYPH_LEN    %5
%define %%ENC_DEC           %6
%define %%INSTANCE_TYPE     %7  ; single_call or multi_call
%define %%DATA_OFFSET       r11

; Macro flow:
; calculate the number of 16byte blocks in the message
; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'

        cmp     %%PLAIN_CYPH_LEN, 0
        je      %%_enc_dec_done

        xor     %%DATA_OFFSET, %%DATA_OFFSET
        ;; Update length of data processed
%ifidn __OUTPUT_FORMAT__, win64
        ;; on win64 the length may be a memory operand; stage through rax
        mov     rax, %%PLAIN_CYPH_LEN
        add     [%%GDATA_CTX + InLen], rax
%else
        add     [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
%endif
        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
        vmovdqu xmm8, [%%GDATA_CTX + AadHash]

%ifidn %%INSTANCE_TYPE, multi_call
        ;; NOTE: partial block processing makes only sense for multi_call here.
        ;; Used for the update flow - if there was a previous partial
        ;; block fill the remaining bytes here.
        PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
%endif

        ;; lift CTR set from initial_blocks to here
%ifidn %%INSTANCE_TYPE, single_call
        ;; single_call: counter was just prepared by GCM_INIT in xmm2
        vmovdqu xmm9, xmm2
%else
        vmovdqu xmm9, [%%GDATA_CTX + CurCount]
%endif

        ;; Save the amount of data left to process in r10
        mov     r13, %%PLAIN_CYPH_LEN
%ifidn %%INSTANCE_TYPE, multi_call
        ;; NOTE: %%DATA_OFFSET is zero in single_call case.
        ;;      Consequently PLAIN_CYPH_LEN will never be zero after
        ;;      %%DATA_OFFSET subtraction below.
        sub     r13, %%DATA_OFFSET

        ;; There may be no more data if it was consumed in the partial block.
        cmp     r13, 0
        je      %%_enc_dec_done
%endif                          ; %%INSTANCE_TYPE, multi_call
        mov     r10, r13

        ;; Determine how many blocks to process in INITIAL
        mov     r12, r13
        shr     r12, 4
        and     r12, 7

        ;; Process one additional block in INITIAL if there is a partial block
        ;; (branch-free: blsmsk sets CF iff r10 was zero, cmc inverts it,
        ;; adc then adds 1 exactly when a partial tail exists)
        and     r10, 0xf
        blsmsk  r10, r10        ; Set CF if zero
        cmc                     ; Flip CF
        adc     r12, 0x0        ; Process an additional INITIAL block if CF set

        ;;      Less than 127B will be handled by the small message code, which
        ;;      can process up to 7 16B blocks.
        cmp     r13, 128
        jge     %%_large_message_path

        GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
        jmp     %%_ghash_done

%%_large_message_path:
        and     r12, 0x7        ; Still, don't allow 8 INITIAL blocks since this
                                ; can be handled by the x8 partial loop.

        cmp     r12, 0
        je      %%_initial_num_blocks_is_0
        cmp     r12, 7
        je      %%_initial_num_blocks_is_7
        cmp     r12, 6
        je      %%_initial_num_blocks_is_6
        cmp     r12, 5
        je      %%_initial_num_blocks_is_5
        cmp     r12, 4
        je      %%_initial_num_blocks_is_4
        cmp     r12, 3
        je      %%_initial_num_blocks_is_3
        cmp     r12, 2
        je      %%_initial_num_blocks_is_2

        jmp     %%_initial_num_blocks_is_1

%%_initial_num_blocks_is_7:
        ;; Register map (same for every count below):
        ;; r13   - %%LENGTH
        ;; xmm12 - T1
        ;; xmm13 - T2
        ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
        ;; xmm15 - T4
        ;; xmm11 - T5
        ;; xmm9  - CTR
        ;; xmm1  - XMM1 - Cipher + Hash when producing 8 AES keys
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8 - AAD HASH IN
        ;; xmm10 - T6
        ;; xmm0  - T_key
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_6:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_5:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_4:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_3:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_2:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_1:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
        jmp     %%_initial_blocks_encrypted

%%_initial_num_blocks_is_0:
        INITIAL_BLOCKS  %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC


%%_initial_blocks_encrypted:
        ;; The entire message was encrypted processed in initial and now need to be hashed
        cmp     r13, 0
        je      %%_encrypt_done

        ;; Encrypt the final <16 byte (partial) block, then hash
        cmp     r13, 16
        jl      %%_encrypt_final_partial

        ;; Process 7 full blocks plus a partial block
        cmp     r13, 128
        jl      %%_encrypt_by_8_partial


%%_encrypt_by_8_parallel:
        ;; in_order vs. out_order is an optimization to increment the counter
        ;; without shuffling it back into little endian. r15d keeps track of
        ;; when we need to increment in order so that the carry is handled
        ;; correctly.
        vmovd   r15d, xmm9
        and     r15d, 255
        vpshufb xmm9, [rel SHUF_MASK]


%%_encrypt_by_8_new:
        ;; If the low counter byte is about to wrap, take the slow in_order
        ;; path so the carry propagates correctly.
        cmp     r15d, 255-8
        jg      %%_encrypt_by_8

        ;; xmm0  - T1
        ;; xmm10 - T2
        ;; xmm11 - T3
        ;; xmm12 - T4
        ;; xmm13 - T5
        ;; xmm14 - T6
        ;; xmm9  - CTR
        ;; xmm1  - XMM1
        ;; xmm2  - XMM2
        ;; xmm3  - XMM3
        ;; xmm4  - XMM4
        ;; xmm5  - XMM5
        ;; xmm6  - XMM6
        ;; xmm7  - XMM7
        ;; xmm8  - XMM8
        ;; xmm15 - T7
        add     r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
        add     %%DATA_OFFSET, 128
        sub     r13, 128
        cmp     r13, 128
        jge     %%_encrypt_by_8_new

        vpshufb xmm9, [SHUF_MASK]
        jmp     %%_encrypt_by_8_parallel_done

%%_encrypt_by_8:
        vpshufb xmm9, [SHUF_MASK]
        add     r15b, 8
        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
        vpshufb xmm9, [SHUF_MASK]
        add     %%DATA_OFFSET, 128
        sub     r13, 128
        cmp     r13, 128
        jge     %%_encrypt_by_8_new
        vpshufb xmm9, [SHUF_MASK]


%%_encrypt_by_8_parallel_done:
        ;; Test to see if we need a by 8 with partial block. At this point
        ;; bytes remaining should be either zero or between 113-127.
        cmp     r13, 0
        je      %%_encrypt_done

%%_encrypt_by_8_partial:
        ;; Shuffle needed to align key for partial block xor. out_order
        ;; is a little faster because it avoids extra shuffles.
        ;; TBD: Might need to account for when we don't have room to increment the counter.


        ;; Process parallel buffers with a final partial block.
        GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial


        add     %%DATA_OFFSET, 128-16
        sub     r13, 128-16

%%_encrypt_final_partial:

        vpshufb xmm8, [SHUF_MASK]
        mov     [%%GDATA_CTX + PBlockLen], r13
        vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8

        ;; xmm8  - Final encrypted counter - need to hash with partial or full block ciphertext
        ;;      GDATA,  KEY,   T1,    T2
        ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET

        vpshufb xmm8, [SHUF_MASK]


%%_encrypt_done:

        ;; Mapping to macro parameters
        ;; IN:
        ;;   xmm9 contains the counter
        ;;   xmm1-xmm8 contain the xor'd ciphertext
        ;; OUT:
        ;;   xmm14 contains the final hash
        ;;             GDATA,   T1,    T2,    T3,    T4,    T5,    T6,    T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
%ifidn %%INSTANCE_TYPE, multi_call
        mov     r13, [%%GDATA_CTX + PBlockLen]
        cmp     r13, 0
        jz      %%_hash_last_8
        ;; Partial block pending: hash only the 7 full blocks now; the
        ;; partial block is hashed later (GCM_COMPLETE / next update).
        GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
        ;; XOR the partial word into the hash
        vpxor   xmm14, xmm14, xmm8
        jmp     %%_ghash_done
%endif
%%_hash_last_8:
        GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8

%%_ghash_done:
        vmovdqu [%%GDATA_CTX + CurCount], xmm9  ; my_ctx_data.current_counter = xmm9
        vmovdqu [%%GDATA_CTX + AadHash], xmm14  ; my_ctx_data.aad hash = xmm14

%%_enc_dec_done:


%endmacro


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro  GCM_COMPLETE            6
%define %%GDATA_KEY             %1
%define %%GDATA_CTX             %2
%define %%AUTH_TAG              %3      ; [in] pointer to output tag buffer
%define %%AUTH_TAG_LEN          %4      ; [in] requested tag length: 8, 12 or 16 bytes
%define %%ENC_DEC               %5
%define %%INSTANCE_TYPE         %6      ; single_call or multi_call
%define %%PLAIN_CYPH_LEN        rax

        vmovdqu xmm13, [%%GDATA_KEY + HashKey]
        ;; Start AES as early as possible
        vmovdqu xmm9, [%%GDATA_CTX + OrigIV]    ; xmm9 = Y0
        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9  ; E(K, Y0)

%ifidn %%INSTANCE_TYPE, multi_call
        ;; If the GCM function is called as a single function call rather
        ;; than invoking the individual parts (init, update, finalize) we
        ;; can remove a write to read dependency on AadHash.
        vmovdqu xmm14, [%%GDATA_CTX + AadHash]

        ;; Encrypt the final partial block. If we did this as a single call then
        ;; the partial block was handled in the main GCM_ENC_DEC macro.
        mov     r12, [%%GDATA_CTX + PBlockLen]
        cmp     r12, 0

        je %%_partial_done

        ;; The partial-block ciphertext was already XORed into AadHash by
        ;; GCM_ENC_DEC; one more GHASH multiply folds it in.
        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
        vmovdqu [%%GDATA_CTX + AadHash], xmm14

%%_partial_done:

%endif

        ;; Hash the length block: len(A) || len(C), both in bits.
        mov     r12, [%%GDATA_CTX + AadLen]     ; r12 = aadLen (number of bytes)
        mov     %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]

        shl     r12, 3                          ; convert into number of bits
        vmovd   xmm15, r12d                     ; len(A) in xmm15

        shl     %%PLAIN_CYPH_LEN, 3             ; len(C) in bits (*128)
        vmovq   xmm1, %%PLAIN_CYPH_LEN
        vpslldq xmm15, xmm15, 8                 ; xmm15 = len(A)|| 0x0000000000000000
        vpxor   xmm15, xmm15, xmm1              ; xmm15 = len(A)||len(C)

        vpxor   xmm14, xmm15
        GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
        vpshufb xmm14, [SHUF_MASK]              ; perform a 16Byte swap

        vpxor   xmm9, xmm9, xmm14               ; T = E(K,Y0) ^ GHASH


%%_return_T:
        mov     r10, %%AUTH_TAG                 ; r10 = authTag
        mov     r11, %%AUTH_TAG_LEN             ; r11 = auth_tag_len

        cmp     r11, 16
        je      %%_T_16

        cmp     r11, 12
        je      %%_T_12

%%_T_8:
        vmovq   rax, xmm9
        mov     [r10], rax
        jmp     %%_return_T_done
%%_T_12:
        vmovq   rax, xmm9
        mov     [r10], rax
        vpsrldq xmm9, xmm9, 8
        vmovd   eax, xmm9
        mov     [r10 + 8], eax
        jmp     %%_return_T_done

%%_T_16:
        vmovdqu [r10], xmm9

%%_return_T_done:
%endmacro ; GCM_COMPLETE


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_precomp_128_avx_gen4 /
;       aes_gcm_precomp_192_avx_gen4 /
;       aes_gcm_precomp_256_avx_gen4
;       (struct gcm_key_data *key_data)
;
; Derives the GHASH subkey H = E(K, 0^128), reflects it (H<<1 mod poly)
; and precomputes the HashKey powers used by the parallel GHASH code.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(precomp,_),function,)
FN_NAME(precomp,_):
        push    r12
        push    r13
        push    r14
        push    r15

        mov     r14, rsp        ; keep original rsp for restore



        sub     rsp, VARIABLE_OFFSET
        and     rsp, ~63                                ; align rsp to 64 bytes

%ifidn __OUTPUT_FORMAT__, win64
        ; only xmm6 needs to be maintained
        vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
%endif

        vpxor   xmm6, xmm6
        ENCRYPT_SINGLE_BLOCK    arg1, xmm6              ; xmm6 = HashKey = E(K, 0^128)

        vpshufb xmm6, [rel SHUF_MASK]
        ;;;;;;;;;;;;;;;  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
        vmovdqa xmm2, xmm6
        vpsllq  xmm6, xmm6, 1
        vpsrlq  xmm2, xmm2, 63
        vmovdqa xmm1, xmm2
        vpslldq xmm2, xmm2, 8
        vpsrldq xmm1, xmm1, 8
        vpor    xmm6, xmm6, xmm2
        ;reduction: conditionally XOR the polynomial if the shifted-out bit was set
        vpshufd xmm2, xmm1, 00100100b
        vpcmpeqd xmm2, [TWOONE]
        vpand   xmm2, xmm2, [POLY]
        vpxor   xmm6, xmm6, xmm2                        ; xmm6 holds the HashKey<<1 mod poly
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        vmovdqu [arg1 + HashKey], xmm6                  ; store HashKey<<1 mod poly


        PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
%endif
        mov     rsp, r14

        pop     r15
        pop     r14
        pop     r13
        pop     r12
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
MKGLOBAL(FN_NAME(init,_),function,)
FN_NAME(init,_):
        push    r12
        push    r13
%ifidn __OUTPUT_FORMAT__, win64
        push    r14
        push    r15
        mov     r14, rsp
        ; xmm6:xmm15 need to be maintained for Windows
        ;; NOTE(review): only xmm6 is actually saved here although GCM_INIT
        ;; also writes xmm14 (AAD_HASH) and xmm1-xmm5 -- xmm14 is a Windows
        ;; non-volatile register; verify against upstream intel-ipsec-mb.
        sub     rsp, 1*16
        movdqu  [rsp + 0*16], xmm6
%endif

        GCM_INIT arg1, arg2, arg3, arg4, arg5

%ifidn __OUTPUT_FORMAT__, win64
        movdqu  xmm6 , [rsp + 0*16]
        mov     rsp, r14
        pop     r15
        pop     r14
%endif
        pop     r13
        pop     r12
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 /
;       aes_gcm_enc_256_update_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Streaming encrypt update: processes the next chunk of plaintext for a
;; context previously set up with the init entry point.
MKGLOBAL(FN_NAME(enc,_update_),function,)
FN_NAME(enc,_update_):

        FUNC_SAVE

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call

        FUNC_RESTORE

        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 /
;       aes_gcm_dec_256_update_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Streaming decrypt update: same flow as enc_update with the DEC selector.
MKGLOBAL(FN_NAME(dec,_update_),function,)
FN_NAME(dec,_update_):

        FUNC_SAVE

        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call

        FUNC_RESTORE

        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 /
;       aes_gcm_enc_256_finalize_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Finalize encryption: flushes any pending partial block into GHASH and
;; writes the authentication tag.  Saves only the xmm regs GCM_COMPLETE uses.
MKGLOBAL(FN_NAME(enc,_finalize_),function,)
FN_NAME(enc,_finalize_):

        push    r12

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        sub     rsp, 5*16
        vmovdqu [rsp + 0*16], xmm6
        vmovdqu [rsp + 1*16], xmm9
        vmovdqu [rsp + 2*16], xmm11
        vmovdqu [rsp + 3*16], xmm14
        vmovdqu [rsp + 4*16], xmm15
%endif
        GCM_COMPLETE    arg1, arg2, arg3, arg4, ENC, multi_call

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm15, [rsp + 4*16]
        vmovdqu xmm14, [rsp + 3*16]
        vmovdqu xmm11, [rsp + 2*16]
        vmovdqu xmm9,  [rsp + 1*16]
        vmovdqu xmm6,  [rsp + 0*16]
        add     rsp, 5*16
%endif

        pop     r12
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4
;       aes_gcm_dec_256_finalize_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Finalize decryption: computes the expected tag; the caller compares it
;; against the received tag.
MKGLOBAL(FN_NAME(dec,_finalize_),function,)
FN_NAME(dec,_finalize_):

        push    r12

%ifidn __OUTPUT_FORMAT__, win64
        ; xmm6:xmm15 need to be maintained for Windows
        sub     rsp, 5*16
        vmovdqu [rsp + 0*16], xmm6
        vmovdqu [rsp + 1*16], xmm9
        vmovdqu [rsp + 2*16], xmm11
        vmovdqu [rsp + 3*16], xmm14
        vmovdqu [rsp + 4*16], xmm15
%endif
        GCM_COMPLETE    arg1, arg2, arg3, arg4, DEC, multi_call

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqu xmm15, [rsp + 4*16]
        vmovdqu xmm14, [rsp + 3*16]
        vmovdqu xmm11, [rsp + 2*16]
        vmovdqu xmm9,  [rsp + 1*16]
        vmovdqu xmm6,  [rsp + 0*16]
        add     rsp, 5*16
%endif

        pop     r12
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; One-shot encrypt: init + update + finalize in a single call.
MKGLOBAL(FN_NAME(enc,_),function,)
FN_NAME(enc,_):

        FUNC_SAVE

        GCM_INIT arg1, arg2, arg6, arg7, arg8

        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, ENC, single_call

        GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call

        FUNC_RESTORE

        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;void   aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4
;       (const struct gcm_key_data *key_data,
;        struct gcm_context_data *context_data,
;        u8 *out,
;        const u8 *in,
;        u64 plaintext_len,
;        u8 *iv,
;        const u8 *aad,
;        u64 aad_len,
;        u8 *auth_tag,
;        u64 auth_tag_len);
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; One-shot decrypt: init + update + finalize in a single call.
MKGLOBAL(FN_NAME(dec,_),function,)
FN_NAME(dec,_):

        FUNC_SAVE

        GCM_INIT arg1, arg2, arg6, arg7, arg8

        GCM_ENC_DEC  arg1, arg2, arg3, arg4, arg5, DEC, single_call

        GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call

        FUNC_RESTORE

        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif