diff options
Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm')
-rw-r--r-- | src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm | 4272 |
1 files changed, 4272 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm new file mode 100644 index 000000000..4ef183d31 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx512/gcm_vaes_avx512.asm @@ -0,0 +1,4272 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2018-2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; Tomasz Kantecki +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; The details of the implementation is explained in: +; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012. +; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "include/os.asm" +%include "include/reg_sizes.asm" +%include "include/clear_regs.asm" +%include "include/gcm_defines.asm" +%include "include/gcm_keys_vaes_avx512.asm" +%include "include/memcpy.asm" +%include "include/aes_common.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_avx512.asm!" +%endif +%endif +%endif + +;; Decide on AES-GCM key size to compile for +%ifdef GCM128_MODE +%define NROUNDS 9 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ vaes_avx512 +%endif + +%ifdef GCM192_MODE +%define NROUNDS 11 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ vaes_avx512 +%endif + +%ifdef GCM256_MODE +%define NROUNDS 13 +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ vaes_avx512 +%endif + +section .text +default rel + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Stack frame definition +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE (10*16) ; space for 10 XMM registers + %define GP_STORAGE ((9*8) + 24) ; space for 9 GP registers + 24 bytes for 64 byte alignment +%else + %define XMM_STORAGE 0 + %define GP_STORAGE (8*8) ; space for 7 GP registers + 1 for alignment +%endif +%ifdef GCM_BIG_DATA +%define LOCAL_STORAGE (128*16) ; space for up to 128 AES blocks +%else +%define LOCAL_STORAGE (48*16) ; space for up to 48 AES blocks +%endif + +;;; sequence is (bottom-up): GP, XMM, local +%define STACK_GP_OFFSET 0 +%define STACK_XMM_OFFSET (STACK_GP_OFFSET + GP_STORAGE) +%define STACK_LOCAL_OFFSET (STACK_XMM_OFFSET + XMM_STORAGE) +%define STACK_FRAME_SIZE (STACK_LOCAL_OFFSET + LOCAL_STORAGE) + +;; for compatibility with stack argument definitions in gcm_defines.asm +%define STACK_OFFSET 0 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; =========================================================================== +;;; =========================================================================== +;;; Horizontal XOR - 4 x 128bits xored together +%macro VHPXORI4x128 2 +%define %%REG %1 ; [in/out] ZMM with 4x128bits to xor; 128bit output +%define %%TMP %2 ; [clobbered] ZMM temporary register + vextracti64x4 YWORD(%%TMP), %%REG, 1 + vpxorq YWORD(%%REG), YWORD(%%REG), YWORD(%%TMP) + vextracti32x4 XWORD(%%TMP), YWORD(%%REG), 1 + vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP) +%endmacro ; VHPXORI4x128 + +;;; =========================================================================== +;;; =========================================================================== +;;; Horizontal XOR - 2 x 128bits xored together +%macro VHPXORI2x128 2 +%define %%REG %1 ; [in/out] YMM/ZMM with 2x128bits to xor; 128bit output +%define %%TMP %2 ; [clobbered] XMM/YMM/ZMM temporary register + vextracti32x4 XWORD(%%TMP), %%REG, 1 + vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP) +%endmacro ; VHPXORI2x128 + +;;; =========================================================================== +;;; =========================================================================== +;;; schoolbook multiply - 1st step +%macro VCLMUL_STEP1 6-7 +%define %%KP %1 ; [in] key pointer +%define %%HI %2 ; [in] previous blocks 4 to 7 +%define %%TMP %3 ; [clobbered] ZMM/YMM/XMM temporary +%define %%TH %4 ; [out] high product +%define %%TM %5 ; [out] medium product +%define %%TL %6 ; [out] low product +%define %%HKEY %7 ; [in/optional] hash key for multiplication + +%if %0 == 6 + vmovdqu64 %%TMP, [%%KP + HashKey_4] +%else + vmovdqa64 %%TMP, %%HKEY +%endif + vpclmulqdq %%TH, %%HI, %%TMP, 0x11 ; %%T5 = a1*b1 + vpclmulqdq %%TL, %%HI, %%TMP, 0x00 ; %%T7 = a0*b0 + vpclmulqdq %%TM, %%HI, %%TMP, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%TMP, %%HI, %%TMP, 0x10 ; %%T4 = a0*b1 + vpxorq %%TM, %%TM, %%TMP ; [%%TH : %%TM : %%TL] +%endmacro ; VCLMUL_STEP1 + +;;; =========================================================================== +;;; =========================================================================== +;;; schoolbook multiply - 2nd step +%macro VCLMUL_STEP2 9-11 +%define %%KP %1 ; [in] key pointer +%define %%HI %2 ; [out] ghash high 128 bits +%define %%LO %3 ; [in/out] cipher text blocks 0-3 (in); ghash low 128 bits (out) +%define %%TMP0 %4 ; [clobbered] ZMM/YMM/XMM temporary +%define %%TMP1 %5 ; [clobbered] ZMM/YMM/XMM temporary +%define %%TMP2 %6 ; [clobbered] ZMM/YMM/XMM temporary +%define %%TH %7 ; [in] high product +%define %%TM %8 ; [in] medium product +%define %%TL %9 ; [in] low product +%define %%HKEY %10 ; [in/optional] hash key for multiplication +%define %%HXOR %11 ; [in/optional] type of horizontal xor (4 - 4x128; 2 - 2x128; 1 - none) + +%if %0 == 9 + vmovdqu64 %%TMP0, [%%KP + HashKey_8] +%else + vmovdqa64 %%TMP0, %%HKEY +%endif + vpclmulqdq %%TMP1, %%LO, %%TMP0, 0x10 ; %%TMP1 = a0*b1 + vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x11 ; %%TMP2 = a1*b1 + vpxorq %%TH, %%TH, %%TMP2 + vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x00 ; %%TMP2 = a0*b0 + vpxorq %%TL, %%TL, %%TMP2 + vpclmulqdq %%TMP0, %%LO, %%TMP0, 0x01 ; %%TMP0 = a1*b0 + vpternlogq %%TM, %%TMP1, %%TMP0, 0x96 ; %%TM = TM xor TMP1 xor TMP0 + + ;; finish multiplications + vpsrldq %%TMP2, %%TM, 8 + vpxorq %%HI, %%TH, %%TMP2 + vpslldq %%TMP2, %%TM, 8 + vpxorq %%LO, %%TL, %%TMP2 + + ;; xor 128bit words horizontally and compute [(X8*H1) + (X7*H2) + ... ((X1+Y0)*H8] + ;; note: (X1+Y0) handled elsewhere +%if %0 < 11 + VHPXORI4x128 %%HI, %%TMP2 + VHPXORI4x128 %%LO, %%TMP1 +%else +%if %%HXOR == 4 + VHPXORI4x128 %%HI, %%TMP2 + VHPXORI4x128 %%LO, %%TMP1 +%elif %%HXOR == 2 + VHPXORI2x128 %%HI, %%TMP2 + VHPXORI2x128 %%LO, %%TMP1 +%endif ; HXOR + ;; for HXOR == 1 there is nothing to be done +%endif ; !(%0 < 11) + ;; HIx holds top 128 bits + ;; LOx holds low 128 bits + ;; - further reductions to follow +%endmacro ; VCLMUL_STEP2 + +;;; =========================================================================== +;;; =========================================================================== +;;; AVX512 reduction macro +%macro VCLMUL_REDUCE 6 +%define %%OUT %1 ; [out] zmm/ymm/xmm: result (must not be %%TMP1 or %%HI128) +%define %%POLY %2 ; [in] zmm/ymm/xmm: polynomial +%define %%HI128 %3 ; [in] zmm/ymm/xmm: high 128b of hash to reduce +%define %%LO128 %4 ; [in] zmm/ymm/xmm: low 128b of hash to reduce +%define %%TMP0 %5 ; [in] zmm/ymm/xmm: temporary register +%define %%TMP1 %6 ; [in] zmm/ymm/xmm: temporary register + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; first phase of the reduction + vpclmulqdq %%TMP0, %%POLY, %%LO128, 0x01 + vpslldq %%TMP0, %%TMP0, 8 ; shift-L 2 DWs + vpxorq %%TMP0, %%LO128, %%TMP0 ; first phase of the reduction complete + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; second phase of the reduction + vpclmulqdq %%TMP1, %%POLY, %%TMP0, 0x00 + vpsrldq %%TMP1, %%TMP1, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R + + vpclmulqdq %%OUT, %%POLY, %%TMP0, 0x10 + vpslldq %%OUT, %%OUT, 4 ; shift-L 1-DW to obtain result with no shifts + + vpternlogq %%OUT, %%TMP1, %%HI128, 0x96 ; OUT/GHASH = OUT xor TMP1 xor HI128 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endmacro + +;;; =========================================================================== +;;; =========================================================================== +;;; schoolbook multiply (1 to 8 blocks) - 1st step +%macro VCLMUL_1_TO_8_STEP1 8 +%define %%KP %1 ; [in] key pointer +%define %%HI %2 ; [in] ZMM ciphered blocks 4 to 7 +%define %%TMP1 %3 ; [clobbered] ZMM temporary +%define %%TMP2 %4 ; [clobbered] ZMM temporary +%define %%TH %5 ; [out] ZMM high product +%define %%TM %6 ; [out] ZMM medium product +%define %%TL %7 ; [out] ZMM low product +%define %%NBLOCKS %8 ; [in] number of blocks to ghash (0 to 8) + +%if %%NBLOCKS == 8 + VCLMUL_STEP1 %%KP, %%HI, %%TMP1, %%TH, %%TM, %%TL +%elif %%NBLOCKS == 7 + vmovdqu64 %%TMP2, [%%KP + HashKey_3] + vmovdqa64 %%TMP1, [rel mask_out_top_block] + vpandq %%TMP2, %%TMP1 + vpandq %%HI, %%TMP1 + VCLMUL_STEP1 NULL, %%HI, %%TMP1, %%TH, %%TM, %%TL, %%TMP2 +%elif %%NBLOCKS == 6 + vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2] + VCLMUL_STEP1 NULL, YWORD(%%HI), YWORD(%%TMP1), \ + YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2) +%elif %%NBLOCKS == 5 + vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1] + VCLMUL_STEP1 NULL, XWORD(%%HI), XWORD(%%TMP1), \ + XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2) +%else + vpxorq %%TH, %%TH + vpxorq %%TM, %%TM + vpxorq %%TL, %%TL +%endif +%endmacro ; VCLMUL_1_TO_8_STEP1 + +;;; =========================================================================== +;;; =========================================================================== +;;; schoolbook multiply (1 to 8 blocks) - 2nd step +%macro VCLMUL_1_TO_8_STEP2 10 +%define %%KP %1 ; [in] key pointer +%define %%HI %2 ; [out] ZMM ghash high 128bits +%define %%LO %3 ; [in/out] ZMM ciphered blocks 0 to 3 (in); ghash low 128bits (out) +%define %%TMP0 %4 ; [clobbered] ZMM temporary +%define %%TMP1 %5 ; [clobbered] ZMM temporary +%define %%TMP2 %6 ; [clobbered] ZMM temporary +%define %%TH %7 ; [in/clobbered] ZMM high sum +%define %%TM %8 ; [in/clobbered] ZMM medium sum +%define %%TL %9 ; [in/clobbered] ZMM low sum +%define %%NBLOCKS %10 ; [in] number of blocks to ghash (0 to 8) + +%if %%NBLOCKS == 8 + VCLMUL_STEP2 %%KP, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL +%elif %%NBLOCKS == 7 + vmovdqu64 %%TMP2, [%%KP + HashKey_7] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 6 + vmovdqu64 %%TMP2, [%%KP + HashKey_6] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 5 + vmovdqu64 %%TMP2, [%%KP + HashKey_5] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 4 + vmovdqu64 %%TMP2, [%%KP + HashKey_4] + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 3 + vmovdqu64 %%TMP2, [%%KP + HashKey_3] + vmovdqa64 %%TMP1, [rel mask_out_top_block] + vpandq %%TMP2, %%TMP1 + vpandq %%LO, %%TMP1 + VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4 +%elif %%NBLOCKS == 2 + vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2] + VCLMUL_STEP2 NULL, YWORD(%%HI), YWORD(%%LO), \ + YWORD(%%TMP0), YWORD(%%TMP1), YWORD(%%TMP2), \ + YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2), 2 +%elif %%NBLOCKS == 1 + vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1] + VCLMUL_STEP2 NULL, XWORD(%%HI), XWORD(%%LO), \ + XWORD(%%TMP0), XWORD(%%TMP1), XWORD(%%TMP2), \ + XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2), 1 +%else + vpxorq %%HI, %%HI + vpxorq %%LO, %%LO +%endif +%endmacro ; VCLMUL_1_TO_8_STEP2 + +;;; =========================================================================== +;;; =========================================================================== +;;; GHASH 1 to 16 blocks of cipher text +;;; - performs reduction at the end +;;; - can take intermediate GHASH sums as input +%macro GHASH_1_TO_16 20 +%define %%KP %1 ; [in] pointer to expanded keys +%define %%GHASH %2 ; [out] ghash output +%define %%T1 %3 ; [clobbered] temporary ZMM +%define %%T2 %4 ; [clobbered] temporary ZMM +%define %%T3 %5 ; [clobbered] temporary ZMM +%define %%T4 %6 ; [clobbered] temporary ZMM +%define %%T5 %7 ; [clobbered] temporary ZMM +%define %%T6 %8 ; [clobbered] temporary ZMM +%define %%T7 %9 ; [clobbered] temporary ZMM +%define %%T8 %10 ; [clobbered] temporary ZMM +%define %%T9 %11 ; [clobbered] temporary ZMM +%define %%GH %12 ; [in/cloberred] ghash sum (high) or "no_zmm" +%define %%GL %13 ; [in/cloberred] ghash sum (low) or "no_zmm" +%define %%GM %14 ; [in/cloberred] ghash sum (medium) or "no_zmm" +%define %%AAD_HASH_IN %15 ; [in] input hash value +%define %%CIPHER_IN0 %16 ; [in] ZMM with cipher text blocks 0-3 +%define %%CIPHER_IN1 %17 ; [in] ZMM with cipher text blocks 4-7 +%define %%CIPHER_IN2 %18 ; [in] ZMM with cipher text blocks 8-11 +%define %%CIPHER_IN3 %19 ; [in] ZMM with cipher text blocks 12-15 +%define %%NUM_BLOCKS %20 ; [in] numerical value, number of blocks + +%define %%T0H %%T1 +%define %%T0L %%T2 +%define %%T0M1 %%T3 +%define %%T0M2 %%T4 + +%define %%T1H %%T5 +%define %%T1L %%T6 +%define %%T1M1 %%T7 +%define %%T1M2 %%T8 + +%define %%HK %%T9 + +%assign hashk HashKey_ %+ %%NUM_BLOCKS +%assign reg_idx 0 +%assign blocks_left %%NUM_BLOCKS + + vpxorq %%CIPHER_IN0, %%CIPHER_IN0, %%AAD_HASH_IN + +%assign first_result 1 + +%ifnidn %%GH, no_zmm +%ifnidn %%GM, no_zmm +%ifnidn %%GL, no_zmm + ;; GHASH sums passed in to be updated and + ;; reduced at the end + vmovdqa64 %%T0H, %%GH + vmovdqa64 %%T0L, %%GL + vmovdqa64 %%T0M1, %%GM + vpxorq %%T0M2, %%T0M2 +%assign first_result 0 +%endif +%endif +%endif + +%rep (blocks_left / 4) +%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx + vmovdqu64 %%HK, [%%KP + hashk] +%if first_result == 1 + vpclmulqdq %%T0H, %%REG_IN, %%HK, 0x11 ; H = a1*b1 + vpclmulqdq %%T0L, %%REG_IN, %%HK, 0x00 ; L = a0*b0 + vpclmulqdq %%T0M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0 + vpclmulqdq %%T0M2, %%REG_IN, %%HK, 0x10 ; TM2 = a0*b1 +%assign first_result 0 +%else + vpclmulqdq %%T1H, %%REG_IN, %%HK, 0x11 ; H = a1*b1 + vpclmulqdq %%T1L, %%REG_IN, %%HK, 0x00 ; L = a0*b0 + vpclmulqdq %%T1M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0 + vpclmulqdq %%T1M2, %%REG_IN, %%HK, 0x10 ; M2 = a0*b1 + vpxorq %%T0H, %%T0H, %%T1H + vpxorq %%T0L, %%T0L, %%T1L + vpxorq %%T0M1, %%T0M1, %%T1M1 + vpxorq %%T0M2, %%T0M2, %%T1M2 +%endif +%undef %%REG_IN +%assign reg_idx (reg_idx + 1) +%assign hashk (hashk + 64) +%assign blocks_left (blocks_left - 4) +%endrep + +%if blocks_left > 0 +;; There are 1, 2 or 3 blocks left to process. +;; It may also be that they are the only blocks to process. + +%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx + +%if first_result == 1 +;; Case where %%NUM_BLOCKS = 1, 2 or 3 +%xdefine %%OUT_H %%T0H +%xdefine %%OUT_L %%T0L +%xdefine %%OUT_M1 %%T0M1 +%xdefine %%OUT_M2 %%T0M2 +%else +%xdefine %%OUT_H %%T1H +%xdefine %%OUT_L %%T1L +%xdefine %%OUT_M1 %%T1M1 +%xdefine %%OUT_M2 %%T1M2 +%endif + +%if blocks_left == 1 + vmovdqu64 XWORD(%%HK), [%%KP + hashk] + vpclmulqdq XWORD(%%OUT_H), XWORD(%%REG_IN), XWORD(%%HK), 0x11 ; %%TH = a1*b1 + vpclmulqdq XWORD(%%OUT_L), XWORD(%%REG_IN), XWORD(%%HK), 0x00 ; %%TL = a0*b0 + vpclmulqdq XWORD(%%OUT_M1), XWORD(%%REG_IN), XWORD(%%HK), 0x01 ; %%TM1 = a1*b0 + vpclmulqdq XWORD(%%OUT_M2), XWORD(%%REG_IN), XWORD(%%HK), 0x10 ; %%TM2 = a0*b1 +%elif blocks_left == 2 + vmovdqu64 YWORD(%%HK), [%%KP + hashk] + vpclmulqdq YWORD(%%OUT_H), YWORD(%%REG_IN), YWORD(%%HK), 0x11 ; %%TH = a1*b1 + vpclmulqdq YWORD(%%OUT_L), YWORD(%%REG_IN), YWORD(%%HK), 0x00 ; %%TL = a0*b0 + vpclmulqdq YWORD(%%OUT_M1), YWORD(%%REG_IN), YWORD(%%HK), 0x01 ; %%TM1 = a1*b0 + vpclmulqdq YWORD(%%OUT_M2), YWORD(%%REG_IN), YWORD(%%HK), 0x10 ; %%TM2 = a0*b1 +%else ; blocks_left == 3 + vmovdqu64 YWORD(%%HK), [%%KP + hashk] + vinserti64x2 %%HK, [%%KP + hashk + 32], 2 + vpclmulqdq %%OUT_H, %%REG_IN, %%HK, 0x11 ; %%TH = a1*b1 + vpclmulqdq %%OUT_L, %%REG_IN, %%HK, 0x00 ; %%TL = a0*b0 + vpclmulqdq %%OUT_M1, %%REG_IN, %%HK, 0x01 ; %%TM1 = a1*b0 + vpclmulqdq %%OUT_M2, %%REG_IN, %%HK, 0x10 ; %%TM2 = a0*b1 +%endif ; blocks_left + +%undef %%REG_IN +%undef %%OUT_H +%undef %%OUT_L +%undef %%OUT_M1 +%undef %%OUT_M2 + +%if first_result != 1 + vpxorq %%T0H, %%T0H, %%T1H + vpxorq %%T0L, %%T0L, %%T1L + vpxorq %%T0M1, %%T0M1, %%T1M1 + vpxorq %%T0M2, %%T0M2, %%T1M2 +%endif + +%endif ; blocks_left > 0 + + ;; integrate TM into TH and TL + vpxorq %%T0M1, %%T0M1, %%T0M2 + vpsrldq %%T1M1, %%T0M1, 8 + vpslldq %%T1M2, %%T0M1, 8 + vpxorq %%T0H, %%T0H, %%T1M1 + vpxorq %%T0L, %%T0L, %%T1M2 + + ;; add TH and TL 128-bit words horizontally + VHPXORI4x128 %%T0H, %%T1M1 + VHPXORI4x128 %%T0L, %%T1M2 + + ;; reduction + vmovdqa64 XWORD(%%HK), [rel POLY2] + VCLMUL_REDUCE XWORD(%%GHASH), XWORD(%%HK), \ + XWORD(%%T0H), XWORD(%%T0L), XWORD(%%T0M1), XWORD(%%T0M2) +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +;;; Input: A and B (128-bits each, bit-reflected) +;;; Output: C = A*B*x mod poly, (i.e. >>1 ) +;;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +;;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1 + vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0 + vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0 + vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1 + vpxorq %%GH, %%GH, %%T3 + + + vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs + vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs + + vpxorq %%T1, %%T1, %%T3 + vpxorq %%GH, %%GH, %%T2 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;first phase of the reduction + vmovdqu64 %%T3, [rel POLY2] + + vpclmulqdq %%T2, %%T3, %%GH, 0x01 + vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs + + vpxorq %%GH, %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;second phase of the reduction + vpclmulqdq %%T2, %%T3, %%GH, 0x00 + vpsrldq %%T2, %%T2, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R + + vpclmulqdq %%GH, %%T3, %%GH, 0x10 + vpslldq %%GH, %%GH, 4 ; Shift-L 1-DW to obtain result with no shifts + + ; second phase of the reduction complete, the result is in %%GH + vpternlogq %%GH, %%T1, %%T2, 0x96 ; GH = GH xor T1 xor T2 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512 +;;; functions, but are kept to allow users to switch cpu architectures between calls +;;; of pre, init, update, and finalize. +%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + + vmovdqa %%T5, %%HK + + ;; GHASH keys 2 to 48 or 128 +%ifdef GCM_BIG_DATA +%assign max_hkey_idx 128 +%else +%assign max_hkey_idx 48 +%endif + +%assign i 2 +%rep (max_hkey_idx - 1) + GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^i<<1 mod poly + vmovdqu [%%GDATA + HashKey_ %+ i], %%T5 ; [HashKey_i] = %%T5 +%assign i (i + 1) +%endrep + +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; READ_SMALL_DATA_INPUT +;;; Packs xmm register with data when data input is less or equal to 16 bytes +;;; Returns 0 if data has length 0 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 5 +%define %%OUTPUT %1 ; [out] xmm register +%define %%INPUT %2 ; [in] buffer pointer to read from +%define %%LENGTH %3 ; [in] number of bytes to read +%define %%TMP1 %4 ; [clobbered] +%define %%MASK %5 ; [out] k1 to k7 register to store the partial block mask + + cmp %%LENGTH, 16 + jge %%_read_small_data_ge16 + lea %%TMP1, [rel byte_len_to_mask_table] +%ifidn __OUTPUT_FORMAT__, win64 + add %%TMP1, %%LENGTH + add %%TMP1, %%LENGTH + kmovw %%MASK, [%%TMP1] +%else + kmovw %%MASK, [%%TMP1 + %%LENGTH*2] +%endif + vmovdqu8 %%OUTPUT{%%MASK}{z}, [%%INPUT] + jmp %%_read_small_data_end +%%_read_small_data_ge16: + VX512LDR %%OUTPUT, [%%INPUT] + mov %%TMP1, 0xffff + kmovq %%MASK, %%TMP1 +%%_read_small_data_end: +%endmacro ; READ_SMALL_DATA_INPUT + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 18 +%define %%A_IN %1 ; [in] AAD text pointer +%define %%A_LEN %2 ; [in] AAD length +%define %%AAD_HASH %3 ; [out] xmm ghash value +%define %%GDATA_KEY %4 ; [in] pointer to keys +%define %%ZT0 %5 ; [clobbered] ZMM register +%define %%ZT1 %6 ; [clobbered] ZMM register +%define %%ZT2 %7 ; [clobbered] ZMM register +%define %%ZT3 %8 ; [clobbered] ZMM register +%define %%ZT4 %9 ; [clobbered] ZMM register +%define %%ZT5 %10 ; [clobbered] ZMM register +%define %%ZT6 %11 ; [clobbered] ZMM register +%define %%ZT7 %12 ; [clobbered] ZMM register +%define %%ZT8 %13 ; [clobbered] ZMM register +%define %%ZT9 %14 ; [clobbered] ZMM register +%define %%T1 %15 ; [clobbered] GP register +%define %%T2 %16 ; [clobbered] GP register +%define %%T3 %17 ; [clobbered] GP register +%define %%MASKREG %18 ; [clobbered] mask register + +%define %%SHFMSK %%ZT9 +%define %%POLY %%ZT8 +%define %%TH %%ZT7 +%define %%TM %%ZT6 +%define %%TL %%ZT5 + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + vpxorq %%AAD_HASH, %%AAD_HASH + + vmovdqa64 %%SHFMSK, [rel SHUF_MASK] + vmovdqa64 %%POLY, [rel POLY2] + +%%_get_AAD_loop128: + cmp %%T2, 128 + jl %%_exit_AAD_loop128 + + vmovdqu64 %%ZT2, [%%T1 + 64*0] ; LO blocks (0-3) + vmovdqu64 %%ZT1, [%%T1 + 64*1] ; HI blocks (4-7) + vpshufb %%ZT2, %%SHFMSK + vpshufb %%ZT1, %%SHFMSK + + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + + VCLMUL_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%TH, %%TM, %%TL + VCLMUL_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, %%ZT0, %%ZT3, %%ZT4, %%TH, %%TM, %%TL + + ;; result in %%ZT1(H):%%ZT2(L) + ;; reduce and put the result in AAD_HASH + VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \ + XWORD(%%ZT0), XWORD(%%ZT3) + + sub %%T2, 128 + je %%_CALC_AAD_done + + add %%T1, 128 + jmp %%_get_AAD_loop128 + +%%_exit_AAD_loop128: + or %%T2, %%T2 + jz %%_CALC_AAD_done + + ;; prep mask source address + lea %%T3, [rel byte64_len_to_mask_table] + lea %%T3, [%%T3 + %%T2*8] + + ;; calculate number of blocks to ghash (including partial bytes) + add %%T2, 15 + and %%T2, -16 ; 1 to 8 blocks possible here + shr %%T2, 4 + cmp %%T2, 7 + je %%_AAD_blocks_7 + cmp %%T2, 6 + je %%_AAD_blocks_6 + cmp %%T2, 5 + je %%_AAD_blocks_5 + cmp %%T2, 4 + je %%_AAD_blocks_4 + cmp %%T2, 3 + je %%_AAD_blocks_3 + cmp %%T2, 2 + je %%_AAD_blocks_2 + cmp %%T2, 1 + je %%_AAD_blocks_1 + ;; fall through for 8 blocks + + ;; The flow of each of these cases is identical: + ;; - load blocks plain text + ;; - shuffle loaded blocks + ;; - xor in current hash value into block 0 + ;; - perform up multiplications with ghash keys + ;; - jump to reduction code +%%_AAD_blocks_8: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb %%ZT1, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 8 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 8 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_7: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb %%ZT1, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 7 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 7 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_6: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 YWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb YWORD(%%ZT1), YWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 6 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 6 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_5: + sub %%T3, (64 * 8) + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2, [%%T1 + 64*0] + vmovdqu8 XWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1] + vpshufb %%ZT2, %%SHFMSK + vpshufb XWORD(%%ZT1), XWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 5 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 5 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_4: + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb %%ZT2, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 4 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 4 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_3: + kmovq %%MASKREG, [%%T3] + vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb %%ZT2, %%SHFMSK + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 3 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 3 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_2: + kmovq %%MASKREG, [%%T3] + vmovdqu8 YWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb YWORD(%%ZT2), YWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 2 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 2 + jmp %%_AAD_blocks_done + +%%_AAD_blocks_1: + kmovq %%MASKREG, [%%T3] + vmovdqu8 XWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0] + vpshufb XWORD(%%ZT2), XWORD(%%SHFMSK) + vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 1 + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \ + %%ZT0, %%ZT3, %%ZT4, \ + %%TH, %%TM, %%TL, 1 + +%%_AAD_blocks_done: + ;; Multiplications have been done. Do the reduction now + VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \ + XWORD(%%ZT0), XWORD(%%ZT3) +%%_CALC_AAD_done: + ;; result in AAD_HASH + +%endmacro ; CALC_AAD_HASH + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; PARTIAL_BLOCK +;;; Handles encryption/decryption and the tag partial blocks between +;;; update calls. +;;; Requires the input data be at least 1 byte long. +;;; Output: +;;; A cipher/plain of the first partial block (CYPH_PLAIN_OUT), +;;; AAD_HASH and updated GDATA_CTX +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 22 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer +%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length +%define %%DATA_OFFSET %6 ; [in/out] data offset (gets updated) +%define %%AAD_HASH %7 ; [out] updated GHASH value +%define %%ENC_DEC %8 ; [in] cipher direction +%define %%GPTMP0 %9 ; [clobbered] GP temporary register +%define %%GPTMP1 %10 ; [clobbered] GP temporary register +%define %%GPTMP2 %11 ; [clobbered] GP temporary register +%define %%ZTMP0 %12 ; [clobbered] ZMM temporary register +%define %%ZTMP1 %13 ; [clobbered] ZMM temporary register +%define %%ZTMP2 %14 ; [clobbered] ZMM temporary register +%define %%ZTMP3 %15 ; [clobbered] ZMM temporary register +%define %%ZTMP4 %16 ; [clobbered] ZMM temporary register +%define %%ZTMP5 %17 ; [clobbered] ZMM temporary register +%define %%ZTMP6 %18 ; [clobbered] ZMM temporary register +%define %%ZTMP7 %19 ; [clobbered] ZMM temporary register +%define %%ZTMP8 %20 ; [clobbered] ZMM temporary register +%define %%ZTMP9 %21 ; [clobbered] ZMM temporary register +%define %%MASKREG %22 ; [clobbered] mask temporary register + +%define %%XTMP0 XWORD(%%ZTMP0) +%define %%XTMP1 XWORD(%%ZTMP1) +%define %%XTMP2 XWORD(%%ZTMP2) +%define %%XTMP3 XWORD(%%ZTMP3) +%define %%XTMP4 XWORD(%%ZTMP4) +%define %%XTMP5 XWORD(%%ZTMP5) +%define %%XTMP6 XWORD(%%ZTMP6) +%define %%XTMP7 XWORD(%%ZTMP7) +%define %%XTMP8 XWORD(%%ZTMP8) +%define %%XTMP9 XWORD(%%ZTMP9) + +%define %%LENGTH %%GPTMP0 +%define %%IA0 %%GPTMP1 +%define %%IA1 %%GPTMP2 + + mov %%LENGTH, [%%GDATA_CTX + PBlockLen] + or %%LENGTH, %%LENGTH + je %%_partial_block_done ;Leave Macro if no partial blocks + + READ_SMALL_DATA_INPUT %%XTMP0, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%IA0, %%MASKREG + + ;; XTMP1 = my_ctx_data.partial_block_enc_key + vmovdqu64 %%XTMP1, [%%GDATA_CTX + PBlockEncKey] + vmovdqu64 %%XTMP2, [%%GDATA_KEY + HashKey] + + ;; adjust the shuffle mask pointer to be able to shift right %%LENGTH bytes + ;; (16 - %%LENGTH) is the number of bytes in plaintext mod 16) + lea %%IA0, [rel SHIFT_MASK] + add %%IA0, %%LENGTH + vmovdqu64 %%XTMP3, [%%IA0] ; shift right shuffle mask + vpshufb %%XTMP1, %%XTMP3 + +%ifidn %%ENC_DEC, DEC + ;; keep copy of cipher text in %%XTMP4 + vmovdqa64 %%XTMP4, %%XTMP0 +%endif + vpxorq %%XTMP1, %%XTMP0 ; Cyphertext XOR E(K, Yn) + + ;; Set %%IA1 to be the amount of data left in CYPH_PLAIN_IN after filling the block + ;; Determine if partial block is not being filled and shift mask accordingly + mov %%IA1, %%PLAIN_CYPH_LEN + add %%IA1, %%LENGTH + sub %%IA1, 16 + jge %%_no_extra_mask + sub %%IA0, %%IA1 +%%_no_extra_mask: + ;; get the appropriate mask to mask out bottom %%LENGTH bytes of %%XTMP1 + ;; - mask out bottom %%LENGTH bytes of %%XTMP1 + vmovdqu64 %%XTMP0, [%%IA0 + ALL_F - SHIFT_MASK] + vpand %%XTMP1, %%XTMP0 + +%ifidn %%ENC_DEC, DEC + vpand %%XTMP4, %%XTMP0 + vpshufb %%XTMP4, [rel SHUF_MASK] + vpshufb %%XTMP4, %%XTMP3 + vpxorq %%AAD_HASH, %%XTMP4 +%else + vpshufb %%XTMP1, [rel SHUF_MASK] + vpshufb %%XTMP1, %%XTMP3 + vpxorq %%AAD_HASH, %%XTMP1 +%endif + cmp %%IA1, 0 + jl %%_partial_incomplete + + ;; GHASH computation for the last <16 Byte block + GHASH_MUL %%AAD_HASH, %%XTMP2, %%XTMP5, %%XTMP6, %%XTMP7, %%XTMP8, %%XTMP9 + + mov qword [%%GDATA_CTX + PBlockLen], 0 + + ;; Set %%IA1 to be the number of bytes to write out + mov %%IA0, %%LENGTH + mov %%LENGTH, 16 + sub %%LENGTH, %%IA0 + jmp %%_enc_dec_done + +%%_partial_incomplete: +%ifidn __OUTPUT_FORMAT__, win64 + mov %%IA0, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], %%IA0 +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif + mov %%LENGTH, %%PLAIN_CYPH_LEN + +%%_enc_dec_done: + ;; output encrypted Bytes + + lea %%IA0, [rel byte_len_to_mask_table] + kmovw %%MASKREG, [%%IA0 + %%LENGTH*2] + vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH + +%ifidn %%ENC_DEC, ENC + ;; shuffle XTMP1 back to output as ciphertext + vpshufb %%XTMP1, [rel SHUF_MASK] + vpshufb %%XTMP1, %%XTMP3 +%endif + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{%%MASKREG}, %%XTMP1 + add %%DATA_OFFSET, %%LENGTH +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +%macro GHASH_SINGLE_MUL 9 +%define %%GDATA %1 +%define %%HASHKEY %2 +%define %%CIPHER %3 +%define %%STATE_11 %4 +%define %%STATE_00 %5 +%define %%STATE_MID %6 +%define %%T1 %7 +%define %%T2 %8 +%define %%FIRST %9 + + vmovdqu %%T1, [%%GDATA + %%HASHKEY] +%ifidn %%FIRST, first + vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1 + vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0 + vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0 + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1 + vpxor %%STATE_MID, %%STATE_MID, %%T2 +%else + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11 + vpxor %%STATE_11, %%STATE_11, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00 + vpxor %%STATE_00, %%STATE_00, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01 + vpxor %%STATE_MID, %%STATE_MID, %%T2 + + vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 + vpxor %%STATE_MID, %%STATE_MID, %%T2 +%endif + +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; This macro is used to "warm-up" pipeline for GHASH_8_ENCRYPT_8_PARALLEL +;;; macro code. It is called only for data lenghts 128 and above. +;;; The flow is as follows: +;;; - encrypt the initial %%num_initial_blocks blocks (can be 0) +;;; - encrypt the next 8 blocks and stitch with +;;; GHASH for the first %%num_initial_blocks +;;; - the last 8th block can be partial (lengths between 129 and 239) +;;; - partial block ciphering is handled within this macro +;;; - top bytes of such block are cleared for +;;; the subsequent GHASH calculations +;;; - PBlockEncKey needs to be setup in case of multi-call +;;; - top bytes of the block need to include encrypted counter block so that +;;; when handling partial block case text is read and XOR'ed against it. +;;; This needs to be in un-shuffled format. + +%macro INITIAL_BLOCKS 26-27 +%define %%GDATA_KEY %1 ; [in] pointer to GCM keys +%define %%GDATA_CTX %2 ; [in] pointer to GCM context +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer +%define %%LENGTH %5 ; [in/out] number of bytes to process +%define %%DATA_OFFSET %6 ; [in/out] data offset +%define %%num_initial_blocks %7 ; [in] can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%CTR %8 ; [in/out] XMM counter block +%define %%AAD_HASH %9 ; [in/out] ZMM with AAD hash +%define %%ZT1 %10 ; [out] ZMM cipher blocks 0-3 for GHASH +%define %%ZT2 %11 ; [out] ZMM cipher blocks 4-7 for GHASH +%define %%ZT3 %12 ; [clobbered] ZMM temporary +%define %%ZT4 %13 ; [clobbered] ZMM temporary +%define %%ZT5 %14 ; [clobbered] ZMM temporary +%define %%ZT6 %15 ; [clobbered] ZMM temporary +%define %%ZT7 %16 ; [clobbered] ZMM temporary +%define %%ZT8 %17 ; [clobbered] ZMM temporary +%define %%ZT9 %18 ; [clobbered] ZMM temporary +%define %%ZT10 %19 ; [clobbered] ZMM temporary +%define %%ZT11 %20 ; [clobbered] ZMM temporary +%define %%ZT12 %21 ; [clobbered] ZMM temporary +%define %%IA0 %22 ; [clobbered] GP temporary +%define %%IA1 %23 ; [clobbered] GP temporary +%define %%ENC_DEC %24 ; [in] ENC/DEC selector +%define %%MASKREG %25 ; [clobbered] mask register +%define %%SHUFMASK %26 ; [in] ZMM with BE/LE shuffle mask +%define %%PARTIAL_PRESENT %27 ; [in] "no_partial_block" option can be passed here (if length is guaranteed to be > 15*16 bytes) + +%define %%T1 XWORD(%%ZT1) +%define %%T2 XWORD(%%ZT2) +%define %%T3 XWORD(%%ZT3) +%define %%T4 XWORD(%%ZT4) +%define %%T5 XWORD(%%ZT5) +%define %%T6 XWORD(%%ZT6) +%define %%T7 XWORD(%%ZT7) +%define %%T8 XWORD(%%ZT8) +%define %%T9 XWORD(%%ZT9) + +%define %%TH %%ZT10 +%define %%TM %%ZT11 +%define %%TL %%ZT12 + +;; determine if partial block code needs to be added +%assign partial_block_possible 1 +%if %0 > 26 +%ifidn %%PARTIAL_PRESENT, no_partial_block +%assign partial_block_possible 0 +%endif +%endif + +%if %%num_initial_blocks > 0 + ;; prepare AES counter blocks +%if %%num_initial_blocks == 1 + vpaddd %%T3, %%CTR, [rel ONE] +%elif %%num_initial_blocks == 2 + vshufi64x2 YWORD(%%ZT3), YWORD(%%CTR), YWORD(%%CTR), 0 + vpaddd YWORD(%%ZT3), YWORD(%%ZT3), [rel ddq_add_1234] +%else + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234] + vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678] +%endif + + ;; extract new counter value (%%T3) + ;; shuffle the counters for AES rounds +%if %%num_initial_blocks <= 4 + vextracti32x4 %%CTR, %%ZT3, (%%num_initial_blocks - 1) +%else + vextracti32x4 %%CTR, %%ZT4, (%%num_initial_blocks - 5) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK + + ;; load plain/cipher text + ZMM_LOAD_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%ZT5, %%ZT6, no_zmm, no_zmm + + ;; AES rounds and XOR with plain/cipher text +%assign j 0 +%rep (NROUNDS + 2) + vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT1, j, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%num_initial_blocks, NROUNDS +%assign j (j + 1) +%endrep + + ;; write cipher/plain text back to output and + ;; zero bytes outside the mask before hashing + ZMM_STORE_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%ZT3, %%ZT4, no_zmm, no_zmm + + ;; Shuffle the cipher text blocks for hashing part + ;; ZT5 and ZT6 are expected outputs with blocks for hashing +%ifidn %%ENC_DEC, DEC + ;; Decrypt case + ;; - cipher blocks are in ZT5 & ZT6 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%else + ;; Encrypt case + ;; - cipher blocks are in ZT3 & ZT4 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%ZT5, %%ZT6, no_zmm, no_zmm, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%endif ; Encrypt + + ;; adjust data offset and length + sub %%LENGTH, (%%num_initial_blocks * 16) + add %%DATA_OFFSET, (%%num_initial_blocks * 16) + + ;; At this stage + ;; - ZT5:ZT6 include cipher blocks to be GHASH'ed + +%endif ; %%num_initial_blocks > 0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; - cipher of %%num_initial_blocks is done + ;; - prepare counter blocks for the next 8 blocks (ZT3 & ZT4) + ;; - save the last block in %%CTR + ;; - shuffle the blocks for AES + ;; - stitch encryption of the new blocks with + ;; GHASHING the previous blocks + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234] + vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678] + vextracti32x4 %%CTR, %%ZT4, 3 + + vpshufb %%ZT3, %%SHUFMASK + vpshufb %%ZT4, %%SHUFMASK + +%if partial_block_possible != 0 + ;; get text load/store mask (assume full mask by default) + mov %%IA0, 0xffff_ffff_ffff_ffff +%if %%num_initial_blocks > 0 + ;; NOTE: 'jge' is always taken for %%num_initial_blocks = 0 + ;; This macro is executed for lenght 128 and up, + ;; zero length is checked in GCM_ENC_DEC. + ;; We know there is partial block if: + ;; LENGTH - 16*num_initial_blocks < 128 + cmp %%LENGTH, 128 + jge %%_initial_partial_block_continue + mov %%IA1, rcx + mov rcx, 128 + sub rcx, %%LENGTH + shr %%IA0, cl + mov rcx, %%IA1 +%%_initial_partial_block_continue: +%endif + kmovq %%MASKREG, %%IA0 + ;; load plain or cipher text (masked) + ZMM_LOAD_MASKED_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, %%MASKREG +%else + ;; load plain or cipher text + ZMM_LOAD_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%ZT1, %%ZT2, no_zmm, no_zmm +%endif ;; partial_block_possible + + ;; === AES ROUND 0 +%assign aes_round 0 + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) + + ;; === GHASH blocks 4-7 +%if (%%num_initial_blocks > 0) + ;; Hash in AES state + vpxorq %%ZT5, %%ZT5, %%AAD_HASH + + VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT6, %%ZT8, %%ZT9, \ + %%TH, %%TM, %%TL, %%num_initial_blocks +%endif + + ;; === [1/3] of AES rounds + +%rep ((NROUNDS + 1) / 3) + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; %rep ((NROUNDS + 1) / 2) + + ;; === GHASH blocks 0-3 and gather +%if (%%num_initial_blocks > 0) + VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT6, %%ZT5, \ + %%ZT7, %%ZT8, %%ZT9, \ + %%TH, %%TM, %%TL, %%num_initial_blocks +%endif + + ;; === [2/3] of AES rounds + +%rep ((NROUNDS + 1) / 3) + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; %rep ((NROUNDS + 1) / 2) + + ;; === GHASH reduction + +%if (%%num_initial_blocks > 0) + ;; [out] AAD_HASH - hash output + ;; [in] T8 - polynomial + ;; [in] T6 - high, T5 - low + ;; [clobbered] T9, T7 - temporary + vmovdqu64 %%T8, [rel POLY2] + VCLMUL_REDUCE XWORD(%%AAD_HASH), %%T8, %%T6, %%T5, %%T7, %%T9 +%endif + + ;; === [3/3] of AES rounds + +%rep (((NROUNDS + 1) / 3) + 2) +%if aes_round < (NROUNDS + 2) + vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \ + %%ZT8, aes_round, \ + %%ZT1, %%ZT2, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif +%endrep ; %rep ((NROUNDS + 1) / 2) + +%if partial_block_possible != 0 + ;; write cipher/plain text back to output and + ;; zero bytes outside the mask before hashing + ZMM_STORE_MASKED_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%ZT3, %%ZT4, no_zmm, no_zmm, %%MASKREG + ;; check if there is partial block + cmp %%LENGTH, 128 + jl %%_initial_save_partial + ;; adjust offset and length + add %%DATA_OFFSET, 128 + sub %%LENGTH, 128 + jmp %%_initial_blocks_done +%%_initial_save_partial: + ;; partial block case + ;; - save the partial block in unshuffled format + ;; - ZT4 is partially XOR'ed with data and top bytes contain + ;; encrypted counter block only + ;; - save number of bytes process in the partial block + ;; - adjust offset and zero the length + ;; - clear top bytes of the partial block for subsequent GHASH calculations + vextracti32x4 [%%GDATA_CTX + PBlockEncKey], %%ZT4, 3 + add %%DATA_OFFSET, %%LENGTH + sub %%LENGTH, (128 - 16) + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + xor %%LENGTH, %%LENGTH + vmovdqu8 %%ZT4{%%MASKREG}{z}, %%ZT4 +%%_initial_blocks_done: +%else + ZMM_STORE_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%ZT3, %%ZT4, no_zmm, no_zmm + add %%DATA_OFFSET, 128 + sub %%LENGTH, 128 +%endif ;; partial_block_possible + + ;; Shuffle AES result for GHASH. +%ifidn %%ENC_DEC, DEC + ;; Decrypt case + ;; - cipher blocks are in ZT1 & ZT2 + vpshufb %%ZT1, %%SHUFMASK + vpshufb %%ZT2, %%SHUFMASK +%else + ;; Encrypt case + ;; - cipher blocks are in ZT3 & ZT4 + vpshufb %%ZT1, %%ZT3, %%SHUFMASK + vpshufb %%ZT2, %%ZT4, %%SHUFMASK +%endif ; Encrypt + + ;; Current hash value is in AAD_HASH + + ;; Combine GHASHed value with the corresponding ciphertext + vpxorq %%ZT1, %%ZT1, %%AAD_HASH + +%endmacro ; INITIAL_BLOCKS +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block. +;;; It may look similar to INITIAL_BLOCKS but its usage is different: +;;; - first encrypts/decrypts required number of blocks and then +;;; ghashes these blocks +;;; - Small packets or left over data chunks (<256 bytes) +;;; - single or multi call +;;; - Remaining data chunks below 256 bytes (multi buffer code) +;;; +;;; num_initial_blocks is expected to include the partial final block +;;; in the count. +%macro INITIAL_BLOCKS_PARTIAL 41 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] text out pointer +%define %%PLAIN_CYPH_IN %4 ; [in] text out pointer +%define %%LENGTH %5 ; [in/clobbered] length in bytes +%define %%DATA_OFFSET %6 ; [in/out] current data offset (updated) +%define %%num_initial_blocks %7 ; [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0) +%define %%CTR %8 ; [in/out] current counter value +%define %%HASH_IN_OUT %9 ; [in/out] XMM ghash in/out value +%define %%ENC_DEC %10 ; [in] cipher direction (ENC/DEC) +%define %%INSTANCE_TYPE %11 ; [in] multi_call or single_call +%define %%ZT0 %12 ; [clobbered] ZMM temporary +%define %%ZT1 %13 ; [clobbered] ZMM temporary +%define %%ZT2 %14 ; [clobbered] ZMM temporary +%define %%ZT3 %15 ; [clobbered] ZMM temporary +%define %%ZT4 %16 ; [clobbered] ZMM temporary +%define %%ZT5 %17 ; [clobbered] ZMM temporary +%define %%ZT6 %18 ; [clobbered] ZMM temporary +%define %%ZT7 %19 ; [clobbered] ZMM temporary +%define %%ZT8 %20 ; [clobbered] ZMM temporary +%define %%ZT9 %21 ; [clobbered] ZMM temporary +%define %%ZT10 %22 ; [clobbered] ZMM temporary +%define %%ZT11 %23 ; [clobbered] ZMM temporary +%define %%ZT12 %24 ; [clobbered] ZMM temporary +%define %%ZT13 %25 ; [clobbered] ZMM temporary +%define %%ZT14 %26 ; [clobbered] ZMM temporary +%define %%ZT15 %27 ; [clobbered] ZMM temporary +%define %%ZT16 %28 ; [clobbered] ZMM temporary +%define %%ZT17 %29 ; [clobbered] ZMM temporary +%define %%ZT18 %30 ; [clobbered] ZMM temporary +%define %%ZT19 %31 ; [clobbered] ZMM temporary +%define %%ZT20 %32 ; [clobbered] ZMM temporary +%define %%ZT21 %33 ; [clobbered] ZMM temporary +%define %%ZT22 %34 ; [clobbered] ZMM temporary +%define %%GH %35 ; [in] ZMM ghash sum (high) +%define %%GL %36 ; [in] ZMM ghash sum (low) +%define %%GM %37 ; [in] ZMM ghash sum (middle) +%define %%IA0 %38 ; [clobbered] GP temporary +%define %%IA1 %39 ; [clobbered] GP temporary +%define %%MASKREG %40 ; [clobbered] mask register +%define %%SHUFMASK %41 ; [in] ZMM with BE/LE shuffle mask + +%define %%T1 XWORD(%%ZT1) +%define %%T2 XWORD(%%ZT2) +%define %%T7 XWORD(%%ZT7) + +%define %%CTR0 %%ZT3 +%define %%CTR1 %%ZT4 +%define %%CTR2 %%ZT8 +%define %%CTR3 %%ZT9 + +%define %%DAT0 %%ZT5 +%define %%DAT1 %%ZT6 +%define %%DAT2 %%ZT10 +%define %%DAT3 %%ZT11 + +%ifnidn %%GH, no_zmm +%ifnidn %%GL, no_zmm +%ifnidn %%GM, no_zmm + ;; when temporary sums are passed then zero HASH IN value + ;; - whatever it holds it is invalid in this case + vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT +%endif +%endif +%endif + ;; Copy ghash to temp reg + vmovdqa64 %%T2, %%HASH_IN_OUT + + ;; prepare AES counter blocks +%if %%num_initial_blocks == 1 + vpaddd XWORD(%%CTR0), %%CTR, [rel ONE] +%elif %%num_initial_blocks == 2 + vshufi64x2 YWORD(%%CTR0), YWORD(%%CTR), YWORD(%%CTR), 0 + vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_add_1234] +%else + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%CTR0, ZWORD(%%CTR), [rel ddq_add_1234] +%if %%num_initial_blocks > 4 + vpaddd %%CTR1, ZWORD(%%CTR), [rel ddq_add_5678] +%endif +%if %%num_initial_blocks > 8 + vpaddd %%CTR2, %%CTR0, [rel ddq_add_8888] +%endif +%if %%num_initial_blocks > 12 + vpaddd %%CTR3, %%CTR1, [rel ddq_add_8888] +%endif +%endif + + ;; get load/store mask + lea %%IA0, [rel byte64_len_to_mask_table] + mov %%IA1, %%LENGTH +%if %%num_initial_blocks > 12 + sub %%IA1, 3 * 64 +%elif %%num_initial_blocks > 8 + sub %%IA1, 2 * 64 +%elif %%num_initial_blocks > 4 + sub %%IA1, 64 +%endif + kmovq %%MASKREG, [%%IA0 + %%IA1*8] + + ;; extract new counter value + ;; shuffle the counters for AES rounds +%if %%num_initial_blocks <= 4 + vextracti32x4 %%CTR, %%CTR0, (%%num_initial_blocks - 1) +%elif %%num_initial_blocks <= 8 + vextracti32x4 %%CTR, %%CTR1, (%%num_initial_blocks - 5) +%elif %%num_initial_blocks <= 12 + vextracti32x4 %%CTR, %%CTR2, (%%num_initial_blocks - 9) +%else + vextracti32x4 %%CTR, %%CTR3, (%%num_initial_blocks - 13) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK + + ;; load plain/cipher text + ZMM_LOAD_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASKREG + + ;; AES rounds and XOR with plain/cipher text +%assign j 0 +%rep (NROUNDS + 2) + vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%ZT1, j, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%num_initial_blocks, NROUNDS +%assign j (j + 1) +%endrep + + ;; retrieve the last cipher counter block (partially XOR'ed with text) + ;; - this is needed for partial block cases +%if %%num_initial_blocks <= 4 + vextracti32x4 %%T1, %%CTR0, (%%num_initial_blocks - 1) +%elif %%num_initial_blocks <= 8 + vextracti32x4 %%T1, %%CTR1, (%%num_initial_blocks - 5) +%elif %%num_initial_blocks <= 12 + vextracti32x4 %%T1, %%CTR2, (%%num_initial_blocks - 9) +%else + vextracti32x4 %%T1, %%CTR3, (%%num_initial_blocks - 13) +%endif + + ;; write cipher/plain text back to output and + ZMM_STORE_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG + + ;; zero bytes outside the mask before hashing +%if %%num_initial_blocks <= 4 + vmovdqu8 %%CTR0{%%MASKREG}{z}, %%CTR0 +%elif %%num_initial_blocks <= 8 + vmovdqu8 %%CTR1{%%MASKREG}{z}, %%CTR1 +%elif %%num_initial_blocks <= 12 + vmovdqu8 %%CTR2{%%MASKREG}{z}, %%CTR2 +%else + vmovdqu8 %%CTR3{%%MASKREG}{z}, %%CTR3 +%endif + + ;; Shuffle the cipher text blocks for hashing part + ;; ZT5 and ZT6 are expected outputs with blocks for hashing +%ifidn %%ENC_DEC, DEC + ;; Decrypt case + ;; - cipher blocks are in ZT5 & ZT6 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%else + ;; Encrypt case + ;; - cipher blocks are in CTR0-CTR3 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK +%endif ; Encrypt + + ;; Extract the last block for partials and multi_call cases +%if %%num_initial_blocks <= 4 + vextracti32x4 %%T7, %%DAT0, %%num_initial_blocks - 1 +%elif %%num_initial_blocks <= 8 + vextracti32x4 %%T7, %%DAT1, %%num_initial_blocks - 5 +%elif %%num_initial_blocks <= 12 + vextracti32x4 %%T7, %%DAT2, %%num_initial_blocks - 9 +%else + vextracti32x4 %%T7, %%DAT3, %%num_initial_blocks - 13 +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Hash all but the last block of data +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;; update data offset +%if %%num_initial_blocks > 1 + ;; The final block of data may be <16B + add %%DATA_OFFSET, 16 * (%%num_initial_blocks - 1) + sub %%LENGTH, 16 * (%%num_initial_blocks - 1) +%endif + +%if %%num_initial_blocks < 16 + ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16. + ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256. + cmp %%LENGTH, 16 + jl %%_small_initial_partial_block + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Handle a full length final block - encrypt and hash all blocks +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + sub %%LENGTH, 16 + add %%DATA_OFFSET, 16 + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + + ;; Hash all of the data + + ;; ZT2 - incoming AAD hash (low 128bits) + ;; ZT12-ZT20 - temporary registers + GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \ + %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \ + %%ZT17, %%ZT18, %%ZT19, %%ZT20, \ + %%GH, %%GL, %%GM, \ + %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%num_initial_blocks + + jmp %%_small_initial_compute_done +%endif ; %if %%num_initial_blocks < 16 + +%%_small_initial_partial_block: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;;; Handle ghash for a <16B final block + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;; In this case if it's a single call to encrypt we can + ;; hash all of the data but if it's an init / update / finalize + ;; series of call we need to leave the last block if it's + ;; less than a full block of data. + + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + ;; %%T1 is ciphered counter block + vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%T1 + +%ifidn %%INSTANCE_TYPE, multi_call +%assign k (%%num_initial_blocks - 1) +%assign last_block_to_hash 1 +%else +%assign k (%%num_initial_blocks) +%assign last_block_to_hash 0 +%endif + +%if (%%num_initial_blocks > last_block_to_hash) + + ;; ZT12-ZT20 - temporary registers + GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \ + %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \ + %%ZT17, %%ZT18, %%ZT19, %%ZT20, \ + %%GH, %%GL, %%GM, \ + %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, k + + ;; just fall through no jmp needed +%else + ;; Record that a reduction is not needed - + ;; In this case no hashes are computed because there + ;; is only one initial block and it is < 16B in length. + ;; We only need to check if a reduction is needed if + ;; initial_blocks == 1 and init/update/final is being used. + ;; In this case we may just have a partial block, and that + ;; gets hashed in finalize. + +%assign need_for_reduction 1 +%ifidn %%GH, no_zmm +%ifidn %%GL, no_zmm +%ifidn %%GM, no_zmm +;; if %%GH, %%GL & %%GM not passed then reduction is not required +%assign need_for_reduction 0 +%endif +%endif +%endif + +%if need_for_reduction == 0 + ;; The hash should end up in HASH_IN_OUT. + ;; The only way we should get here is if there is + ;; a partial block of data, so xor that into the hash. + vpxorq %%HASH_IN_OUT, %%T2, %%T7 +%else + ;; right - here we have nothing to ghash in the small data but + ;; we have GHASH sums passed through that we need to gather and reduce + + ;; integrate TM into TH and TL + vpsrldq %%ZT12, %%GM, 8 + vpslldq %%ZT13, %%GM, 8 + vpxorq %%GH, %%GH, %%ZT12 + vpxorq %%GL, %%GL, %%ZT13 + + ;; add TH and TL 128-bit words horizontally + VHPXORI4x128 %%GH, %%ZT12 + VHPXORI4x128 %%GL, %%ZT13 + + ;; reduction + vmovdqa64 XWORD(%%ZT12), [rel POLY2] + VCLMUL_REDUCE %%HASH_IN_OUT, XWORD(%%ZT12), \ + XWORD(%%GH), XWORD(%%GL), XWORD(%%ZT13), XWORD(%%ZT14) + + vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7 +%endif + ;; The result is in %%HASH_IN_OUT + jmp %%_after_reduction +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; After GHASH reduction +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_small_initial_compute_done: + +%ifidn %%INSTANCE_TYPE, multi_call + ;; If using init/update/finalize, we need to xor any partial block data + ;; into the hash. +%if %%num_initial_blocks > 1 + ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place +%if %%num_initial_blocks != 16 + ;; NOTE: for %%num_initial_blocks = 16, %%LENGTH, stored in [PBlockLen] is never zero + or %%LENGTH, %%LENGTH + je %%_after_reduction +%endif ; %%num_initial_blocks != 16 + vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7 +%endif ; %%num_initial_blocks > 1 +%endif ; %%INSTANCE_TYPE, multi_call + +%%_after_reduction: + ;; Final hash is now in HASH_IN_OUT + +%endmacro ; INITIAL_BLOCKS_PARTIAL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Main GCM macro stitching cipher with GHASH +;;; - operates on single stream +;;; - encrypts 8 blocks at a time +;;; - ghash the 8 previously encrypted ciphertext blocks +;;; For partial block case and multi_call , AES_PARTIAL_BLOCK on output +;;; contains encrypted counter block. +%macro GHASH_8_ENCRYPT_8_PARALLEL 34-37 +%define %%GDATA %1 ; [in] key pointer +%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer +%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer +%define %%DATA_OFFSET %4 ; [in] data offset +%define %%CTR1 %5 ; [in/out] ZMM counter blocks 0 to 3 +%define %%CTR2 %6 ; [in/out] ZMM counter blocks 4 to 7 +%define %%GHASHIN_AESOUT_B03 %7 ; [in/out] ZMM ghash in / aes out blocks 0 to 3 +%define %%GHASHIN_AESOUT_B47 %8 ; [in/out] ZMM ghash in / aes out blocks 4 to 7 +%define %%AES_PARTIAL_BLOCK %9 ; [out] XMM partial block (AES) +%define %%loop_idx %10 ; [in] counter block prep selection "add+shuffle" or "add" +%define %%ENC_DEC %11 ; [in] cipher direction +%define %%FULL_PARTIAL %12 ; [in] last block type selection "full" or "partial" +%define %%IA0 %13 ; [clobbered] temporary GP register +%define %%IA1 %14 ; [clobbered] temporary GP register +%define %%LENGTH %15 ; [in] length +%define %%INSTANCE_TYPE %16 ; [in] 'single_call' or 'multi_call' selection +%define %%GH4KEY %17 ; [in] ZMM with GHASH keys 4 to 1 +%define %%GH8KEY %18 ; [in] ZMM with GHASH keys 8 to 5 +%define %%SHFMSK %19 ; [in] ZMM with byte swap mask for pshufb +%define %%ZT1 %20 ; [clobbered] temporary ZMM (cipher) +%define %%ZT2 %21 ; [clobbered] temporary ZMM (cipher) +%define %%ZT3 %22 ; [clobbered] temporary ZMM (cipher) +%define %%ZT4 %23 ; [clobbered] temporary ZMM (cipher) +%define %%ZT5 %24 ; [clobbered] temporary ZMM (cipher) +%define %%ZT10 %25 ; [clobbered] temporary ZMM (ghash) +%define %%ZT11 %26 ; [clobbered] temporary ZMM (ghash) +%define %%ZT12 %27 ; [clobbered] temporary ZMM (ghash) +%define %%ZT13 %28 ; [clobbered] temporary ZMM (ghash) +%define %%ZT14 %29 ; [clobbered] temporary ZMM (ghash) +%define %%ZT15 %30 ; [clobbered] temporary ZMM (ghash) +%define %%ZT16 %31 ; [clobbered] temporary ZMM (ghash) +%define %%ZT17 %32 ; [clobbered] temporary ZMM (ghash) +%define %%MASKREG %33 ; [clobbered] mask register for partial loads/stores +%define %%DO_REDUCTION %34 ; [in] "reduction", "no_reduction", "final_reduction" +%define %%TO_REDUCE_L %35 ; [in/out] ZMM for low 4x128-bit in case of "no_reduction" +%define %%TO_REDUCE_H %36 ; [in/out] ZMM for hi 4x128-bit in case of "no_reduction" +%define %%TO_REDUCE_M %37 ; [in/out] ZMM for medium 4x128-bit in case of "no_reduction" + +%define %%GH1H %%ZT10 +%define %%GH1L %%ZT11 +%define %%GH1M1 %%ZT12 +%define %%GH1M2 %%ZT13 + +%define %%GH2H %%ZT14 +%define %%GH2L %%ZT15 +%define %%GH2M1 %%ZT16 +%define %%GH2M2 %%ZT17 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; populate counter blocks for cipher part +%ifidn %%loop_idx, in_order + ;; %%CTR1 & %%CTR2 are shuffled outside the scope of this macro + ;; it has to be kept in unshuffled format + vpshufb %%ZT1, %%CTR1, %%SHFMSK + vpshufb %%ZT2, %%CTR2, %%SHFMSK +%else + vmovdqa64 %%ZT1, %%CTR1 + vmovdqa64 %%ZT2, %%CTR2 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; stitch AES rounds with GHASH + +%assign aes_round 0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 0 - ARK + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) + + ;;================================================== + ;; GHASH 4 blocks + vpclmulqdq %%GH1H, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x11 ; a1*b1 + vpclmulqdq %%GH1L, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x00 ; a0*b0 + vpclmulqdq %%GH1M1, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x01 ; a1*b0 + vpclmulqdq %%GH1M2, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x10 ; a0*b1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 3 AES rounds +%rep 3 + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; 3 x AES ROUND + + ;; ================================================= + ;; GHASH 4 blocks + vpclmulqdq %%GH2M1, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x10 ; a0*b1 + vpclmulqdq %%GH2M2, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x01 ; a1*b0 + vpclmulqdq %%GH2H, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x11 ; a1*b1 + vpclmulqdq %%GH2L, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x00 ; a0*b0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 3 AES rounds +%rep 3 + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; 3 x AES ROUND + + ;; ================================================= + ;; gather GHASH in GH1L (low) and GH1H (high) +%ifidn %%DO_REDUCTION, no_reduction + vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1 + vpternlogq %%TO_REDUCE_M, %%GH1M1, %%GH2M2, 0x96 ; TM: TO_REDUCE_M ^= GH1M1 ^ GH2M2 + vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH: TO_REDUCE_H ^= GH1H ^ GH2H + vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL: TO_REDUCE_L ^= GH1L ^ GH2L +%endif +%ifidn %%DO_REDUCTION, do_reduction + ;; phase 1: add mid products together + vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1 + vpxorq %%GH1M1, %%GH1M1, %%GH2M2 + + vpsrldq %%GH2M1, %%GH1M1, 8 + vpslldq %%GH1M1, %%GH1M1, 8 +%endif +%ifidn %%DO_REDUCTION, final_reduction + ;; phase 1: add mid products together + vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1 + vpternlogq %%GH1M1, %%TO_REDUCE_M, %%GH2M2, 0x96 ; TM: GH1M1 ^= TO_REDUCE_M ^ GH2M2 + + vpsrldq %%GH2M1, %%GH1M1, 8 + vpslldq %%GH1M1, %%GH1M1, 8 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 2 AES rounds +%rep 2 + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endrep ; 2 x AES ROUND + + ;; ================================================= + ;; Add mid product to high and low then + ;; horizontal xor of low and high 4x128 +%ifidn %%DO_REDUCTION, final_reduction + vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64 + vpxorq %%GH1H, %%TO_REDUCE_H + vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64 + vpxorq %%GH1L, %%TO_REDUCE_L +%endif +%ifidn %%DO_REDUCTION, do_reduction + vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64 + vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64 +%endif +%ifnidn %%DO_REDUCTION, no_reduction + VHPXORI4x128 %%GH1H, %%GH2H + VHPXORI4x128 %%GH1L, %%GH2L +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 2 AES rounds +%rep 2 +%if (aes_round < (NROUNDS + 1)) + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif ; aes_round < (NROUNDS + 1) +%endrep + + ;; ================================================= + ;; first phase of reduction +%ifnidn %%DO_REDUCTION, no_reduction + vmovdqu64 XWORD(%%GH2M2), [rel POLY2] + vpclmulqdq XWORD(%%ZT15), XWORD(%%GH2M2), XWORD(%%GH1L), 0x01 + vpslldq XWORD(%%ZT15), XWORD(%%ZT15), 8 ; shift-L 2 DWs + vpxorq XWORD(%%ZT15), XWORD(%%GH1L), XWORD(%%ZT15) ; first phase of the reduct +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; 2 AES rounds +%rep 2 +%if (aes_round < (NROUNDS + 1)) + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif ; aes_round < (NROUNDS + 1) +%endrep + + ;; ================================================= + ;; second phase of the reduction +%ifnidn %%DO_REDUCTION, no_reduction + vpclmulqdq XWORD(%%ZT16), XWORD(%%GH2M2), XWORD(%%ZT15), 0x00 + vpsrldq XWORD(%%ZT16), XWORD(%%ZT16), 4 ; shift-R 1-DW to obtain 2-DWs shift-R + + vpclmulqdq XWORD(%%ZT13), XWORD(%%GH2M2), XWORD(%%ZT15), 0x10 + vpslldq XWORD(%%ZT13), XWORD(%%ZT13), 4 ; shift-L 1-DW for result without shifts + ;; ZT13 = ZT13 xor ZT16 xor GH1H + vpternlogq XWORD(%%ZT13), XWORD(%%ZT16), XWORD(%%GH1H), 0x96 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; all remaining AES rounds but the last +%rep (NROUNDS + 2) +%if (aes_round < (NROUNDS + 1)) + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS +%assign aes_round (aes_round + 1) +%endif ; aes_round < (NROUNDS + 1) +%endrep + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; load/store mask (partial case) and load the text data +%ifidn %%FULL_PARTIAL, full + VX512LDR %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + VX512LDR %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64] +%else + lea %%IA0, [rel byte64_len_to_mask_table] + mov %%IA1, %%LENGTH + sub %%IA1, 64 + kmovq %%MASKREG, [%%IA0 + 8*%%IA1] + VX512LDR %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vmovdqu8 %%ZT5{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64] +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; the last AES round (NROUNDS + 1) and XOR against plain/cipher text + vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)] + ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \ + %%ZT3, aes_round, \ + %%ZT4, %%ZT5, no_zmm, no_zmm, \ + 8, NROUNDS + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; store the cipher/plain text data +%ifidn %%FULL_PARTIAL, full + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1 + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2 +%else + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64]{%%MASKREG}, %%ZT2 +%endif + + ;; ================================================= + ;; prep cipher text blocks for the next ghash round + +%ifnidn %%FULL_PARTIAL, full +%ifidn %%INSTANCE_TYPE, multi_call + ;; for partial block & multi_call we need encrypted counter block + vpxorq %%ZT3, %%ZT2, %%ZT5 + vextracti32x4 %%AES_PARTIAL_BLOCK, %%ZT3, 3 +%endif + ;; for GHASH computation purpose clear the top bytes of the partial block +%ifidn %%ENC_DEC, ENC + vmovdqu8 %%ZT2{%%MASKREG}{z}, %%ZT2 +%else + vmovdqu8 %%ZT5{%%MASKREG}{z}, %%ZT5 +%endif +%endif ; %ifnidn %%FULL_PARTIAL, full + + ;; ================================================= + ;; shuffle cipher text blocks for GHASH computation +%ifidn %%ENC_DEC, ENC + vpshufb %%GHASHIN_AESOUT_B03, %%ZT1, %%SHFMSK + vpshufb %%GHASHIN_AESOUT_B47, %%ZT2, %%SHFMSK +%else + vpshufb %%GHASHIN_AESOUT_B03, %%ZT4, %%SHFMSK + vpshufb %%GHASHIN_AESOUT_B47, %%ZT5, %%SHFMSK +%endif + +%ifidn %%DO_REDUCTION, do_reduction + ;; ================================================= + ;; XOR current GHASH value (ZT13) into block 0 + vpxorq %%GHASHIN_AESOUT_B03, %%ZT13 +%endif +%ifidn %%DO_REDUCTION, final_reduction + ;; ================================================= + ;; Return GHASH value (ZT13) in TO_REDUCE_L + vmovdqa64 %%TO_REDUCE_L, %%ZT13 +%endif + +%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Main GCM macro stitching cipher with GHASH +;;; - operates on single stream +;;; - encrypts 16 blocks at a time +;;; - ghash the 16 previously encrypted ciphertext blocks +;;; - no partial block or multi_call handling here +%macro GHASH_16_ENCRYPT_16_PARALLEL 42 +%define %%GDATA %1 ; [in] key pointer +%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer +%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer +%define %%DATA_OFFSET %4 ; [in] data offset +%define %%CTR_BE %5 ; [in/out] ZMM counter blocks (last 4) in big-endian +%define %%CTR_CHECK %6 ; [in/out] GP with 8-bit counter for overflow check +%define %%HASHKEY_OFFSET %7 ; [in] numerical offset for the highest hash key +%define %%AESOUT_BLK_OFFSET %8 ; [in] numerical offset for AES-CTR out +%define %%GHASHIN_BLK_OFFSET %9 ; [in] numerical offset for GHASH blocks in +%define %%SHFMSK %10 ; [in] ZMM with byte swap mask for pshufb +%define %%ZT1 %11 ; [clobbered] temporary ZMM (cipher) +%define %%ZT2 %12 ; [clobbered] temporary ZMM (cipher) +%define %%ZT3 %13 ; [clobbered] temporary ZMM (cipher) +%define %%ZT4 %14 ; [clobbered] temporary ZMM (cipher) +%define %%ZT5 %15 ; [clobbered/out] temporary ZMM or GHASH OUT (final_reduction) +%define %%ZT6 %16 ; [clobbered] temporary ZMM (cipher) +%define %%ZT7 %17 ; [clobbered] temporary ZMM (cipher) +%define %%ZT8 %18 ; [clobbered] temporary ZMM (cipher) +%define %%ZT9 %19 ; [clobbered] temporary ZMM (cipher) +%define %%ZT10 %20 ; [clobbered] temporary ZMM (ghash) +%define %%ZT11 %21 ; [clobbered] temporary ZMM (ghash) +%define %%ZT12 %22 ; [clobbered] temporary ZMM (ghash) +%define %%ZT13 %23 ; [clobbered] temporary ZMM (ghash) +%define %%ZT14 %24 ; [clobbered] temporary ZMM (ghash) +%define %%ZT15 %25 ; [clobbered] temporary ZMM (ghash) +%define %%ZT16 %26 ; [clobbered] temporary ZMM (ghash) +%define %%ZT17 %27 ; [clobbered] temporary ZMM (ghash) +%define %%ZT18 %28 ; [clobbered] temporary ZMM (ghash) +%define %%ZT19 %29 ; [clobbered] temporary ZMM +%define %%ZT20 %30 ; [clobbered] temporary ZMM +%define %%ZT21 %31 ; [clobbered] temporary ZMM +%define %%ZT22 %32 ; [clobbered] temporary ZMM +%define %%ZT23 %33 ; [clobbered] temporary ZMM +%define %%ADDBE_4x4 %34 ; [in] ZMM with 4x128bits 4 in big-endian +%define %%ADDBE_1234 %35 ; [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian +%define %%TO_REDUCE_L %36 ; [in/out] ZMM for low 4x128-bit GHASH sum +%define %%TO_REDUCE_H %37 ; [in/out] ZMM for hi 4x128-bit GHASH sum +%define %%TO_REDUCE_M %38 ; [in/out] ZMM for medium 4x128-bit GHASH sum +%define %%DO_REDUCTION %39 ; [in] "no_reduction", "final_reduction", "first_time" +%define %%ENC_DEC %40 ; [in] cipher direction +%define %%DATA_DISPL %41 ; [in] fixed numerical data displacement/offset +%define %%GHASH_IN %42 ; [in] current GHASH value or "no_ghash_in" + +%define %%B00_03 %%ZT1 +%define %%B04_07 %%ZT2 +%define %%B08_11 %%ZT3 +%define %%B12_15 %%ZT4 + +%define %%GH1H %%ZT5 ; @note: do not change this mapping +%define %%GH1L %%ZT6 +%define %%GH1M %%ZT7 +%define %%GH1T %%ZT8 + +%define %%GH2H %%ZT9 +%define %%GH2L %%ZT10 +%define %%GH2M %%ZT11 +%define %%GH2T %%ZT12 + +%define %%RED_POLY %%GH2T +%define %%RED_P1 %%GH2L +%define %%RED_T1 %%GH2H +%define %%RED_T2 %%GH2M + +%define %%GH3H %%ZT13 +%define %%GH3L %%ZT14 +%define %%GH3M %%ZT15 +%define %%GH3T %%ZT16 + +%define %%DATA1 %%ZT13 +%define %%DATA2 %%ZT14 +%define %%DATA3 %%ZT15 +%define %%DATA4 %%ZT16 + +%define %%AESKEY1 %%ZT17 +%define %%AESKEY2 %%ZT18 + +%define %%GHKEY1 %%ZT19 +%define %%GHKEY2 %%ZT20 +%define %%GHDAT1 %%ZT21 +%define %%GHDAT2 %%ZT22 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; prepare counter blocks + + cmp BYTE(%%CTR_CHECK), (256 - 16) + jae %%_16_blocks_overflow + vpaddd %%B00_03, %%CTR_BE, %%ADDBE_1234 + vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4 + vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4 + vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4 + jmp %%_16_blocks_ok +%%_16_blocks_overflow: + vpshufb %%CTR_BE, %%CTR_BE, %%SHFMSK + vmovdqa64 %%B12_15, [rel ddq_add_4444] + vpaddd %%B00_03, %%CTR_BE, [rel ddq_add_1234] + vpaddd %%B04_07, %%B00_03, %%B12_15 + vpaddd %%B08_11, %%B04_07, %%B12_15 + vpaddd %%B12_15, %%B08_11, %%B12_15 + vpshufb %%B00_03, %%SHFMSK + vpshufb %%B04_07, %%SHFMSK + vpshufb %%B08_11, %%SHFMSK + vpshufb %%B12_15, %%SHFMSK +%%_16_blocks_ok: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; pre-load constants + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 0)] +%ifnidn %%GHASH_IN, no_ghash_in + vpxorq %%GHDAT1, %%GHASH_IN, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)] +%else + vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)] +%endif + vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (0*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; save counter for the next round + ;; increment counter overflow check register + vshufi64x2 %%CTR_BE, %%B12_15, %%B12_15, 1111_1111b + add BYTE(%%CTR_CHECK), 16 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; pre-load constants + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 1)] + vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (1*64)] + vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (1*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; stitch AES rounds with GHASH + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 0 - ARK + + vpxorq %%B00_03, %%AESKEY1 + vpxorq %%B04_07, %%AESKEY1 + vpxorq %%B08_11, %%AESKEY1 + vpxorq %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 2)] + + ;;================================================== + ;; GHASH 4 blocks (15 to 12) + vpclmulqdq %%GH1H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1 + vpclmulqdq %%GH1L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0 + vpclmulqdq %%GH1M, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0 + vpclmulqdq %%GH1T, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1 + + vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (2*64)] + vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (2*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 1 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 3)] + + ;; ================================================= + ;; GHASH 4 blocks (11 to 8) + vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1 + vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0 + vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1 + vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0 + + vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (3*64)] + vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (3*64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 2 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 4)] + + ;; ================================================= + ;; GHASH 4 blocks (7 to 4) + vpclmulqdq %%GH3M, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1 + vpclmulqdq %%GH3T, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0 + vpclmulqdq %%GH3H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1 + vpclmulqdq %%GH3L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds 3 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 5)] + + ;; ================================================= + ;; Gather (XOR) GHASH for 12 blocks + vpternlogq %%GH1H, %%GH2H, %%GH3H, 0x96 + vpternlogq %%GH1L, %%GH2L, %%GH3L, 0x96 + vpternlogq %%GH1T, %%GH2T, %%GH3T, 0x96 + vpternlogq %%GH1M, %%GH2M, %%GH3M, 0x96 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds 4 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 6)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; load plain/cipher text (recycle GH3xx registers) + VX512LDR %%DATA1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)] + VX512LDR %%DATA2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)] + VX512LDR %%DATA3, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)] + VX512LDR %%DATA4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds 5 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 7)] + + ;; ================================================= + ;; GHASH 4 blocks (3 to 0) + vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1 + vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0 + vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1 + vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 6 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 8)] + + ;; ================================================= + ;; gather GHASH in GH1L (low) and GH1H (high) +%ifidn %%DO_REDUCTION, first_time + vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM + vpxorq %%TO_REDUCE_M, %%GH1M, %%GH2M ; TM + vpxorq %%TO_REDUCE_H, %%GH1H, %%GH2H ; TH + vpxorq %%TO_REDUCE_L, %%GH1L, %%GH2L ; TL +%endif +%ifidn %%DO_REDUCTION, no_reduction + vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM + vpternlogq %%TO_REDUCE_M, %%GH1M, %%GH2M, 0x96 ; TM + vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH + vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL +%endif +%ifidn %%DO_REDUCTION, final_reduction + ;; phase 1: add mid products together + ;; also load polynomial constant for reduction + vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM + vpternlogq %%GH1M, %%TO_REDUCE_M, %%GH2M, 0x96 + + vpsrldq %%GH2M, %%GH1M, 8 + vpslldq %%GH1M, %%GH1M, 8 + + vmovdqa64 XWORD(%%RED_POLY), [rel POLY2] +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 7 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 9)] + + ;; ================================================= + ;; Add mid product to high and low +%ifidn %%DO_REDUCTION, final_reduction + vpternlogq %%GH1H, %%GH2H, %%GH2M, 0x96 ; TH = TH1 + TH2 + TM>>64 + vpxorq %%GH1H, %%TO_REDUCE_H + vpternlogq %%GH1L, %%GH2L, %%GH1M, 0x96 ; TL = TL1 + TL2 + TM<<64 + vpxorq %%GH1L, %%TO_REDUCE_L +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 8 + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 10)] + + ;; ================================================= + ;; horizontal xor of low and high 4x128 +%ifidn %%DO_REDUCTION, final_reduction + VHPXORI4x128 %%GH1H, %%GH2H + VHPXORI4x128 %%GH1L, %%GH2L +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES round 9 + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 +%if (NROUNDS >= 11) + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 11)] +%endif + ;; ================================================= + ;; first phase of reduction +%ifidn %%DO_REDUCTION, final_reduction + vpclmulqdq XWORD(%%RED_P1), XWORD(%%RED_POLY), XWORD(%%GH1L), 0x01 + vpslldq XWORD(%%RED_P1), XWORD(%%RED_P1), 8 ; shift-L 2 DWs + vpxorq XWORD(%%RED_P1), XWORD(%%GH1L), XWORD(%%RED_P1) ; first phase of the reduct +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; AES rounds up to 11 (AES192) or 13 (AES256) + ;; AES128 is done +%if (NROUNDS >= 11) + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 12)] + + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 +%if (NROUNDS == 13) + vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 13)] + + vaesenc %%B00_03, %%B00_03, %%AESKEY1 + vaesenc %%B04_07, %%B04_07, %%AESKEY1 + vaesenc %%B08_11, %%B08_11, %%AESKEY1 + vaesenc %%B12_15, %%B12_15, %%AESKEY1 + vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 14)] + + vaesenc %%B00_03, %%B00_03, %%AESKEY2 + vaesenc %%B04_07, %%B04_07, %%AESKEY2 + vaesenc %%B08_11, %%B08_11, %%AESKEY2 + vaesenc %%B12_15, %%B12_15, %%AESKEY2 +%endif ; GCM256 / NROUNDS = 13 (15 including the first and the last) +%endif ; GCM192 / NROUNDS = 11 (13 including the first and the last) + + ;; ================================================= + ;; second phase of the reduction +%ifidn %%DO_REDUCTION, final_reduction + vpclmulqdq XWORD(%%RED_T1), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x00 + vpsrldq XWORD(%%RED_T1), XWORD(%%RED_T1), 4 ; shift-R 1-DW to obtain 2-DWs shift-R + + vpclmulqdq XWORD(%%RED_T2), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x10 + vpslldq XWORD(%%RED_T2), XWORD(%%RED_T2), 4 ; shift-L 1-DW for result without shifts + ;; GH1H = GH1H x RED_T1 x RED_T2 + vpternlogq XWORD(%%GH1H), XWORD(%%RED_T2), XWORD(%%RED_T1), 0x96 +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; the last AES round + vaesenclast %%B00_03, %%B00_03, %%AESKEY1 + vaesenclast %%B04_07, %%B04_07, %%AESKEY1 + vaesenclast %%B08_11, %%B08_11, %%AESKEY1 + vaesenclast %%B12_15, %%B12_15, %%AESKEY1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; XOR against plain/cipher text + vpxorq %%B00_03, %%B00_03, %%DATA1 + vpxorq %%B04_07, %%B04_07, %%DATA2 + vpxorq %%B08_11, %%B08_11, %%DATA3 + vpxorq %%B12_15, %%B12_15, %%DATA4 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; store cipher/plain text + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)], %%B00_03 + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)], %%B04_07 + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)], %%B08_11 + VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)], %%B12_15 + + ;; ================================================= + ;; shuffle cipher text blocks for GHASH computation +%ifidn %%ENC_DEC, ENC + vpshufb %%B00_03, %%B00_03, %%SHFMSK + vpshufb %%B04_07, %%B04_07, %%SHFMSK + vpshufb %%B08_11, %%B08_11, %%SHFMSK + vpshufb %%B12_15, %%B12_15, %%SHFMSK +%else + vpshufb %%B00_03, %%DATA1, %%SHFMSK + vpshufb %%B04_07, %%DATA2, %%SHFMSK + vpshufb %%B08_11, %%DATA3, %%SHFMSK + vpshufb %%B12_15, %%DATA4, %%SHFMSK +%endif + + ;; ================================================= + ;; store shuffled cipher text for ghashing + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (0*64)], %%B00_03 + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (1*64)], %%B04_07 + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (2*64)], %%B08_11 + vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (3*64)], %%B12_15 + +%ifidn %%DO_REDUCTION, final_reduction + ;; ================================================= + ;; Return GHASH value through %%GH1H +%endif + +%endmacro ; GHASH_16_ENCRYPT_16_PARALLEL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; GHASH the last 8 ciphertext blocks. +;;; - optionally accepts GHASH product sums as input +%macro GHASH_LAST_8 10-13 +%define %%GDATA %1 ; [in] key pointer +%define %%BL47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7 +%define %%BL03 %3 ; [in/cloberred] ZMM AES blocks 0 to 3 +%define %%ZTH %4 ; [cloberred] ZMM temporary +%define %%ZTM %5 ; [cloberred] ZMM temporary +%define %%ZTL %6 ; [cloberred] ZMM temporary +%define %%ZT01 %7 ; [cloberred] ZMM temporary +%define %%ZT02 %8 ; [cloberred] ZMM temporary +%define %%ZT03 %9 ; [cloberred] ZMM temporary +%define %%AAD_HASH %10 ; [out] XMM hash value +%define %%GH %11 ; [in/optional] ZMM with GHASH high product sum +%define %%GL %12 ; [in/optional] ZMM with GHASH low product sum +%define %%GM %13 ; [in/optional] ZMM with GHASH mid product sum + + VCLMUL_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZTH, %%ZTM, %%ZTL + +%if %0 > 10 + ;; add optional sums before step2 + vpxorq %%ZTH, %%ZTH, %%GH + vpxorq %%ZTL, %%ZTL, %%GL + vpxorq %%ZTM, %%ZTM, %%GM +%endif + + VCLMUL_STEP2 %%GDATA, %%BL47, %%BL03, %%ZT01, %%ZT02, %%ZT03, %%ZTH, %%ZTM, %%ZTL + + vmovdqa64 XWORD(%%ZT03), [rel POLY2] + VCLMUL_REDUCE %%AAD_HASH, XWORD(%%ZT03), XWORD(%%BL47), XWORD(%%BL03), \ + XWORD(%%ZT01), XWORD(%%ZT02) +%endmacro ; GHASH_LAST_8 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; GHASH the last 7 cipher text blocks. +;;; - it uses same GHASH macros as GHASH_LAST_8 but with some twist +;;; - it loads GHASH keys for each of the data blocks, so that: +;;; - blocks 4, 5 and 6 will use GHASH keys 3, 2, 1 respectively +;;; - code ensures that unused block 7 and corresponding GHASH key are zeroed +;;; (clmul product is zero this way and will not affect the result) +;;; - blocks 0, 1, 2 and 3 will use USE GHASH keys 7, 6, 5 and 4 respectively +;;; - optionally accepts GHASH product sums as input +%macro GHASH_LAST_7 13-16 +%define %%GDATA %1 ; [in] key pointer +%define %%BL47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7 +%define %%BL03 %3 ; [in/cloberred] ZMM AES blocks 0 to 3 +%define %%ZTH %4 ; [cloberred] ZMM temporary +%define %%ZTM %5 ; [cloberred] ZMM temporary +%define %%ZTL %6 ; [cloberred] ZMM temporary +%define %%ZT01 %7 ; [cloberred] ZMM temporary +%define %%ZT02 %8 ; [cloberred] ZMM temporary +%define %%ZT03 %9 ; [cloberred] ZMM temporary +%define %%ZT04 %10 ; [cloberred] ZMM temporary +%define %%AAD_HASH %11 ; [out] XMM hash value +%define %%MASKREG %12 ; [clobbered] mask register to use for loads +%define %%IA0 %13 ; [clobbered] GP temporary register +%define %%GH %14 ; [in/optional] ZMM with GHASH high product sum +%define %%GL %15 ; [in/optional] ZMM with GHASH low product sum +%define %%GM %16 ; [in/optional] ZMM with GHASH mid product sum + + vmovdqa64 XWORD(%%ZT04), [rel POLY2] + + VCLMUL_1_TO_8_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZT02, %%ZTH, %%ZTM, %%ZTL, 7 + +%if %0 > 13 + ;; add optional sums before step2 + vpxorq %%ZTH, %%ZTH, %%GH + vpxorq %%ZTL, %%ZTL, %%GL + vpxorq %%ZTM, %%ZTM, %%GM +%endif + + VCLMUL_1_TO_8_STEP2 %%GDATA, %%BL47, %%BL03, \ + %%ZT01, %%ZT02, %%ZT03, \ + %%ZTH, %%ZTM, %%ZTL, 7 + + VCLMUL_REDUCE %%AAD_HASH, XWORD(%%ZT04), XWORD(%%BL47), XWORD(%%BL03), \ + XWORD(%%ZT01), XWORD(%%ZT02) +%endmacro ; GHASH_LAST_7 + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Encryption of a single block +%macro ENCRYPT_SINGLE_BLOCK 2 +%define %%GDATA %1 +%define %%XMM0 %2 + + vpxorq %%XMM0, %%XMM0, [%%GDATA+16*0] +%assign i 1 +%rep NROUNDS + vaesenc %%XMM0, [%%GDATA+16*i] +%assign i (i+1) +%endrep + vaesenclast %%XMM0, [%%GDATA+16*i] +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Save register content for the caller +%macro FUNC_SAVE 0 + ;; Required for Update/GMC_ENC + ;the number of pushes must equal STACK_OFFSET + mov rax, rsp + + sub rsp, STACK_FRAME_SIZE + and rsp, ~63 + + mov [rsp + STACK_GP_OFFSET + 0*8], r12 + mov [rsp + STACK_GP_OFFSET + 1*8], r13 + mov [rsp + STACK_GP_OFFSET + 2*8], r14 + mov [rsp + STACK_GP_OFFSET + 3*8], r15 + mov [rsp + STACK_GP_OFFSET + 4*8], rax ; stack + mov r14, rax ; r14 is used to retrieve stack args + mov [rsp + STACK_GP_OFFSET + 5*8], rbp + mov [rsp + STACK_GP_OFFSET + 6*8], rbx +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + STACK_GP_OFFSET + 7*8], rdi + mov [rsp + STACK_GP_OFFSET + 8*8], rsi +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + vmovdqu [rsp + STACK_XMM_OFFSET + 0*16], xmm6 + vmovdqu [rsp + STACK_XMM_OFFSET + 1*16], xmm7 + vmovdqu [rsp + STACK_XMM_OFFSET + 2*16], xmm8 + vmovdqu [rsp + STACK_XMM_OFFSET + 3*16], xmm9 + vmovdqu [rsp + STACK_XMM_OFFSET + 4*16], xmm10 + vmovdqu [rsp + STACK_XMM_OFFSET + 5*16], xmm11 + vmovdqu [rsp + STACK_XMM_OFFSET + 6*16], xmm12 + vmovdqu [rsp + STACK_XMM_OFFSET + 7*16], xmm13 + vmovdqu [rsp + STACK_XMM_OFFSET + 8*16], xmm14 + vmovdqu [rsp + STACK_XMM_OFFSET + 9*16], xmm15 +%endif +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Restore register content for the caller +%macro FUNC_RESTORE 0 + +%ifdef SAFE_DATA + clear_scratch_gps_asm + clear_scratch_zmms_asm +%else + vzeroupper +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + STACK_XMM_OFFSET + 9*16] + vmovdqu xmm14, [rsp + STACK_XMM_OFFSET + 8*16] + vmovdqu xmm13, [rsp + STACK_XMM_OFFSET + 7*16] + vmovdqu xmm12, [rsp + STACK_XMM_OFFSET + 6*16] + vmovdqu xmm11, [rsp + STACK_XMM_OFFSET + 5*16] + vmovdqu xmm10, [rsp + STACK_XMM_OFFSET + 4*16] + vmovdqu xmm9, [rsp + STACK_XMM_OFFSET + 3*16] + vmovdqu xmm8, [rsp + STACK_XMM_OFFSET + 2*16] + vmovdqu xmm7, [rsp + STACK_XMM_OFFSET + 1*16] + vmovdqu xmm6, [rsp + STACK_XMM_OFFSET + 0*16] +%endif + + ;; Required for Update/GMC_ENC + mov rbp, [rsp + STACK_GP_OFFSET + 5*8] + mov rbx, [rsp + STACK_GP_OFFSET + 6*8] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [rsp + STACK_GP_OFFSET + 7*8] + mov rsi, [rsp + STACK_GP_OFFSET + 8*8] +%endif + mov r12, [rsp + STACK_GP_OFFSET + 0*8] + mov r13, [rsp + STACK_GP_OFFSET + 1*8] + mov r14, [rsp + STACK_GP_OFFSET + 2*8] + mov r15, [rsp + STACK_GP_OFFSET + 3*8] + mov rsp, [rsp + STACK_GP_OFFSET + 4*8] ; stack +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. +;;; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, +;;; Additional Authentication data (A_IN), Additional Data length (A_LEN). +;;; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA_CTX. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_INIT 21 +%define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer +%define %%GDATA_CTX %2 ; [in] GCM context pointer +%define %%IV %3 ; [in] IV pointer +%define %%A_IN %4 ; [in] AAD pointer +%define %%A_LEN %5 ; [in] AAD length in bytes +%define %%GPR1 %6 ; [clobbered] GP register +%define %%GPR2 %7 ; [clobbered] GP register +%define %%GPR3 %8 ; [clobbered] GP register +%define %%MASKREG %9 ; [clobbered] mask register +%define %%AAD_HASH %10 ; [out] XMM for AAD_HASH value (xmm14) +%define %%CUR_COUNT %11 ; [out] XMM with current counter (xmm2) +%define %%ZT0 %12 ; [clobbered] ZMM register +%define %%ZT1 %13 ; [clobbered] ZMM register +%define %%ZT2 %14 ; [clobbered] ZMM register +%define %%ZT3 %15 ; [clobbered] ZMM register +%define %%ZT4 %16 ; [clobbered] ZMM register +%define %%ZT5 %17 ; [clobbered] ZMM register +%define %%ZT6 %18 ; [clobbered] ZMM register +%define %%ZT7 %19 ; [clobbered] ZMM register +%define %%ZT8 %20 ; [clobbered] ZMM register +%define %%ZT9 %21 ; [clobbered] ZMM register + + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, \ + %%GPR1, %%GPR2, %%GPR3, %%MASKREG + + mov %%GPR1, %%A_LEN + vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx.aad hash = aad_hash + mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx.aad_length = aad_length + + xor %%GPR1, %%GPR1 + mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx.in_length = 0 + mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx.partial_block_length = 0 + + ;; read 12 IV bytes and pad with 0x00000001 + vmovdqu8 %%CUR_COUNT, [rel ONEf] + mov %%GPR2, %%IV + mov %%GPR1, 0x0000_0000_0000_0fff + kmovq %%MASKREG, %%GPR1 + vmovdqu8 %%CUR_COUNT{%%MASKREG}, [%%GPR2] ; ctr = IV | 0x1 + + vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv + + ;; store IV as counter in LE format + vpshufb %%CUR_COUNT, [rel SHUF_MASK] + vmovdqu [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Cipher and ghash of payloads shorter than 256 bytes +;;; - number of blocks in the message comes as argument +;;; - depending on the number of blocks an optimized variant of +;;; INITIAL_BLOCKS_PARTIAL is invoked +%macro GCM_ENC_DEC_SMALL 42 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer +%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length +%define %%ENC_DEC %6 ; [in] cipher direction +%define %%DATA_OFFSET %7 ; [in] data offset +%define %%LENGTH %8 ; [in] data length +%define %%NUM_BLOCKS %9 ; [in] number of blocks to process 1 to 16 +%define %%CTR %10 ; [in/out] XMM counter block +%define %%HASH_IN_OUT %11 ; [in/out] XMM GHASH value +%define %%INSTANCE_TYPE %12 ; [in] single or multi call +%define %%ZTMP0 %13 ; [clobbered] ZMM register +%define %%ZTMP1 %14 ; [clobbered] ZMM register +%define %%ZTMP2 %15 ; [clobbered] ZMM register +%define %%ZTMP3 %16 ; [clobbered] ZMM register +%define %%ZTMP4 %17 ; [clobbered] ZMM register +%define %%ZTMP5 %18 ; [clobbered] ZMM register +%define %%ZTMP6 %19 ; [clobbered] ZMM register +%define %%ZTMP7 %20 ; [clobbered] ZMM register +%define %%ZTMP8 %21 ; [clobbered] ZMM register +%define %%ZTMP9 %22 ; [clobbered] ZMM register +%define %%ZTMP10 %23 ; [clobbered] ZMM register +%define %%ZTMP11 %24 ; [clobbered] ZMM register +%define %%ZTMP12 %25 ; [clobbered] ZMM register +%define %%ZTMP13 %26 ; [clobbered] ZMM register +%define %%ZTMP14 %27 ; [clobbered] ZMM register +%define %%ZTMP15 %28 ; [clobbered] ZMM register +%define %%ZTMP16 %29 ; [clobbered] ZMM register +%define %%ZTMP17 %30 ; [clobbered] ZMM register +%define %%ZTMP18 %31 ; [clobbered] ZMM register +%define %%ZTMP19 %32 ; [clobbered] ZMM register +%define %%ZTMP20 %33 ; [clobbered] ZMM register +%define %%ZTMP21 %34 ; [clobbered] ZMM register +%define %%ZTMP22 %35 ; [clobbered] ZMM register +%define %%GH %36 ; [in] ZMM ghash sum (high) +%define %%GL %37 ; [in] ZMM ghash sum (low) +%define %%GM %38 ; [in] ZMM ghash sum (middle) +%define %%IA0 %39 ; [clobbered] GP register +%define %%IA1 %40 ; [clobbered] GP register +%define %%MASKREG %41 ; [clobbered] mask register +%define %%SHUFMASK %42 ; [in] ZMM with BE/LE shuffle mask + + cmp %%NUM_BLOCKS, 8 + je %%_small_initial_num_blocks_is_8 + jl %%_small_initial_num_blocks_is_7_1 + + + cmp %%NUM_BLOCKS, 12 + je %%_small_initial_num_blocks_is_12 + jl %%_small_initial_num_blocks_is_11_9 + + ;; 16, 15, 14 or 13 + cmp %%NUM_BLOCKS, 16 + je %%_small_initial_num_blocks_is_16 + cmp %%NUM_BLOCKS, 15 + je %%_small_initial_num_blocks_is_15 + cmp %%NUM_BLOCKS, 14 + je %%_small_initial_num_blocks_is_14 + jmp %%_small_initial_num_blocks_is_13 + +%%_small_initial_num_blocks_is_11_9: + ;; 11, 10 or 9 + cmp %%NUM_BLOCKS, 11 + je %%_small_initial_num_blocks_is_11 + cmp %%NUM_BLOCKS, 10 + je %%_small_initial_num_blocks_is_10 + jmp %%_small_initial_num_blocks_is_9 + +%%_small_initial_num_blocks_is_7_1: + cmp %%NUM_BLOCKS, 4 + je %%_small_initial_num_blocks_is_4 + jl %%_small_initial_num_blocks_is_3_1 + ;; 7, 6 or 5 + cmp %%NUM_BLOCKS, 7 + je %%_small_initial_num_blocks_is_7 + cmp %%NUM_BLOCKS, 6 + je %%_small_initial_num_blocks_is_6 + jmp %%_small_initial_num_blocks_is_5 + +%%_small_initial_num_blocks_is_3_1: + ;; 3, 2 or 1 + cmp %%NUM_BLOCKS, 3 + je %%_small_initial_num_blocks_is_3 + cmp %%NUM_BLOCKS, 2 + je %%_small_initial_num_blocks_is_2 + + ;; for %%NUM_BLOCKS == 1, just fall through and no 'jmp' needed + + ;; Use rep to generate different block size variants + ;; - one block size has to be the first one +%assign num_blocks 1 +%rep 16 +%%_small_initial_num_blocks_is_ %+ num_blocks : + INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, num_blocks, \ + %%CTR, %%HASH_IN_OUT, %%ENC_DEC, %%INSTANCE_TYPE, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \ + %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \ + %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFMASK +%if num_blocks != 16 + jmp %%_small_initial_blocks_encrypted +%endif +%assign num_blocks (num_blocks + 1) +%endrep + +%%_small_initial_blocks_encrypted: + +%endmacro ; GCM_ENC_DEC_SMALL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct +; has been initialized by GCM_INIT +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC). +; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and zmm0-zmm31, k1 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 7 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer pointer +%define %%PLAIN_CYPH_IN %4 ; [in] input buffer pointer +%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length +%define %%ENC_DEC %6 ; [in] cipher direction +%define %%INSTANCE_TYPE %7 ; [in] 'single_call' or 'multi_call' selection + +%define %%IA0 r10 +%define %%IA1 r12 +%define %%IA2 r13 +%define %%IA3 r15 +%define %%IA4 r11 +%define %%IA5 rax + +%define %%LENGTH %%IA2 +%define %%CTR_CHECK %%IA3 +%define %%DATA_OFFSET %%IA4 + +%define %%HASHK_PTR %%IA5 + +%define %%GCM_INIT_CTR_BLOCK xmm2 ; hardcoded in GCM_INIT for now + +%define %%AES_PARTIAL_BLOCK xmm8 +%define %%CTR_BLOCK2z zmm18 +%define %%CTR_BLOCKz zmm9 +%define %%CTR_BLOCKx xmm9 +%define %%AAD_HASHz zmm14 +%define %%AAD_HASHx xmm14 + +;;; ZTMP0 - ZTMP12 - used in by8 code, by128/48 code and GCM_ENC_DEC_SMALL +%define %%ZTMP0 zmm0 +%define %%ZTMP1 zmm3 +%define %%ZTMP2 zmm4 +%define %%ZTMP3 zmm5 +%define %%ZTMP4 zmm6 +%define %%ZTMP5 zmm7 +%define %%ZTMP6 zmm10 +%define %%ZTMP7 zmm11 +%define %%ZTMP8 zmm12 +%define %%ZTMP9 zmm13 +%define %%ZTMP10 zmm15 +%define %%ZTMP11 zmm16 +%define %%ZTMP12 zmm17 + +;;; ZTMP13 - ZTMP22 - used in by128/48 code and GCM_ENC_DEC_SMALL +;;; - some used by8 code as well through TMPxy names +%define %%ZTMP13 zmm19 +%define %%ZTMP14 zmm20 +%define %%ZTMP15 zmm21 +%define %%ZTMP16 zmm30 ; can be used in very/big_loop part +%define %%ZTMP17 zmm31 ; can be used in very/big_loop part +%define %%ZTMP18 zmm1 +%define %%ZTMP19 zmm2 +%define %%ZTMP20 zmm8 +%define %%ZTMP21 zmm22 +%define %%ZTMP22 zmm23 + +;;; Free to use: zmm24 - zmm29 +;;; - used by by128/48 and by8 +%define %%GH zmm24 +%define %%GL zmm25 +%define %%GM zmm26 +%define %%SHUF_MASK zmm29 +%define %%CTR_BLOCK_SAVE zmm28 + +;;; - used by by128/48 code only +%define %%ADDBE_4x4 zmm27 +%define %%ADDBE_1234 zmm28 ; conflicts with CTR_BLOCK_SAVE + +;; used by8 code only +%define %%GH4KEY %%ZTMP17 +%define %%GH8KEY %%ZTMP16 +%define %%BLK0 %%ZTMP18 +%define %%BLK1 %%ZTMP19 +%define %%ADD8BE zmm27 +%define %%ADD8LE %%ZTMP13 + +%define %%MASKREG k1 + +%ifdef GCM_BIG_DATA +;; reduction every 128 blocks, depth 32 blocks +;; @note 128 blocks is the maximum capacity of the stack frame when +;; GCM_BIG_DATA is defined +%assign very_big_loop_nblocks 128 +%assign very_big_loop_depth 32 +%endif + +;; reduction every 48 blocks, depth 32 blocks +;; @note 48 blocks is the maximum capacity of the stack frame when +;; GCM_BIG_DATA is not defined +%assign big_loop_nblocks 48 +%assign big_loop_depth 32 + +;;; Macro flow: +;;; - for message size bigger than very_big_loop_nblocks process data +;;; with "very_big_loop" parameters +;;; - for message size bigger than big_loop_nblocks process data +;;; with "big_loop" parameters +;;; - calculate the number of 16byte blocks in the message +;;; - process (number of 16byte blocks) mod 8 +;;; '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +;;; - process 8 16 byte blocks at a time until all are done in %%_encrypt_by_8_new + +%ifidn __OUTPUT_FORMAT__, win64 + cmp %%PLAIN_CYPH_LEN, 0 +%else + or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN +%endif + je %%_enc_dec_done + + xor %%DATA_OFFSET, %%DATA_OFFSET + + ;; Update length of data processed +%ifidn __OUTPUT_FORMAT__, win64 + mov %%IA0, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + InLen], %%IA0 +%else + add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN +%endif + vmovdqu64 %%AAD_HASHx, [%%GDATA_CTX + AadHash] + +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: partial block processing makes only sense for multi_call here. + ;; Used for the update flow - if there was a previous partial + ;; block fill the remaining bytes here. + PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%PLAIN_CYPH_LEN, %%DATA_OFFSET, %%AAD_HASHx, %%ENC_DEC, \ + %%IA0, %%IA1, %%IA2, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%MASKREG +%endif + + ;; lift counter block from GCM_INIT to here +%ifidn %%INSTANCE_TYPE, single_call + vmovdqu64 %%CTR_BLOCKx, %%GCM_INIT_CTR_BLOCK +%else + vmovdqu64 %%CTR_BLOCKx, [%%GDATA_CTX + CurCount] +%endif + + ;; Save the amount of data left to process in %%LENGTH + mov %%LENGTH, %%PLAIN_CYPH_LEN +%ifidn %%INSTANCE_TYPE, multi_call + ;; NOTE: %%DATA_OFFSET is zero in single_call case. + ;; Consequently PLAIN_CYPH_LEN will never be zero after + ;; %%DATA_OFFSET subtraction below. + ;; There may be no more data if it was consumed in the partial block. + sub %%LENGTH, %%DATA_OFFSET + je %%_enc_dec_done +%endif ; %%INSTANCE_TYPE, multi_call + + vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK] + vmovdqa64 %%ADDBE_4x4, [rel ddq_addbe_4444] + +%ifdef GCM_BIG_DATA + vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234] + + cmp %%LENGTH, (very_big_loop_nblocks * 16) + jl %%_message_below_very_big_nblocks + + INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%SHUF_MASK, %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth + + sub %%LENGTH, (very_big_loop_nblocks * 16) + cmp %%LENGTH, (very_big_loop_nblocks * 16) + jl %%_no_more_very_big_nblocks + +%%_encrypt_very_big_nblocks: + GHASH_ENCRYPT_Nx16_PARALLEL \ + %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%CTR_BLOCKz, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \ + %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth, %%CTR_CHECK + + sub %%LENGTH, (very_big_loop_nblocks * 16) + cmp %%LENGTH, (very_big_loop_nblocks * 16) + jge %%_encrypt_very_big_nblocks + +%%_no_more_very_big_nblocks: + vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK) + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%GH, %%GL, %%GM, very_big_loop_nblocks, very_big_loop_depth + + or %%LENGTH, %%LENGTH + jz %%_ghash_done + +%%_message_below_very_big_nblocks: +%endif ; GCM_BIG_DATA + + cmp %%LENGTH, (big_loop_nblocks * 16) + jl %%_message_below_big_nblocks + + ;; overwritten above by CTR_BLOCK_SAVE + vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234] + + INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%SHUF_MASK, %%ENC_DEC, big_loop_nblocks, big_loop_depth + + sub %%LENGTH, (big_loop_nblocks * 16) + cmp %%LENGTH, (big_loop_nblocks * 16) + jl %%_no_more_big_nblocks + +%%_encrypt_big_nblocks: + GHASH_ENCRYPT_Nx16_PARALLEL \ + %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \ + %%CTR_BLOCKz, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + %%GH, %%GL, %%GM, \ + %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \ + %%ENC_DEC, big_loop_nblocks, big_loop_depth, %%CTR_CHECK + + sub %%LENGTH, (big_loop_nblocks * 16) + cmp %%LENGTH, (big_loop_nblocks * 16) + jge %%_encrypt_big_nblocks + +%%_no_more_big_nblocks: + vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK) + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%GH, %%GL, %%GM, big_loop_nblocks, big_loop_depth + + or %%LENGTH, %%LENGTH + jz %%_ghash_done + +%%_message_below_big_nblocks: + + ;; Less than 256 bytes will be handled by the small message code, which + ;; can process up to 16 x blocks (16 bytes each) + cmp %%LENGTH, (16 * 16) + jge %%_large_message_path + + ;; Determine how many blocks to process + ;; - process one additional block if there is a partial block + mov %%IA1, %%LENGTH + add %%IA1, 15 + shr %%IA1, 4 + ;; %%IA1 can be in the range from 0 to 16 + + GCM_ENC_DEC_SMALL \ + %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, \ + %%LENGTH, %%IA1, %%CTR_BLOCKx, %%AAD_HASHx, %%INSTANCE_TYPE, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + %%ZTMP20, %%ZTMP21, %%ZTMP22, \ + no_zmm, no_zmm, no_zmm, \ + %%IA0, %%IA3, %%MASKREG, %%SHUF_MASK + + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + jmp %%_ghash_done + +%%_large_message_path: + ;; Determine how many blocks to process in INITIAL + ;; - process one additional block in INITIAL if there is a partial block + mov %%IA1, %%LENGTH + and %%IA1, 0xff + add %%IA1, 15 + shr %%IA1, 4 + ;; Don't allow 8 INITIAL blocks since this will + ;; be handled by the x8 partial loop. + and %%IA1, 7 + je %%_initial_num_blocks_is_0 + cmp %%IA1, 1 + je %%_initial_num_blocks_is_1 + cmp %%IA1, 2 + je %%_initial_num_blocks_is_2 + cmp %%IA1, 3 + je %%_initial_num_blocks_is_3 + cmp %%IA1, 4 + je %%_initial_num_blocks_is_4 + cmp %%IA1, 5 + je %%_initial_num_blocks_is_5 + cmp %%IA1, 6 + je %%_initial_num_blocks_is_6 + +%assign number_of_blocks 7 +%rep 8 +%%_initial_num_blocks_is_ %+ number_of_blocks: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, number_of_blocks, %%CTR_BLOCKx, %%AAD_HASHz, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \ + %%IA0, %%IA1, %%ENC_DEC, %%MASKREG, %%SHUF_MASK, no_partial_block +%if number_of_blocks != 0 + jmp %%_initial_blocks_encrypted +%endif +%assign number_of_blocks (number_of_blocks - 1) +%endrep + +%%_initial_blocks_encrypted: + vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx + + ;; move cipher blocks from intial blocks to input of by8 macro + ;; and for GHASH_LAST_8/7 + ;; - ghash value already xor'ed into block 0 + vmovdqa64 %%BLK0, %%ZTMP0 + vmovdqa64 %%BLK1, %%ZTMP1 + + ;; The entire message cannot get processed in INITIAL_BLOCKS + ;; - GCM_ENC_DEC_SMALL handles up to 16 blocks + ;; - INITIAL_BLOCKS processes up to 15 blocks + ;; - no need to check for zero length at this stage + + ;; In order to have only one reduction at the end + ;; start HASH KEY pointer needs to be determined based on length and + ;; call type. + ;; - note that 8 blocks are already ciphered in INITIAL_BLOCKS and + ;; subtracted from LENGTH + lea %%IA1, [%%LENGTH + (8 * 16)] + add %%IA1, 15 + and %%IA1, 0x3f0 +%ifidn %%INSTANCE_TYPE, multi_call + ;; if partial block and multi_call then change hash key start by one + mov %%IA0, %%LENGTH + and %%IA0, 15 + add %%IA0, 15 + and %%IA0, 16 + sub %%IA1, %%IA0 +%endif + lea %%HASHK_PTR, [%%GDATA_KEY + HashKey + 16] + sub %%HASHK_PTR, %%IA1 + ;; HASHK_PTR + ;; - points at the first hash key to start GHASH with + ;; - needs to be updated as the message is processed (incremented) + + ;; pre-load constants + vmovdqa64 %%ADD8BE, [rel ddq_addbe_8888] + vmovdqa64 %%ADD8LE, [rel ddq_add_8888] + vpxorq %%GH, %%GH + vpxorq %%GL, %%GL + vpxorq %%GM, %%GM + + ;; prepare counter 8 blocks + vshufi64x2 %%CTR_BLOCKz, %%CTR_BLOCKz, %%CTR_BLOCKz, 0 + vpaddd %%CTR_BLOCK2z, %%CTR_BLOCKz, [rel ddq_add_5678] + vpaddd %%CTR_BLOCKz, %%CTR_BLOCKz, [rel ddq_add_1234] + vpshufb %%CTR_BLOCKz, %%SHUF_MASK + vpshufb %%CTR_BLOCK2z, %%SHUF_MASK + + ;; Process 7 full blocks plus a partial block + cmp %%LENGTH, 128 + jl %%_encrypt_by_8_partial + +%%_encrypt_by_8_parallel: + ;; in_order vs. out_order is an optimization to increment the counter + ;; without shuffling it back into little endian. + ;; %%CTR_CHECK keeps track of when we need to increment in order so + ;; that the carry is handled correctly. + + vmovq %%CTR_CHECK, XWORD(%%CTR_BLOCK_SAVE) + +%%_encrypt_by_8_new: + and WORD(%%CTR_CHECK), 255 + add WORD(%%CTR_CHECK), 8 + + vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)] + vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)] + + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z,\ + %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \ + out_order, %%ENC_DEC, full, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \ + %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \ + %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \ + %%MASKREG, no_reduction, %%GL, %%GH, %%GM + + add %%HASHK_PTR, (8 * 16) + add %%DATA_OFFSET, 128 + sub %%LENGTH, 128 + jz %%_encrypt_done + + cmp WORD(%%CTR_CHECK), (256 - 8) + jae %%_encrypt_by_8 + + vpaddd %%CTR_BLOCKz, %%ADD8BE + vpaddd %%CTR_BLOCK2z, %%ADD8BE + + cmp %%LENGTH, 128 + jl %%_encrypt_by_8_partial + + jmp %%_encrypt_by_8_new + +%%_encrypt_by_8: + vpshufb %%CTR_BLOCKz, %%SHUF_MASK + vpshufb %%CTR_BLOCK2z, %%SHUF_MASK + vpaddd %%CTR_BLOCKz, %%ADD8LE + vpaddd %%CTR_BLOCK2z, %%ADD8LE + vpshufb %%CTR_BLOCKz, %%SHUF_MASK + vpshufb %%CTR_BLOCK2z, %%SHUF_MASK + + cmp %%LENGTH, 128 + jge %%_encrypt_by_8_new + +%%_encrypt_by_8_partial: + ;; Test to see if we need a by 8 with partial block. At this point + ;; bytes remaining should be either zero or between 113-127. + ;; 'in_order' shuffle needed to align key for partial block xor. + ;; 'out_order' is a little faster because it avoids extra shuffles. + ;; - counter blocks for the next 8 blocks are prepared and in BE format + ;; - we can go ahead with out_order scenario + + vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)] + vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)] + + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z, \ + %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \ + out_order, %%ENC_DEC, partial, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \ + %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \ + %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \ + %%MASKREG, no_reduction, %%GL, %%GH, %%GM + + add %%HASHK_PTR, (8 * 16) + add %%DATA_OFFSET, (128 - 16) + sub %%LENGTH, (128 - 16) + +%ifidn %%INSTANCE_TYPE, multi_call + mov [%%GDATA_CTX + PBlockLen], %%LENGTH + vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%AES_PARTIAL_BLOCK +%endif + +%%_encrypt_done: + ;; Extract the last counter block in LE format + vextracti32x4 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCK2z, 3 + vpshufb XWORD(%%CTR_BLOCK_SAVE), XWORD(%%SHUF_MASK) + + ;; GHASH last cipher text blocks in xmm1-xmm8 + ;; - if block 8th is partial in a multi-call path then skip the block +%ifidn %%INSTANCE_TYPE, multi_call + cmp qword [%%GDATA_CTX + PBlockLen], 0 + jz %%_hash_last_8 + + ;; save the 8th partial block as GHASH_LAST_7 will clobber %%BLK1 + vextracti32x4 XWORD(%%ZTMP7), %%BLK1, 3 + + GHASH_LAST_7 %%GDATA_KEY, %%BLK1, %%BLK0, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \ + %%AAD_HASHx, %%MASKREG, %%IA0, %%GH, %%GL, %%GM + + ;; XOR the partial word into the hash + vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP7) + jmp %%_ghash_done +%%_hash_last_8: +%endif + GHASH_LAST_8 %%GDATA_KEY, %%BLK1, %%BLK0, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%AAD_HASHx, \ + %%GH, %%GL, %%GM +%%_ghash_done: + vmovdqu64 [%%GDATA_CTX + CurCount], XWORD(%%CTR_BLOCK_SAVE) + vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASHx +%%_enc_dec_done: + +%endmacro ; GCM_ENC_DEC + +;;; =========================================================================== +;;; =========================================================================== +;;; Encrypt/decrypt the initial 16 blocks +%macro INITIAL_BLOCKS_16 22 +%define %%IN %1 ; [in] input buffer +%define %%OUT %2 ; [in] output buffer +%define %%KP %3 ; [in] pointer to expanded keys +%define %%DATA_OFFSET %4 ; [in] data offset +%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits) +%define %%CTR %6 ; [in] ZMM with CTR BE blocks 4x128 bits +%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check +%define %%ADDBE_4x4 %8 ; [in] ZMM 4x128bits with value 4 (big endian) +%define %%ADDBE_1234 %9 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) +%define %%T0 %10 ; [clobered] temporary ZMM register +%define %%T1 %11 ; [clobered] temporary ZMM register +%define %%T2 %12 ; [clobered] temporary ZMM register +%define %%T3 %13 ; [clobered] temporary ZMM register +%define %%T4 %14 ; [clobered] temporary ZMM register +%define %%T5 %15 ; [clobered] temporary ZMM register +%define %%T6 %16 ; [clobered] temporary ZMM register +%define %%T7 %17 ; [clobered] temporary ZMM register +%define %%T8 %18 ; [clobered] temporary ZMM register +%define %%SHUF_MASK %19 ; [in] ZMM with BE/LE shuffle mask +%define %%ENC_DEC %20 ; [in] ENC (encrypt) or DEC (decrypt) selector +%define %%BLK_OFFSET %21 ; [in] stack frame offset to ciphered blocks +%define %%DATA_DISPL %22 ; [in] fixed numerical data displacement/offset + +%define %%B00_03 %%T5 +%define %%B04_07 %%T6 +%define %%B08_11 %%T7 +%define %%B12_15 %%T8 + +%assign stack_offset (%%BLK_OFFSET) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; prepare counter blocks + + cmp BYTE(%%CTR_CHECK), (256 - 16) + jae %%_next_16_overflow + vpaddd %%B00_03, %%CTR, %%ADDBE_1234 + vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4 + vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4 + vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4 + jmp %%_next_16_ok +%%_next_16_overflow: + vpshufb %%CTR, %%CTR, %%SHUF_MASK + vmovdqa64 %%B12_15, [rel ddq_add_4444] + vpaddd %%B00_03, %%CTR, [rel ddq_add_1234] + vpaddd %%B04_07, %%B00_03, %%B12_15 + vpaddd %%B08_11, %%B04_07, %%B12_15 + vpaddd %%B12_15, %%B08_11, %%B12_15 + vpshufb %%B00_03, %%SHUF_MASK + vpshufb %%B04_07, %%SHUF_MASK + vpshufb %%B08_11, %%SHUF_MASK + vpshufb %%B12_15, %%SHUF_MASK +%%_next_16_ok: + vshufi64x2 %%CTR, %%B12_15, %%B12_15, 1111_1111b + add BYTE(%%CTR_CHECK), 16 + + ;; === load 16 blocks of data + VX512LDR %%T0, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*0)] + VX512LDR %%T1, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*1)] + VX512LDR %%T2, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*2)] + VX512LDR %%T3, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*3)] + + ;; move to AES encryption rounds +%assign i 0 + vbroadcastf64x2 %%T4, [%%KP + (16*i)] + vpxorq %%B00_03, %%B00_03, %%T4 + vpxorq %%B04_07, %%B04_07, %%T4 + vpxorq %%B08_11, %%B08_11, %%T4 + vpxorq %%B12_15, %%B12_15, %%T4 +%assign i (i + 1) + +%rep NROUNDS + vbroadcastf64x2 %%T4, [%%KP + (16*i)] + vaesenc %%B00_03, %%B00_03, %%T4 + vaesenc %%B04_07, %%B04_07, %%T4 + vaesenc %%B08_11, %%B08_11, %%T4 + vaesenc %%B12_15, %%B12_15, %%T4 +%assign i (i + 1) +%endrep + + vbroadcastf64x2 %%T4, [%%KP + (16*i)] + vaesenclast %%B00_03, %%B00_03, %%T4 + vaesenclast %%B04_07, %%B04_07, %%T4 + vaesenclast %%B08_11, %%B08_11, %%T4 + vaesenclast %%B12_15, %%B12_15, %%T4 + + ;; xor against text + vpxorq %%B00_03, %%B00_03, %%T0 + vpxorq %%B04_07, %%B04_07, %%T1 + vpxorq %%B08_11, %%B08_11, %%T2 + vpxorq %%B12_15, %%B12_15, %%T3 + + ;; store + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*0)], %%B00_03 + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*1)], %%B04_07 + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*2)], %%B08_11 + VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*3)], %%B12_15 + +%ifidn %%ENC_DEC, DEC + ;; decryption - cipher text needs to go to GHASH phase + vpshufb %%B00_03, %%T0, %%SHUF_MASK + vpshufb %%B04_07, %%T1, %%SHUF_MASK + vpshufb %%B08_11, %%T2, %%SHUF_MASK + vpshufb %%B12_15, %%T3, %%SHUF_MASK +%else + ;; encryption + vpshufb %%B00_03, %%B00_03, %%SHUF_MASK + vpshufb %%B04_07, %%B04_07, %%SHUF_MASK + vpshufb %%B08_11, %%B08_11, %%SHUF_MASK + vpshufb %%B12_15, %%B12_15, %%SHUF_MASK +%endif + +%ifnidn %%GHASH, no_ghash + ;; === xor cipher block 0 with GHASH for the next GHASH round + vpxorq %%B00_03, %%B00_03, %%GHASH +%endif + + vmovdqa64 [rsp + stack_offset + (0 * 64)], %%B00_03 + vmovdqa64 [rsp + stack_offset + (1 * 64)], %%B04_07 + vmovdqa64 [rsp + stack_offset + (2 * 64)], %%B08_11 + vmovdqa64 [rsp + stack_offset + (3 * 64)], %%B12_15 +%endmacro ;INITIAL_BLOCKS_16 + +;;; =========================================================================== +;;; =========================================================================== +;;; Encrypt the initial N x 16 blocks +;;; - A x 16 blocks are encrypted/decrypted first (pipeline depth) +;;; - B x 16 blocks are encrypted/decrypted and previous A x 16 are ghashed +;;; - A + B = N +%macro INITIAL_BLOCKS_Nx16 39 +%define %%IN %1 ; [in] input buffer +%define %%OUT %2 ; [in] output buffer +%define %%KP %3 ; [in] pointer to expanded keys +%define %%DATA_OFFSET %4 ; [in/out] data offset +%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits) +%define %%CTR %6 ; [in/out] ZMM with CTR: in - LE & 128b; out - BE & 4x128b +%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check +%define %%T0 %8 ; [clobered] temporary ZMM register +%define %%T1 %9 ; [clobered] temporary ZMM register +%define %%T2 %10 ; [clobered] temporary ZMM register +%define %%T3 %11 ; [clobered] temporary ZMM register +%define %%T4 %12 ; [clobered] temporary ZMM register +%define %%T5 %13 ; [clobered] temporary ZMM register +%define %%T6 %14 ; [clobered] temporary ZMM register +%define %%T7 %15 ; [clobered] temporary ZMM register +%define %%T8 %16 ; [clobered] temporary ZMM register +%define %%T9 %17 ; [clobered] temporary ZMM register +%define %%T10 %18 ; [clobered] temporary ZMM register +%define %%T11 %19 ; [clobered] temporary ZMM register +%define %%T12 %20 ; [clobered] temporary ZMM register +%define %%T13 %21 ; [clobered] temporary ZMM register +%define %%T14 %22 ; [clobered] temporary ZMM register +%define %%T15 %23 ; [clobered] temporary ZMM register +%define %%T16 %24 ; [clobered] temporary ZMM register +%define %%T17 %25 ; [clobered] temporary ZMM register +%define %%T18 %26 ; [clobered] temporary ZMM register +%define %%T19 %27 ; [clobered] temporary ZMM register +%define %%T20 %28 ; [clobered] temporary ZMM register +%define %%T21 %29 ; [clobered] temporary ZMM register +%define %%T22 %30 ; [clobered] temporary ZMM register +%define %%GH %31 ; [out] ZMM ghash sum (high) +%define %%GL %32 ; [out] ZMM ghash sum (low) +%define %%GM %33 ; [out] ZMM ghash sum (middle) +%define %%ADDBE_4x4 %34 ; [in] ZMM 4x128bits with value 4 (big endian) +%define %%ADDBE_1234 %35 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) +%define %%SHUF_MASK %36 ; [in] ZMM with BE/LE shuffle mask +%define %%ENC_DEC %37 ; [in] ENC (encrypt) or DEC (decrypt) selector +%define %%NBLOCKS %38 ; [in] number of blocks: multiple of 16 +%define %%DEPTH_BLK %39 ; [in] pipline depth, number of blocks (mulitple of 16) + +%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign hkey_offset HashKey_ %+ %%NBLOCKS +%assign data_in_out_offset 0 + + ;; set up CTR_CHECK + vmovd DWORD(%%CTR_CHECK), XWORD(%%CTR) + and DWORD(%%CTR_CHECK), 255 + + ;; in LE format after init, convert to BE + vshufi64x2 %%CTR, %%CTR, %%CTR, 0 + vpshufb %%CTR, %%CTR, %%SHUF_MASK + + ;; ==== AES lead in + + ;; first 16 blocks - just cipher + INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \ + %%GHASH, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \ + %%T0, %%T1, %%T2, %%T3, %%T4, \ + %%T5, %%T6, %%T7, %%T8, \ + %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) + +%if (%%DEPTH_BLK > 16) +%rep ((%%DEPTH_BLK - 16) / 16) + INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \ + no_ghash, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \ + %%T0, %%T1, %%T2, %%T3, %%T4, \ + %%T5, %%T6, %%T7, %%T8, \ + %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%endrep +%endif + + ;; ==== GHASH + AES follows + + ;; first 16 blocks stitched + GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \ + %%T0, %%T1, %%T2, %%T3, \ + %%T4, %%T5, %%T6, %%T7, \ + %%T8, %%T9, %%T10, %%T11,\ + %%T12, %%T13, %%T14, %%T15,\ + %%T16, %%T17, %%T18, %%T19, \ + %%T20, %%T21, %%T22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GL, %%GH, %%GM, \ + first_time, %%ENC_DEC, data_in_out_offset, no_ghash_in + +%if ((%%NBLOCKS - %%DEPTH_BLK) > 16) +%rep ((%%NBLOCKS - %%DEPTH_BLK - 16) / 16) +%assign ghashin_offset (ghashin_offset + (16 * 16)) +%assign hkey_offset (hkey_offset + (16 * 16)) +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) + + ;; mid 16 blocks - stitched + GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \ + %%T0, %%T1, %%T2, %%T3, \ + %%T4, %%T5, %%T6, %%T7, \ + %%T8, %%T9, %%T10, %%T11,\ + %%T12, %%T13, %%T14, %%T15,\ + %%T16, %%T17, %%T18, %%T19, \ + %%T20, %%T21, %%T22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GL, %%GH, %%GM, \ + no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in +%endrep +%endif + add %%DATA_OFFSET, (%%NBLOCKS * 16) + +%endmacro ;INITIAL_BLOCKS_Nx16 + +;;; =========================================================================== +;;; =========================================================================== +;;; GHASH the last 16 blocks of cipher text (last part of by 32/64/128 code) +%macro GHASH_LAST_Nx16 23 +%define %%KP %1 ; [in] pointer to expanded keys +%define %%GHASH %2 ; [out] ghash output +%define %%T1 %3 ; [clobbered] temporary ZMM +%define %%T2 %4 ; [clobbered] temporary ZMM +%define %%T3 %5 ; [clobbered] temporary ZMM +%define %%T4 %6 ; [clobbered] temporary ZMM +%define %%T5 %7 ; [clobbered] temporary ZMM +%define %%T6 %8 ; [clobbered] temporary ZMM +%define %%T7 %9 ; [clobbered] temporary ZMM +%define %%T8 %10 ; [clobbered] temporary ZMM +%define %%T9 %11 ; [clobbered] temporary ZMM +%define %%T10 %12 ; [clobbered] temporary ZMM +%define %%T11 %13 ; [clobbered] temporary ZMM +%define %%T12 %14 ; [clobbered] temporary ZMM +%define %%T13 %15 ; [clobbered] temporary ZMM +%define %%T14 %16 ; [clobbered] temporary ZMM +%define %%T15 %17 ; [clobbered] temporary ZMM +%define %%T16 %18 ; [clobbered] temporary ZMM +%define %%GH %19 ; [in/cloberred] ghash sum (high) +%define %%GL %20 ; [in/cloberred] ghash sum (low) +%define %%GM %21 ; [in/cloberred] ghash sum (medium) +%define %%LOOP_BLK %22 ; [in] numerical number of blocks handled by the loop +%define %%DEPTH_BLK %23 ; [in] numerical number, pipeline depth (ghash vs aes) + +%define %%T0H %%T1 +%define %%T0L %%T2 +%define %%T0M1 %%T3 +%define %%T0M2 %%T4 + +%define %%T1H %%T5 +%define %%T1L %%T6 +%define %%T1M1 %%T7 +%define %%T1M2 %%T8 + +%define %%T2H %%T9 +%define %%T2L %%T10 +%define %%T2M1 %%T11 +%define %%T2M2 %%T12 + +%define %%BLK1 %%T13 +%define %%BLK2 %%T14 + +%define %%HK1 %%T15 +%define %%HK2 %%T16 + +%assign hashk HashKey_ %+ %%DEPTH_BLK +%assign cipher_blk (STACK_LOCAL_OFFSET + ((%%LOOP_BLK - %%DEPTH_BLK) * 16)) + + ;; load cipher blocks and ghash keys + vmovdqa64 %%BLK1, [rsp + cipher_blk] + vmovdqa64 %%BLK2, [rsp + cipher_blk + 64] + vmovdqu64 %%HK1, [%%KP + hashk] + vmovdqu64 %%HK2, [%%KP + hashk + 64] + ;; ghash blocks 0-3 + vpclmulqdq %%T0H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1 + vpclmulqdq %%T0L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0 + vpclmulqdq %%T0M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0 + vpclmulqdq %%T0M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1 + ;; ghash blocks 4-7 + vpclmulqdq %%T1H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1 + vpclmulqdq %%T1L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0 + vpclmulqdq %%T1M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0 + vpclmulqdq %%T1M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1 + vpternlogq %%T0H, %%T1H, %%GH, 0x96 ; T0H = T0H + T1H + GH + vpternlogq %%T0L, %%T1L, %%GL, 0x96 ; T0L = T0L + T1L + GL + vpternlogq %%T0M1, %%T1M1, %%GM, 0x96 ; T0M1 = T0M1 + T1M1 + GM + vpxorq %%T0M2, %%T0M2, %%T1M2 ; T0M2 = T0M2 + T1M2 + +%rep ((%%DEPTH_BLK - 8) / 8) +%assign hashk (hashk + 128) +%assign cipher_blk (cipher_blk + 128) + + ;; remaining blocks + ;; load next 8 cipher blocks and corresponding ghash keys + vmovdqa64 %%BLK1, [rsp + cipher_blk] + vmovdqa64 %%BLK2, [rsp + cipher_blk + 64] + vmovdqu64 %%HK1, [%%KP + hashk] + vmovdqu64 %%HK2, [%%KP + hashk + 64] + ;; ghash blocks 0-3 + vpclmulqdq %%T1H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1 + vpclmulqdq %%T1L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0 + vpclmulqdq %%T1M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0 + vpclmulqdq %%T1M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1 + ;; ghash blocks 4-7 + vpclmulqdq %%T2H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1 + vpclmulqdq %%T2L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0 + vpclmulqdq %%T2M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0 + vpclmulqdq %%T2M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1 + ;; update sums + vpternlogq %%T0H, %%T1H, %%T2H, 0x96 ; TH = T0H + T1H + T2H + vpternlogq %%T0L, %%T1L, %%T2L, 0x96 ; TL = T0L + T1L + T2L + vpternlogq %%T0M1, %%T1M1, %%T2M1, 0x96 ; TM1 = T0M1 + T1M1 xor T2M1 + vpternlogq %%T0M2, %%T1M2, %%T2M2, 0x96 ; TM2 = T0M2 + T1M1 xor T2M2 +%endrep + + ;; integrate TM into TH and TL + vpxorq %%T0M1, %%T0M1, %%T0M2 + vpsrldq %%T1M1, %%T0M1, 8 + vpslldq %%T1M2, %%T0M1, 8 + vpxorq %%T0H, %%T0H, %%T1M1 + vpxorq %%T0L, %%T0L, %%T1M2 + + ;; add TH and TL 128-bit words horizontally + VHPXORI4x128 %%T0H, %%T2M1 + VHPXORI4x128 %%T0L, %%T2M2 + + ;; reduction + vmovdqa64 %%HK1, [rel POLY2] + VCLMUL_REDUCE %%GHASH, %%HK1, %%T0H, %%T0L, %%T0M1, %%T0M2 +%endmacro + +;;; =========================================================================== +;;; =========================================================================== +;;; Encrypt & ghash multiples of 16 blocks + +%macro GHASH_ENCRYPT_Nx16_PARALLEL 39 +%define %%IN %1 ; [in] input buffer +%define %%OUT %2 ; [in] output buffer +%define %%GDATA_KEY %3 ; [in] pointer to expanded keys +%define %%DATA_OFFSET %4 ; [in/out] data offset +%define %%CTR_BE %5 ; [in/out] ZMM last counter block +%define %%SHFMSK %6 ; [in] ZMM with byte swap mask for pshufb +%define %%ZT0 %7 ; [clobered] temporary ZMM register +%define %%ZT1 %8 ; [clobered] temporary ZMM register +%define %%ZT2 %9 ; [clobered] temporary ZMM register +%define %%ZT3 %10 ; [clobered] temporary ZMM register +%define %%ZT4 %11 ; [clobered] temporary ZMM register +%define %%ZT5 %12 ; [clobered] temporary ZMM register +%define %%ZT6 %13 ; [clobered] temporary ZMM register +%define %%ZT7 %14 ; [clobered] temporary ZMM register +%define %%ZT8 %15 ; [clobered] temporary ZMM register +%define %%ZT9 %16 ; [clobered] temporary ZMM register +%define %%ZT10 %17 ; [clobered] temporary ZMM register +%define %%ZT11 %18 ; [clobered] temporary ZMM register +%define %%ZT12 %19 ; [clobered] temporary ZMM register +%define %%ZT13 %20 ; [clobered] temporary ZMM register +%define %%ZT14 %21 ; [clobered] temporary ZMM register +%define %%ZT15 %22 ; [clobered] temporary ZMM register +%define %%ZT16 %23 ; [clobered] temporary ZMM register +%define %%ZT17 %24 ; [clobered] temporary ZMM register +%define %%ZT18 %25 ; [clobered] temporary ZMM register +%define %%ZT19 %26 ; [clobered] temporary ZMM register +%define %%ZT20 %27 ; [clobered] temporary ZMM register +%define %%ZT21 %28 ; [clobered] temporary ZMM register +%define %%ZT22 %29 ; [clobered] temporary ZMM register +%define %%GTH %30 ; [in/out] ZMM GHASH sum (high) +%define %%GTL %31 ; [in/out] ZMM GHASH sum (low) +%define %%GTM %32 ; [in/out] ZMM GHASH sum (medium) +%define %%ADDBE_4x4 %33 ; [in] ZMM 4x128bits with value 4 (big endian) +%define %%ADDBE_1234 %34 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian) +%define %%GHASH %35 ; [clobbered] ZMM with intermidiate GHASH value +%define %%ENC_DEC %36 ; [in] ENC (encrypt) or DEC (decrypt) selector +%define %%NUM_BLOCKS %37 ; [in] number of blocks to process in the loop +%define %%DEPTH_BLK %38 ; [in] pipeline depth in blocks +%define %%CTR_CHECK %39 ; [in/out] counter to check byte overflow + +%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign ghashin_offset (STACK_LOCAL_OFFSET + ((%%NUM_BLOCKS - %%DEPTH_BLK) * 16)) +%assign hkey_offset HashKey_ %+ %%DEPTH_BLK +%assign data_in_out_offset 0 + + ;; mid 16 blocks +%if (%%DEPTH_BLK > 16) +%rep ((%%DEPTH_BLK - 16) / 16) + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign ghashin_offset (ghashin_offset + (16 * 16)) +%assign hkey_offset (hkey_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%endrep +%endif + + ;; 16 blocks with reduction + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + HashKey_16, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + final_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16)) +%assign hkey_offset HashKey_ %+ %%NUM_BLOCKS + + ;; === xor cipher block 0 with GHASH (ZT4) + vmovdqa64 %%GHASH, %%ZT4 + + ;; start the pipeline again + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + first_time, %%ENC_DEC, data_in_out_offset, %%GHASH + +%if ((%%NUM_BLOCKS - %%DEPTH_BLK) > 16) +%rep ((%%NUM_BLOCKS - %%DEPTH_BLK - 16 ) / 16) + +%assign aesout_offset (aesout_offset + (16 * 16)) +%assign data_in_out_offset (data_in_out_offset + (16 * 16)) +%assign ghashin_offset (ghashin_offset + (16 * 16)) +%assign hkey_offset (hkey_offset + (16 * 16)) + + GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \ + %%CTR_BE, %%CTR_CHECK, \ + hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ + %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11,\ + %%ZT12, %%ZT13, %%ZT14, %%ZT15,\ + %%ZT16, %%ZT17, %%ZT18, %%ZT19, \ + %%ZT20, %%ZT21, %%ZT22, \ + %%ADDBE_4x4, %%ADDBE_1234, \ + %%GTL, %%GTH, %%GTM, \ + no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in +%endrep +%endif + + add %%DATA_OFFSET, (%%NUM_BLOCKS * 16) + +%endmacro ;GHASH_ENCRYPT_Nx16_PARALLEL +;;; =========================================================================== + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. +; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC). +; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%INSTANCE_TYPE %6 +%define %%PLAIN_CYPH_LEN rax + + vmovdqu xmm13, [%%GDATA_KEY + HashKey] + ;; Start AES as early as possible + vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0) + +%ifidn %%INSTANCE_TYPE, multi_call + ;; If the GCM function is called as a single function call rather + ;; than invoking the individual parts (init, update, finalize) we + ;; can remove a write to read dependency on AadHash. + vmovdqu xmm14, [%%GDATA_CTX + AadHash] + + ;; Encrypt the final partial block. If we did this as a single call then + ;; the partial block was handled in the main GCM_ENC_DEC macro. + mov r12, [%%GDATA_CTX + PBlockLen] + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + vmovdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + +%endif + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + vmovd xmm15, r12d ; len(A) in xmm15 + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) + vmovq xmm1, %%PLAIN_CYPH_LEN + vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C) + + vpxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 + vpshufb xmm14, [rel SHUF_MASK] ; perform a 16Byte swap + + vpxor xmm9, xmm9, xmm14 + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + + cmp r11, 8 + je %%_T_8 + + simd_store_avx_15 r10, xmm9, r11, r12, rax + jmp %%_return_T_done +%%_T_8: + vmovq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + vmovq rax, xmm9 + mov [r10], rax + vpsrldq xmm9, xmm9, 8 + vmovd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done +%%_T_16: + vmovdqu [r10], xmm9 + +%%_return_T_done: + +%ifdef SAFE_DATA + ;; Clear sensitive data from context structure + vpxor xmm0, xmm0 + vmovdqu [%%GDATA_CTX + AadHash], xmm0 + vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0 +%endif +%endmacro ; GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_vaes_avx512 / +; aes_gcm_precomp_192_vaes_avx512 / +; aes_gcm_precomp_256_vaes_avx512 +; (struct gcm_key_data *key_data) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(precomp,_),function,) +FN_NAME(precomp,_): +;; Parameter is passed through register +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_precomp +%endif + + FUNC_SAVE + + vpxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey + + vpshufb xmm6, [rel SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, xmm6, 1 + vpsrlq xmm2, xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [rel TWOONE] + vpand xmm2, xmm2, [rel POLY] + vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + + FUNC_RESTORE +exit_precomp: + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_vaes_avx512 / aes_gcm_init_192_vaes_avx512 / aes_gcm_init_256_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(init,_),function,) +FN_NAME(init,_): + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_init + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_init + + ;; Check IV != NULL + cmp arg3, 0 + jz exit_init + + ;; Check if aad_len == 0 + cmp arg5, 0 + jz skip_aad_check_init + + ;; Check aad != NULL (aad_len != 0) + cmp arg4, 0 + jz exit_init + +skip_aad_check_init: +%endif + GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12, k1, xmm14, xmm2, \ + zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10 + +exit_init: + + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_vaes_avx512 / aes_gcm_enc_192_update_vaes_avx512 / +; aes_gcm_enc_256_update_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_update_),function,) +FN_NAME(enc,_update_): + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_update_enc + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_update_enc + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_update_enc + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_update_enc + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_update_enc + +skip_in_out_check_update_enc: +%endif + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call + +exit_update_enc: + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_update_vaes_avx512 / aes_gcm_dec_192_update_vaes_avx512 / +; aes_gcm_dec_256_update_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_update_),function,) +FN_NAME(dec,_update_): + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_update_dec + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_update_dec + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_update_dec + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_update_dec + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_update_dec + +skip_in_out_check_update_dec: +%endif + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call + +exit_update_dec: + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_vaes_avx512 / aes_gcm_enc_192_finalize_vaes_avx512 / +; aes_gcm_enc_256_finalize_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_finalize_),function,) +FN_NAME(enc,_finalize_): + +;; All parameters are passed through registers +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_enc_fin + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_enc_fin + + ;; Check auth_tag != NULL + cmp arg3, 0 + jz exit_enc_fin + + ;; Check auth_tag_len == 0 or > 16 + cmp arg4, 0 + jz exit_enc_fin + + cmp arg4, 16 + ja exit_enc_fin +%endif + + FUNC_SAVE + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call + + FUNC_RESTORE + +exit_enc_fin: + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_vaes_avx512 / aes_gcm_dec_192_finalize_vaes_avx512 +; aes_gcm_dec_256_finalize_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_finalize_),function,) +FN_NAME(dec,_finalize_): + +;; All parameters are passed through registers +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_dec_fin + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_dec_fin + + ;; Check auth_tag != NULL + cmp arg3, 0 + jz exit_dec_fin + + ;; Check auth_tag_len == 0 or > 16 + cmp arg4, 0 + jz exit_dec_fin + + cmp arg4, 16 + ja exit_dec_fin +%endif + + FUNC_SAVE + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call + + FUNC_RESTORE + +exit_dec_fin: + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_vaes_avx512 / aes_gcm_enc_192_vaes_avx512 / aes_gcm_enc_256_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_),function,) +FN_NAME(enc,_): + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_enc + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_enc + + ;; Check IV != NULL + cmp arg6, 0 + jz exit_enc + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz exit_enc + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz exit_enc + + cmp arg10, 16 + ja exit_enc + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_enc + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_enc + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_enc + +skip_in_out_check_enc: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_enc + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz exit_enc + +skip_aad_check_enc: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \ + zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10 + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call + +exit_enc: + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_vaes_avx512 / aes_gcm_dec_192_vaes_avx512 / aes_gcm_dec_256_vaes_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_),function,) +FN_NAME(dec,_): + + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz exit_dec + + ;; Check context_data != NULL + cmp arg2, 0 + jz exit_dec + + ;; Check IV != NULL + cmp arg6, 0 + jz exit_dec + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz exit_dec + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz exit_dec + + cmp arg10, 16 + ja exit_dec + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_dec + + ;; Check out != NULL (plaintext_len != 0) + cmp arg3, 0 + jz exit_dec + + ;; Check in != NULL (plaintext_len != 0) + cmp arg4, 0 + jz exit_dec + +skip_in_out_check_dec: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_dec + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz exit_dec + +skip_aad_check_dec: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \ + zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10 + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call + +exit_dec: + FUNC_RESTORE + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif |