Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm')
-rw-r--r-- | src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm | 1524
1 file changed, 1524 insertions, 0 deletions
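The diff below adds a VAES/AVX512 implementation of AES counter mode (CNTR) plus a bit-length variant (CNTR_BIT), ciphering up to 16 blocks per iteration. As orientation, here is a minimal one-block-at-a-time C reference model of the counter construction used for the 12-byte IV case (nonce/IV in bytes 0-11, a 32-bit big-endian block counter starting at 1 in bytes 12-15, matching initial_12_IV_counter in the code). The aes_block_fn callback and all names below are illustrative, not part of the library.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical caller-supplied single-block AES encrypt primitive. */
typedef void (*aes_block_fn)(const uint8_t in[16], uint8_t out[16],
                             const void *expanded_keys);

/* CTR reference: counter block = 12-byte nonce/IV || 32-bit BE counter (from 1). */
void ctr_ref_12byte_iv(const uint8_t iv[12], const void *keys, aes_block_fn aes,
                       const uint8_t *in, uint8_t *out, size_t len)
{
    uint8_t ctr[16] = {0}, ks[16];
    uint32_t block = 1;

    memcpy(ctr, iv, 12);
    while (len > 0) {
        size_t n = len < 16 ? len : 16;

        ctr[12] = (uint8_t)(block >> 24);   /* big-endian 32-bit counter */
        ctr[13] = (uint8_t)(block >> 16);
        ctr[14] = (uint8_t)(block >> 8);
        ctr[15] = (uint8_t)block;
        aes(ctr, ks, keys);                 /* keystream block */

        for (size_t i = 0; i < n; i++)      /* XOR keystream with data */
            out[i] = in[i] ^ ks[i];

        in += n; out += n; len -= n;
        block++;
    }
}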
diff --git a/src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm b/src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm new file mode 100644 index 000000000..50ff86b6e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx512/cntr_vaes_avx512.asm @@ -0,0 +1,1524 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2019, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%include "include/os.asm" +%include "include/reg_sizes.asm" +%include "mb_mgr_datastruct.asm" +%include "job_aes_hmac.asm" +%include "include/memcpy.asm" + +%include "include/aes_common.asm" +%include "include/const.inc" + +section .data +default rel + +align 16 +ONE: + dq 0x0000000000000001, 0x0000000000000000 + +align 64 +SHUF_MASK: + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + dq 0x08090A0B0C0D0E0F, 0x0001020304050607 + +align 64 +ddq_add_13_16: + dq 0x000000000000000d, 0x0000000000000000 + dq 0x000000000000000e, 0x0000000000000000 + dq 0x000000000000000f, 0x0000000000000000 + dq 0x0000000000000010, 0x0000000000000000 + +align 64 +ddq_add_9_12: + dq 0x0000000000000009, 0x0000000000000000 + dq 0x000000000000000a, 0x0000000000000000 + dq 0x000000000000000b, 0x0000000000000000 + dq 0x000000000000000c, 0x0000000000000000 + +align 64 +ddq_add_5_8: + dq 0x0000000000000005, 0x0000000000000000 + dq 0x0000000000000006, 0x0000000000000000 + dq 0x0000000000000007, 0x0000000000000000 + dq 0x0000000000000008, 0x0000000000000000 + +align 64 +ddq_add_1_4: + dq 0x0000000000000001, 0x0000000000000000 + dq 0x0000000000000002, 0x0000000000000000 + dq 0x0000000000000003, 0x0000000000000000 + dq 0x0000000000000004, 0x0000000000000000 + +align 64 +ddq_add_12_15: + dq 0x000000000000000c, 0x0000000000000000 + dq 0x000000000000000d, 0x0000000000000000 + dq 0x000000000000000e, 0x0000000000000000 + dq 0x000000000000000f, 0x0000000000000000 + +align 64 +ddq_add_8_11: + dq 0x0000000000000008, 0x0000000000000000 + dq 0x0000000000000009, 0x0000000000000000 + dq 0x000000000000000a, 0x0000000000000000 + dq 0x000000000000000b, 0x0000000000000000 + +align 64 +ddq_add_4_7: + dq 0x0000000000000004, 0x0000000000000000 + dq 0x0000000000000005, 0x0000000000000000 + dq 0x0000000000000006, 0x0000000000000000 + dq 0x0000000000000007, 0x0000000000000000 + +align 64 +ddq_add_0_3: + dq 0x0000000000000000, 0x0000000000000000 + dq 0x0000000000000001, 0x0000000000000000 + dq 0x0000000000000002, 0x0000000000000000 + dq 0x0000000000000003, 0x0000000000000000 + +align 64 +ddq_add_16: + dq 0x0000000000000010, 0x0000000000000000 + dq 0x0000000000000010, 0x0000000000000000 + dq 0x0000000000000010, 0x0000000000000000 + dq 0x0000000000000010, 0x0000000000000000 + +align 64 +byte64_len_to_mask_table: + dq 0x0000000000000000, 0x0000000000000001 + dq 0x0000000000000003, 0x0000000000000007 + dq 0x000000000000000f, 0x000000000000001f + dq 0x000000000000003f, 0x000000000000007f + dq 0x00000000000000ff, 0x00000000000001ff + dq 0x00000000000003ff, 0x00000000000007ff + dq 0x0000000000000fff, 0x0000000000001fff + dq 0x0000000000003fff, 0x0000000000007fff + dq 0x000000000000ffff, 0x000000000001ffff + dq 0x000000000003ffff, 0x000000000007ffff + dq 0x00000000000fffff, 0x00000000001fffff + dq 0x00000000003fffff, 0x00000000007fffff + dq 0x0000000000ffffff, 0x0000000001ffffff + dq 0x0000000003ffffff, 0x0000000007ffffff + dq 0x000000000fffffff, 0x000000001fffffff + dq 0x000000003fffffff, 0x000000007fffffff + dq 0x00000000ffffffff, 0x00000001ffffffff + dq 0x00000003ffffffff, 0x00000007ffffffff + dq 0x0000000fffffffff, 0x0000001fffffffff + dq 0x0000003fffffffff, 0x0000007fffffffff + dq 0x000000ffffffffff, 0x000001ffffffffff + dq 0x000003ffffffffff, 0x000007ffffffffff + dq 0x00000fffffffffff, 0x00001fffffffffff + dq 0x00003fffffffffff, 0x00007fffffffffff + dq 0x0000ffffffffffff, 0x0001ffffffffffff + 
dq 0x0003ffffffffffff, 0x0007ffffffffffff + dq 0x000fffffffffffff, 0x001fffffffffffff + dq 0x003fffffffffffff, 0x007fffffffffffff + dq 0x00ffffffffffffff, 0x01ffffffffffffff + dq 0x03ffffffffffffff, 0x07ffffffffffffff + dq 0x0fffffffffffffff, 0x1fffffffffffffff + dq 0x3fffffffffffffff, 0x7fffffffffffffff + dq 0xffffffffffffffff + +align 16 +initial_12_IV_counter: + dq 0x0000000000000000, 0x0100000000000000 + +mask_16_bytes: + dq 0x000000000000ffff + +section .text +default rel + +%ifdef LINUX +%define arg1 rdi +%else +%define arg1 rcx +%endif + +%define ZKEY0 zmm17 +%define ZKEY1 zmm18 +%define ZKEY2 zmm19 +%define ZKEY3 zmm20 +%define ZKEY4 zmm21 +%define ZKEY5 zmm22 +%define ZKEY6 zmm23 +%define ZKEY7 zmm24 +%define ZKEY8 zmm25 +%define ZKEY9 zmm26 +%define ZKEY10 zmm27 +%define ZKEY11 zmm28 +%define ZKEY12 zmm29 +%define ZKEY13 zmm30 +%define ZKEY14 zmm31 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Stack frame definition +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%ifidn __OUTPUT_FORMAT__, win64 + %define GP_STORAGE (7*8) ; space for 7 GP registers +%else + %define GP_STORAGE (5*8) ; space for 5 GP registers +%endif + +%define STACK_FRAME_SIZE GP_STORAGE + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; This macro is used to maintain the bits from the output text +;;; when writing out the output blocks, in case there are some bits +;;; that do not require encryption +%macro PRESERVE_BITS 12-13 +%define %%RBITS %1 ; [in] Remaining bits in last byte +%define %%LENGTH %2 ; [in] Length of the last set of blocks +%define %%CYPH_PLAIN_OUT %3 ; [in] Pointer to output buffer +%define %%ZIN_OUT %4 ; [in/out] ZMM with last set of output blocks +%define %%ZTMP0 %5 ; [clobbered] ZMM temporary +%define %%ZTMP1 %6 ; [clobbered] ZMM temporary +%define %%ZTMP2 %7 ; [clobbered] ZMM temporary +%define %%IA0 %8 ; [clobbered] GP temporary +%define %%IA1 %9 ; [clobbered] GP temporary +%define %%blocks_to_skip %10 ; [in] Number of blocks to skip from output +%define %%FULL_PARTIAL %11 ; [in] Last block type selection "full" or "partial" +%define %%MASKREG %12 ; [clobbered] Mask register +%define %%DATA_OFFSET %13 ; [in/out] Data offset +%define %%NUM_ARGS %0 + +;; offset = number of sets of 4 blocks to skip +%assign offset (((%%blocks_to_skip) / 4) * 64) +;; num_left_blocks = number of blocks in the last set +%assign num_left_blocks (((%%blocks_to_skip) & 3) + 1) ;; Range 1-4 blocks + +%if %%NUM_ARGS == 13 + ;; Load output to get last partial byte +%ifidn %%FULL_PARTIAL, partial + vmovdqu8 %%ZTMP0{%%MASKREG}, [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + offset] +%else + vmovdqu8 %%ZTMP0, [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + offset] +%endif ; %%FULL_PARTIAL == partial +%else + ;; Load output to get last partial byte (loading up to the last 4 blocks) + ZMM_LOAD_MASKED_BLOCKS_0_16 num_left_blocks, %%CYPH_PLAIN_OUT, offset, \ + %%ZTMP0, no_zmm, no_zmm, no_zmm, %%MASKREG +%endif ;; %%NUM_ARGS == 13 + + ;; Save RCX in temporary GP register + mov %%IA0, rcx + mov DWORD(%%IA1), 0xff + mov cl, BYTE(%%RBITS) + shr DWORD(%%IA1), cl ;; e.g. 3 remaining bits -> mask = 00011111 + mov rcx, %%IA0 + + vmovq XWORD(%%ZTMP1), %%IA1 + + ;; Get number of full bytes in last block. 
+ ;; Subtracting the bytes in the blocks to skip to the length of whole + ;; set of blocks gives us the number of bytes in the last block, + ;; but the last block has a partial byte at the end, so an extra byte + ;; needs to be subtracted + mov %%IA1, %%LENGTH + sub %%IA1, (%%blocks_to_skip * 16 + 1) + XVPSLLB XWORD(%%ZTMP1), %%IA1, XWORD(%%ZTMP2), %%IA0 +%if num_left_blocks == 4 + vshufi64x2 %%ZTMP1, %%ZTMP1, %%ZTMP1, 0x15 +%elif num_left_blocks == 3 + vshufi64x2 %%ZTMP1, %%ZTMP1, %%ZTMP1, 0x45 +%elif num_left_blocks == 2 + vshufi64x2 %%ZTMP1, %%ZTMP1, %%ZTMP1, 0x51 +%endif ;; No need to shift if there is only one block + + ;; At this point, ZTMP1 contains a mask with all 0s, but with some ones + ;; in the partial byte + + ;; First, clear the last bits (not to be ciphered) of the last output block + ;; %%ZIN_OUT = %%ZIN_OUT AND NOT %%ZTMP1 (0x50 = andA!C) + vpternlogq %%ZIN_OUT, %%ZTMP1, %%ZTMP1, 0x50 + + ;; Then, set these last bits to the last bits coming from the output + ;; %%ZIN_OUT = %%ZIN_OUT OR (%%ZTMP0 AND %%ZTMP1) (0xF8 = orAandBC) + vpternlogq %%ZIN_OUT, %%ZTMP0, %%ZTMP1, 0xF8 + +%endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; This macro is used to "warm-up" pipeline for ENCRYPT_16_PARALLEL +;;; macro code. It is called only for data lengths 256 and above. +;;; The flow is as follows: +;;; - encrypt the initial %%num_initial_blocks blocks (can be 0) +;;; - encrypt the next 16 blocks +;;; - the last 16th block can be partial (lengths between 257 and 367) +;;; - partial block ciphering is handled within this macro + +%macro INITIAL_BLOCKS 26 +%define %%KEY %1 ; [in] pointer to key +%define %%CYPH_PLAIN_OUT %2 ; [in] output buffer +%define %%PLAIN_CYPH_IN %3 ; [in] input buffer +%define %%LENGTH %4 ; [in/out] number of bytes to process +%define %%DATA_OFFSET %5 ; [in/out] data offset +%define %%num_initial_blocks %6 ; [in] can be between 0 and 15 +%define %%CTR %7 ; [in] XMM first counter block +%define %%CTR_1_4 %8 ; [out] ZMM next 1-4 counter blocks +%define %%CTR_5_8 %9 ; [out] ZMM next 5-8 counter blocks +%define %%CTR_9_12 %10 ; [out] ZMM next 9-12 counter blocks +%define %%CTR_13_16 %11 ; [out] ZMM next 13-16 counter blocks +%define %%ZT1 %12 ; [clobbered] ZMM temporary +%define %%ZT2 %13 ; [clobbered] ZMM temporary +%define %%ZT3 %14 ; [clobbered] ZMM temporary +%define %%ZT4 %15 ; [clobbered] ZMM temporary +%define %%ZT5 %16 ; [clobbered] ZMM temporary +%define %%ZT6 %17 ; [clobbered] ZMM temporary +%define %%ZT7 %18 ; [clobbered] ZMM temporary +%define %%ZT8 %19 ; [clobbered] ZMM temporary +%define %%IA0 %20 ; [clobbered] GP temporary +%define %%IA1 %21 ; [clobbered] GP temporary +%define %%MASKREG %22 ; [clobbered] mask register +%define %%SHUFREG %23 ; [in] ZMM register with shuffle mask +%define %%NROUNDS %24 ; [in] number of rounds; numerical value +%define %%CNTR_TYPE %25 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT) +%define %%RBITS %26 ; [in] Number of remaining bits in last byte + +%define %%T1 XWORD(%%ZT1) +%define %%T2 XWORD(%%ZT2) +%define %%T3 XWORD(%%ZT3) +%define %%T4 XWORD(%%ZT4) +%define %%T5 XWORD(%%ZT5) +%define %%T6 XWORD(%%ZT6) +%define %%T7 XWORD(%%ZT7) +%define %%T8 XWORD(%%ZT8) + +%ifidn %%CNTR_TYPE, CNTR +%define %%VPADD vpaddd +%else +%define %%VPADD vpaddq +%endif + +%if %%num_initial_blocks > 0 + ;; load plain/cipher text + ZMM_LOAD_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, 0, \ + %%ZT5, %%ZT6, %%ZT7, %%ZT8, load_4_instead_of_3 + + ;; prepare AES counter blocks +%if 
%%num_initial_blocks > 1 +%if %%num_initial_blocks == 2 + vshufi64x2 YWORD(%%ZT1), YWORD(%%CTR), YWORD(%%CTR), 0 + %%VPADD YWORD(%%ZT1), YWORD(%%ZT1), [rel ddq_add_0_3] +%elif %%num_initial_blocks <= 4 + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3] +%elif %%num_initial_blocks <= 8 + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3] + %%VPADD %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7] +%elif %%num_initial_blocks <= 12 + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3] + %%VPADD %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7] + %%VPADD %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11] +%else + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + %%VPADD %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3] + %%VPADD %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7] + %%VPADD %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11] + %%VPADD %%ZT4, ZWORD(%%CTR), [rel ddq_add_12_15] +%endif +%endif + + ;; extract new counter value (%%T1) + ;; shuffle the counters for AES rounds +%if %%num_initial_blocks == 1 + vpshufb %%T1, %%CTR, XWORD(%%SHUFREG) +%elif %%num_initial_blocks == 2 + vextracti32x4 %%CTR, YWORD(%%ZT1), 1 + vpshufb YWORD(%%ZT1), YWORD(%%SHUFREG) +%elif %%num_initial_blocks <= 4 + vextracti32x4 %%CTR, %%ZT1, (%%num_initial_blocks - 1) + vpshufb %%ZT1, %%SHUFREG +%elif %%num_initial_blocks == 5 + vmovdqa64 %%CTR, %%T2 + vpshufb %%ZT1, %%SHUFREG + vpshufb %%T2, XWORD(%%SHUFREG) +%elif %%num_initial_blocks == 6 + vextracti32x4 %%CTR, YWORD(%%ZT2), 1 + vpshufb %%ZT1, %%SHUFREG + vpshufb YWORD(%%ZT2), YWORD(%%SHUFREG) +%elif %%num_initial_blocks = 7 + vextracti32x4 %%CTR, %%ZT2, 2 + vpshufb %%ZT1, %%SHUFREG + vpshufb %%ZT2, %%SHUFREG +%elif %%num_initial_blocks = 8 + vextracti32x4 %%CTR, %%ZT2, 3 + vpshufb %%ZT1, %%SHUFREG + vpshufb %%ZT2, %%SHUFREG +%elif %%num_initial_blocks = 9 + vmovdqa64 %%CTR, %%T3 + vpshufb %%ZT1, %%SHUFREG + vpshufb %%ZT2, %%SHUFREG + vpshufb %%T3, XWORD(%%SHUFREG) +%elif %%num_initial_blocks = 10 + vextracti32x4 %%CTR, YWORD(%%ZT3), 1 + vpshufb %%ZT1, %%SHUFREG + vpshufb %%ZT2, %%SHUFREG + vpshufb YWORD(%%ZT3), YWORD(%%SHUFREG) +%elif %%num_initial_blocks = 11 + vextracti32x4 %%CTR, %%ZT3, 2 + vpshufb %%ZT1, %%SHUFREG + vpshufb %%ZT2, %%SHUFREG + vpshufb %%ZT3, %%SHUFREG +%elif %%num_initial_blocks = 12 + vextracti32x4 %%CTR, %%ZT3, 3 + vpshufb %%ZT1, %%SHUFREG + vpshufb %%ZT2, %%SHUFREG + vpshufb %%ZT3, %%SHUFREG +%elif %%num_initial_blocks = 13 + vmovdqa64 %%CTR, %%T4 + vpshufb %%ZT1, %%SHUFREG + vpshufb %%ZT2, %%SHUFREG + vpshufb %%ZT3, %%SHUFREG + vpshufb %%T4, XWORD(%%SHUFREG) +%elif %%num_initial_blocks = 14 + vextracti32x4 %%CTR, YWORD(%%ZT4), 1 + vpshufb %%ZT1, %%SHUFREG + vpshufb %%ZT2, %%SHUFREG + vpshufb %%ZT3, %%SHUFREG + vpshufb YWORD(%%ZT4), YWORD(%%SHUFREG) +%elif %%num_initial_blocks = 15 + vextracti32x4 %%CTR, %%ZT4, 2 + vpshufb %%ZT1, %%SHUFREG + vpshufb %%ZT2, %%SHUFREG + vpshufb %%ZT3, %%SHUFREG + vpshufb %%ZT4, %%SHUFREG +%endif + + ;; AES rounds and XOR with plain/cipher text +%assign j 0 +%rep (%%NROUNDS + 2) + ZMM_AESENC_ROUND_BLOCKS_0_16 \ + %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \ + %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%num_initial_blocks, \ + %%NROUNDS +%assign j (j + 1) +%endrep + + ;; write cipher/plain text back to output + ZMM_STORE_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, 0, \ + %%ZT1, %%ZT2, %%ZT3, %%ZT4 + + ;; adjust data offset and length + sub %%LENGTH, (%%num_initial_blocks * 16) + add %%DATA_OFFSET, 
(%%num_initial_blocks * 16) +%endif ; %%num_initial_blocks > 0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; - cipher of %%num_initial_blocks is done + ;; - prepare counter blocks for the next 16 blocks (ZT5-ZT8) + ;; - shuffle the blocks for AES + ;; - encrypt the next 16 blocks + + ;; get text load/store mask (assume full mask by default) + mov %%IA0, 0xffff_ffff_ffff_ffff +%if %%num_initial_blocks > 0 + ;; NOTE: 'jge' is always taken for %%num_initial_blocks = 0 + ;; This macro is executed for length 256 and up, + ;; zero length is checked in CNTR_ENC_DEC. + ;; We know there is partial block if: + ;; LENGTH - 16*num_initial_blocks < 256 + cmp %%LENGTH, 256 + jge %%_initial_partial_block_continue + mov %%IA1, rcx + mov rcx, 256 + sub rcx, %%LENGTH + shr %%IA0, cl + mov rcx, %%IA1 +%%_initial_partial_block_continue: +%endif + kmovq %%MASKREG, %%IA0 + ;; load plain or cipher text + vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vmovdqu8 %%ZT6, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64] + vmovdqu8 %%ZT7, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 128] + vmovdqu8 %%ZT8{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 192] + + ;; prepare next counter blocks + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 +%if %%num_initial_blocks > 0 + vpaddd %%CTR_1_4, ZWORD(%%CTR), [rel ddq_add_1_4] + vpaddd %%CTR_5_8, ZWORD(%%CTR), [rel ddq_add_5_8] + vpaddd %%CTR_9_12, ZWORD(%%CTR), [rel ddq_add_9_12] + vpaddd %%CTR_13_16, ZWORD(%%CTR), [rel ddq_add_13_16] +%else + vpaddd %%CTR_1_4, ZWORD(%%CTR), [rel ddq_add_0_3] + vpaddd %%CTR_5_8, ZWORD(%%CTR), [rel ddq_add_4_7] + vpaddd %%CTR_9_12, ZWORD(%%CTR), [rel ddq_add_8_11] + vpaddd %%CTR_13_16, ZWORD(%%CTR), [rel ddq_add_12_15] +%endif + + vpshufb %%ZT1, %%CTR_1_4, %%SHUFREG + vpshufb %%ZT2, %%CTR_5_8, %%SHUFREG + vpshufb %%ZT3, %%CTR_9_12, %%SHUFREG + vpshufb %%ZT4, %%CTR_13_16, %%SHUFREG + + ;; AES rounds and XOR with plain/cipher text +%assign j 0 +%rep (%%NROUNDS + 2) + ZMM_AESENC_ROUND_BLOCKS_0_16 \ + %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \ + %%ZT5, %%ZT6, %%ZT7, %%ZT8, 16, %%NROUNDS +%assign j (j + 1) +%endrep + +%ifidn %%CNTR_TYPE, CNTR_BIT + ;; check if this is the end of the message + cmp %%LENGTH, 256 + jg %%store_output + ;; Check if there is a partial byte + or %%RBITS, %%RBITS + jz %%store_output + + ;; Copy the bits that are not ciphered from the output text, + ;; into the last bits of the output block, before writing it out + PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%IA0, %%IA1, 15, partial, %%MASKREG, %%DATA_OFFSET + +%endif + +%%store_output: + ;; write cipher/plain text back to output + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 128], %%ZT3 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 192]{%%MASKREG}, %%ZT4 + + ;; check if there is partial block + cmp %%LENGTH, 256 + jl %%_initial_partial_done + ;; adjust offset and length + add %%DATA_OFFSET, 256 + sub %%LENGTH, 256 + jmp %%_initial_blocks_done +%%_initial_partial_done: + ;; zero the length (all encryption is complete) + xor %%LENGTH, %%LENGTH +%%_initial_blocks_done: + +%endmacro ; INITIAL_BLOCKS +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block. 
+;;; It may look similar to INITIAL_BLOCKS but its usage is different: +;;; - It is not meant to cipher counter blocks for the main by16 loop. +;;; Just ciphers amount of blocks. +;;; - Small packets (<256 bytes) +;;; +;;; num_initial_blocks is expected to include the partial final block +;;; in the count. +%macro INITIAL_BLOCKS_PARTIAL 21 +%define %%KEY %1 ; [in] key pointer +%define %%CYPH_PLAIN_OUT %2 ; [in] text out pointer +%define %%PLAIN_CYPH_IN %3 ; [in] text out pointer +%define %%LENGTH %4 ; [in/clobbered] length in bytes +%define %%num_initial_blocks %5 ; [in] can be from 1 to 16 (not 0) +%define %%CTR %6 ; [in/out] current counter value +%define %%ZT1 %7 ; [clobbered] ZMM temporary +%define %%ZT2 %8 ; [clobbered] ZMM temporary +%define %%ZT3 %9 ; [clobbered] ZMM temporary +%define %%ZT4 %10 ; [clobbered] ZMM temporary +%define %%ZT5 %11 ; [clobbered] ZMM temporary +%define %%ZT6 %12 ; [clobbered] ZMM temporary +%define %%ZT7 %13 ; [clobbered] ZMM temporary +%define %%ZT8 %14 ; [clobbered] ZMM temporary +%define %%IA0 %15 ; [clobbered] GP temporary +%define %%IA1 %16 ; [clobbered] GP temporary +%define %%MASKREG %17 ; [clobbered] mask register +%define %%SHUFREG %18 ; [in] ZMM register with shuffle mask +%define %%NROUNDS %19 ; [in] number of rounds; numerical value +%define %%CNTR_TYPE %20 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT) +%define %%RBITS %21 ; [in] Number of remaining bits in last byte + +%define %%T1 XWORD(%%ZT1) +%define %%T2 XWORD(%%ZT2) +%define %%T3 XWORD(%%ZT3) +%define %%T4 XWORD(%%ZT4) +%define %%T5 XWORD(%%ZT5) +%define %%T6 XWORD(%%ZT6) +%define %%T7 XWORD(%%ZT7) +%define %%T8 XWORD(%%ZT8) + + ;; get load/store mask + lea %%IA0, [rel byte64_len_to_mask_table] + mov %%IA1, %%LENGTH +%if %%num_initial_blocks > 12 + sub %%IA1, 192 +%elif %%num_initial_blocks > 8 + sub %%IA1, 128 +%elif %%num_initial_blocks > 4 + sub %%IA1, 64 +%endif + kmovq %%MASKREG, [%%IA0 + %%IA1*8] + + ;; load plain/cipher text + ZMM_LOAD_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, 0, \ + %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%MASKREG + + ;; prepare AES counter blocks +%if %%num_initial_blocks == 1 + vmovdqa64 XWORD(%%ZT1), XWORD(%%CTR) +%elif %%num_initial_blocks == 2 + vshufi64x2 YWORD(%%ZT1), YWORD(%%CTR), YWORD(%%CTR), 0 + vpaddd YWORD(%%ZT1), YWORD(%%ZT1), [rel ddq_add_0_3] +%elif %%num_initial_blocks <= 4 + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3] +%elif %%num_initial_blocks <= 8 + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3] + vpaddd %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7] +%elif %%num_initial_blocks <= 12 + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3] + vpaddd %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7] + vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11] +%else + vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0 + vpaddd %%ZT1, ZWORD(%%CTR), [rel ddq_add_0_3] + vpaddd %%ZT2, ZWORD(%%CTR), [rel ddq_add_4_7] + vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_8_11] + vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_12_15] +%endif + + ;; shuffle the counters for AES rounds + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \ + %%ZT1, %%ZT2, %%ZT3, %%ZT4, \ + %%ZT1, %%ZT2, %%ZT3, %%ZT4, \ + %%SHUFREG, %%SHUFREG, %%SHUFREG, %%SHUFREG + + ;; AES rounds and XOR with plain/cipher text +%assign j 0 +%rep (%%NROUNDS + 2) + ZMM_AESENC_ROUND_BLOCKS_0_16 \ + %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \ + 
%%ZT5, %%ZT6, %%ZT7, %%ZT8, %%num_initial_blocks, \ + %%NROUNDS +%assign j (j + 1) +%endrep + +%ifidn %%CNTR_TYPE, CNTR_BIT + ;; Check if there is a partial byte + or %%RBITS, %%RBITS + jz %%store_output + + ;; Copy the bits that are not ciphered from the output text, + ;; into the last bits of the output block, before writing it out +%if %%num_initial_blocks <= 4 + PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT1, %%ZT5, %%ZT6, %%ZT7, \ + %%IA0, %%IA1, (%%num_initial_blocks - 1), \ + partial, %%MASKREG +%elif %%num_initial_blocks <= 8 + PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT2, %%ZT5, %%ZT6, %%ZT7, \ + %%IA0, %%IA1, (%%num_initial_blocks - 1), \ + partial, %%MASKREG +%elif %%num_initial_blocks <= 12 + PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT3, %%ZT5, %%ZT6, %%ZT7, \ + %%IA0, %%IA1, (%%num_initial_blocks - 1), \ + partial, %%MASKREG +%else + PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%IA0, %%IA1, (%%num_initial_blocks - 1), \ + partial, %%MASKREG +%endif + +%endif + +%%store_output: + ;; write cipher/plain text back to output + ZMM_STORE_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, 0, \ + %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%MASKREG + +%endmacro ; INITIAL_BLOCKS_PARTIAL + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Main CNTR macro +;;; - operates on single stream +;;; - encrypts 16 blocks at a time +%macro ENCRYPT_16_PARALLEL 26 +%define %%KEY %1 ; [in] key pointer +%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer +%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer +%define %%DATA_OFFSET %4 ; [in] data offset +%define %%CTR_1_4 %5 ; [in/out] ZMM next 1-4 counter blocks +%define %%CTR_5_8 %6 ; [in/out] ZMM next 5-8 counter blocks +%define %%CTR_9_12 %7 ; [in/out] ZMM next 9-12 counter blocks +%define %%CTR_13_16 %8 ; [in/out] ZMM next 13-16 counter blocks +%define %%FULL_PARTIAL %9 ; [in] last block type selection "full" or "partial" +%define %%IA0 %10 ; [clobbered] temporary GP register +%define %%IA1 %11 ; [clobbered] temporary GP register +%define %%LENGTH %12 ; [in] length +%define %%ZT1 %13 ; [clobbered] temporary ZMM (cipher) +%define %%ZT2 %14 ; [clobbered] temporary ZMM (cipher) +%define %%ZT3 %15 ; [clobbered] temporary ZMM (cipher) +%define %%ZT4 %16 ; [clobbered] temporary ZMM (cipher) +%define %%ZT5 %17 ; [clobbered] temporary ZMM (cipher) +%define %%ZT6 %18 ; [clobbered] temporary ZMM (cipher) +%define %%ZT7 %19 ; [clobbered] temporary ZMM (cipher) +%define %%ZT8 %20 ; [clobbered] temporary ZMM (cipher) +%define %%MASKREG %21 ; [clobbered] mask register for partial loads/stores +%define %%SHUFREG %22 ; [in] ZMM register with shuffle mask +%define %%ADD8REG %23 ; [in] ZMM register with increment by 8 mask +%define %%NROUNDS %24 ; [in] number of rounds; numerical value +%define %%CNTR_TYPE %25 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT) +%define %%RBITS %26 ; [in] Number of remaining bits in last byte + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; load/store mask (partial case) and load the text data +%ifidn %%FULL_PARTIAL, full + vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vmovdqu8 %%ZT6, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64] + vmovdqu8 %%ZT7, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 128] + vmovdqu8 %%ZT8, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 192] +%else + lea %%IA0, [rel byte64_len_to_mask_table] + mov %%IA1, %%LENGTH + sub %%IA1, (3*64) + kmovq %%MASKREG, [%%IA0 + 8*%%IA1] + vmovdqu8 %%ZT5, 
[%%PLAIN_CYPH_IN + %%DATA_OFFSET] + vmovdqu8 %%ZT6, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64] + vmovdqu8 %%ZT7, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 128] + vmovdqu8 %%ZT8{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 192] +%endif + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; populate counter blocks + ;; %%CTR is shuffled outside the scope of this macro + ;; it has to be kept in unshuffled form + vpaddd %%CTR_1_4, %%CTR_1_4, %%ADD8REG + vpaddd %%CTR_5_8, %%CTR_5_8, %%ADD8REG + vpaddd %%CTR_9_12, %%CTR_9_12, %%ADD8REG + vpaddd %%CTR_13_16, %%CTR_13_16, %%ADD8REG + vpshufb %%ZT1, %%CTR_1_4, %%SHUFREG + vpshufb %%ZT2, %%CTR_5_8, %%SHUFREG + vpshufb %%ZT3, %%CTR_9_12, %%SHUFREG + vpshufb %%ZT4, %%CTR_13_16, %%SHUFREG + +%assign j 0 +%rep (%%NROUNDS + 2) + ZMM_AESENC_ROUND_BLOCKS_0_16 \ + %%ZT1, %%ZT2, %%ZT3, %%ZT4, ZKEY %+ j, j, \ + %%ZT5, %%ZT6, %%ZT7, %%ZT8, 16, %%NROUNDS +%assign j (j + 1) +%endrep + +%ifidn %%CNTR_TYPE, CNTR_BIT + ;; Check if this is the last round + cmp %%LENGTH, 256 + jg %%store_output + ;; Check if there is a partial byte + or %%RBITS, %%RBITS + jz %%store_output + + ;; Copy the bits that are not ciphered from the output text, + ;; into the last bits of the output block, before writing it out + PRESERVE_BITS %%RBITS, %%LENGTH, %%CYPH_PLAIN_OUT, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%IA0, %%IA1, 15, %%FULL_PARTIAL, %%MASKREG, %%DATA_OFFSET + +%endif + +%%store_output: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; store the text data +%ifidn %%FULL_PARTIAL, full + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 128], %%ZT3 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 192], %%ZT4 +%else + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 128], %%ZT3 + vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 192]{%%MASKREG}, %%ZT4 +%endif + +%endmacro ; ENCRYPT_16_PARALLEL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Save register content for the caller +%macro FUNC_SAVE 1 +%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + mov rax, rsp + + sub rsp, STACK_FRAME_SIZE + and rsp, ~63 + + mov [rsp + 0*8], r12 + mov [rsp + 1*8], r13 +%ifidn %%CNTR_TYPE, CNTR_BIT + mov [rsp + 2*8], r14 +%endif + mov [rsp + 3*8], rax ; stack +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + 4*8], rdi + mov [rsp + 5*8], rsi +%endif + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Restore register content for the caller +%macro FUNC_RESTORE 1 +%define %%CNTR_TYPE %1 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + + vzeroupper +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [rsp + 4*8] + mov rsi, [rsp + 5*8] +%endif + mov r12, [rsp + 0*8] + mov r13, [rsp + 1*8] +%ifidn %%CNTR_TYPE, CNTR_BIT + mov r14, [rsp + 2*8] +%endif + mov rsp, [rsp + 3*8] ; stack +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; Cipher payloads shorter than 256 bytes +;;; - number of blocks in the message comes as argument +;;; - depending on the number of blocks an optimized variant of +;;; INITIAL_BLOCKS_PARTIAL is invoked +%macro CNTR_ENC_DEC_SMALL 21 +%define %%KEY %1 ; [in] key pointer +%define %%CYPH_PLAIN_OUT %2 ; [in] output buffer +%define %%PLAIN_CYPH_IN %3 ; [in] input buffer +%define %%LENGTH %4 ; [in] data length +%define 
%%NUM_BLOCKS %5 ; [in] number of blocks to process 1 to 8 +%define %%CTR %6 ; [in/out] XMM counter block +%define %%ZTMP1 %7 ; [clobbered] ZMM register +%define %%ZTMP2 %8 ; [clobbered] ZMM register +%define %%ZTMP3 %9 ; [clobbered] ZMM register +%define %%ZTMP4 %10 ; [clobbered] ZMM register +%define %%ZTMP5 %11 ; [clobbered] ZMM register +%define %%ZTMP6 %12 ; [clobbered] ZMM register +%define %%ZTMP7 %13 ; [clobbered] ZMM register +%define %%ZTMP8 %14 ; [clobbered] ZMM register +%define %%IA0 %15 ; [clobbered] GP register +%define %%IA1 %16 ; [clobbered] GP register +%define %%MASKREG %17 ; [clobbered] mask register +%define %%SHUFREG %18 ; [in] ZMM register with shuffle mask +%define %%NROUNDS %19 ; [in] number of rounds; numerical value +%define %%CNTR_TYPE %20 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT) +%define %%RBITS %21 ; [in] Number of remaining bits in last byte + + cmp %%NUM_BLOCKS, 8 + je %%_small_initial_num_blocks_is_8 + jl %%_small_initial_blocks_is_1_7 + + ; Initial blocks 9-16 + cmp %%NUM_BLOCKS, 12 + je %%_small_initial_num_blocks_is_12 + jl %%_small_initial_blocks_is_9_11 + + ; Initial blocks 13-16 + cmp %%NUM_BLOCKS, 16 + je %%_small_initial_num_blocks_is_16 + cmp %%NUM_BLOCKS, 15 + je %%_small_initial_num_blocks_is_15 + cmp %%NUM_BLOCKS, 14 + je %%_small_initial_num_blocks_is_14 + cmp %%NUM_BLOCKS, 13 + je %%_small_initial_num_blocks_is_13 + +%%_small_initial_blocks_is_9_11: + cmp %%NUM_BLOCKS, 11 + je %%_small_initial_num_blocks_is_11 + cmp %%NUM_BLOCKS, 10 + je %%_small_initial_num_blocks_is_10 + cmp %%NUM_BLOCKS, 9 + je %%_small_initial_num_blocks_is_9 + +%%_small_initial_blocks_is_1_7: + cmp %%NUM_BLOCKS, 4 + je %%_small_initial_num_blocks_is_4 + jl %%_small_initial_blocks_is_1_3 + + ; Initial blocks 5-7 + cmp %%NUM_BLOCKS, 7 + je %%_small_initial_num_blocks_is_7 + cmp %%NUM_BLOCKS, 6 + je %%_small_initial_num_blocks_is_6 + cmp %%NUM_BLOCKS, 5 + je %%_small_initial_num_blocks_is_5 + +%%_small_initial_blocks_is_1_3: + cmp %%NUM_BLOCKS, 3 + je %%_small_initial_num_blocks_is_3 + cmp %%NUM_BLOCKS, 2 + je %%_small_initial_num_blocks_is_2 + + jmp %%_small_initial_num_blocks_is_1 + + +%%_small_initial_num_blocks_is_16: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 16, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_15: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 15, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_14: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 14, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_13: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 13, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_12: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, 
\ + %%PLAIN_CYPH_IN, %%LENGTH, 12, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_11: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 11, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_10: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 10, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_9: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 9, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted +%%_small_initial_num_blocks_is_8: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 8, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_7: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 7, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_6: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 6, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_5: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 5, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_4: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 4, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_3: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 3, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + +%%_small_initial_num_blocks_is_2: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 2, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + jmp %%_small_initial_blocks_encrypted + 
+%%_small_initial_num_blocks_is_1: + INITIAL_BLOCKS_PARTIAL %%KEY, %%CYPH_PLAIN_OUT, \ + %%PLAIN_CYPH_IN, %%LENGTH, 1, \ + %%CTR, \ + %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, \ + %%ZTMP6, %%ZTMP7, %%ZTMP8, \ + %%IA0, %%IA1, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS +%%_small_initial_blocks_encrypted: + +%endmacro ; CNTR_ENC_DEC_SMALL + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CNTR_ENC_DEC Encodes/Decodes given data. +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: job structure and number of AES rounds +; Output: job structure +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CNTR_ENC_DEC 3 +%define %%JOB %1 ; [in/out] job +%define %%NROUNDS %2 ; [in] number of rounds; numerical value +%define %%CNTR_TYPE %3 ; [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + +%define %%KEY rax +%define %%CYPH_PLAIN_OUT rdx +%define %%PLAIN_CYPH_IN r8 +%define %%LENGTH r9 +%define %%DATA_OFFSET r13 +%define %%RBITS r14 + +%define %%IA0 r10 +%define %%IA1 r11 +%define %%IA2 r12 + +%define %%CTR_BLOCKx xmm0 +%define %%CTR_BLOCK_1_4 zmm1 +%define %%CTR_BLOCK_5_8 zmm2 +%define %%CTR_BLOCK_9_12 zmm3 +%define %%CTR_BLOCK_13_16 zmm4 + +%define %%ZTMP0 zmm5 +%define %%ZTMP1 zmm6 +%define %%ZTMP2 zmm7 +%define %%ZTMP3 zmm8 +%define %%ZTMP4 zmm9 +%define %%ZTMP5 zmm10 +%define %%ZTMP6 zmm11 +%define %%ZTMP7 zmm12 +%define %%SHUFREG zmm13 +%define %%ADD8REG zmm14 + +%define %%MASKREG k1 + +;;; Macro flow: +;;; - calculate the number of 16byte blocks in the message +;;; - process (number of 16byte blocks) mod 16 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +;;; - process 16x16 byte blocks at a time until all are done in %%_encrypt_by_16_new + + mov %%LENGTH, [%%JOB + _msg_len_to_cipher] + ;; calculate len + ;; convert bits to bytes (message length in bits for CNTR_BIT) +%ifidn %%CNTR_TYPE, CNTR_BIT + mov %%RBITS, %%LENGTH + add %%LENGTH, 7 + shr %%LENGTH, 3 ; LENGTH will hold number of bytes (including partial byte) + and %%RBITS, 7 ; Get remainder bits in last byte (0-7) +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + cmp %%LENGTH, 0 +%else + or %%LENGTH, %%LENGTH +%endif + je %%_enc_dec_done + + xor %%DATA_OFFSET, %%DATA_OFFSET + + mov %%PLAIN_CYPH_IN, [%%JOB + _src] + add %%PLAIN_CYPH_IN, [%%JOB + _cipher_start_src_offset_in_bytes] + mov %%CYPH_PLAIN_OUT, [%%JOB + _dst] + mov %%KEY, [%%JOB + _aes_enc_key_expanded] + + ;; Prepare round keys (only first 10, due to lack of registers) +%assign i 0 +%rep (%%NROUNDS + 2) + vbroadcastf64x2 ZKEY %+ i, [%%KEY + 16*i] +%assign i (i + 1) +%endrep + + mov %%IA1, [%%JOB + _iv] +%ifidn %%CNTR_TYPE, CNTR + ;; Prepare initial mask to read 12 IV bytes + mov %%IA0, 0x0000_0000_0000_0fff + vmovdqa %%CTR_BLOCKx, [rel initial_12_IV_counter] + mov %%IA2, [%%JOB + _iv_len_in_bytes] + test %%IA2, 16 + ;; Set mask to read 16 IV bytes if iv_len = 16 + cmovnz %%IA0, [rel mask_16_bytes] + + kmovq %%MASKREG, %%IA0 + vmovdqu8 %%CTR_BLOCKx{%%MASKREG}, [%%IA1] +%else ;; CNTR_BIT + ;; Read the full 16 bytes of IV + vmovdqu8 %%CTR_BLOCKx, [%%IA1] +%endif ;; CNTR/CNTR_BIT + + vmovdqa64 %%SHUFREG, [rel SHUF_MASK] + ;; store IV as counter in LE format + vpshufb %%CTR_BLOCKx, XWORD(%%SHUFREG) + + ;; Determine how many blocks to process in INITIAL + mov %%IA1, %%LENGTH + shr %%IA1, 4 + and %%IA1, 0xf + + ;; Process one additional block in INITIAL if there is a partial block + mov %%IA0, %%LENGTH + and %%IA0, 0xf + add %%IA0, 
0xf + shr %%IA0, 4 + add %%IA1, %%IA0 + ;; %%IA1 can be in the range from 0 to 16 + + ;; Less than 256B will be handled by the small message code, which + ;; can process up to 16 x blocks (16 bytes each) + cmp %%LENGTH, 256 + jge %%_large_message_path + + CNTR_ENC_DEC_SMALL \ + %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, \ + %%IA1, %%CTR_BLOCKx, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%IA0, %%IA2, %%MASKREG, %%SHUFREG, %%NROUNDS, \ + %%CNTR_TYPE, %%RBITS + + jmp %%_enc_dec_done + +%%_large_message_path: + ;; Still, don't allow 16 INITIAL blocks since this will + ;; can be handled by the x16 partial loop. + and %%IA1, 0xf + je %%_initial_num_blocks_is_0 + cmp %%IA1, 15 + je %%_initial_num_blocks_is_15 + cmp %%IA1, 14 + je %%_initial_num_blocks_is_14 + cmp %%IA1, 13 + je %%_initial_num_blocks_is_13 + cmp %%IA1, 12 + je %%_initial_num_blocks_is_12 + cmp %%IA1, 11 + je %%_initial_num_blocks_is_11 + cmp %%IA1, 10 + je %%_initial_num_blocks_is_10 + cmp %%IA1, 9 + je %%_initial_num_blocks_is_9 + cmp %%IA1, 8 + je %%_initial_num_blocks_is_8 + cmp %%IA1, 7 + je %%_initial_num_blocks_is_7 + cmp %%IA1, 6 + je %%_initial_num_blocks_is_6 + cmp %%IA1, 5 + je %%_initial_num_blocks_is_5 + cmp %%IA1, 4 + je %%_initial_num_blocks_is_4 + cmp %%IA1, 3 + je %%_initial_num_blocks_is_3 + cmp %%IA1, 2 + je %%_initial_num_blocks_is_2 + jmp %%_initial_num_blocks_is_1 + + and %%IA1, 0xf + je %%_initial_num_blocks_is_0 + + cmp %%IA1, 8 + je %%_initial_num_blocks_is_8 + jl %%_initial_blocks_is_1_7 + + ; Initial blocks 9-15 + cmp %%IA1, 12 + je %%_initial_num_blocks_is_12 + jl %%_initial_blocks_is_9_11 + + ; Initial blocks 13-15 + cmp %%IA1, 15 + je %%_initial_num_blocks_is_15 + cmp %%IA1, 14 + je %%_initial_num_blocks_is_14 + cmp %%IA1, 13 + je %%_initial_num_blocks_is_13 + +%%_initial_blocks_is_9_11: + cmp %%IA1, 11 + je %%_initial_num_blocks_is_11 + cmp %%IA1, 10 + je %%_initial_num_blocks_is_10 + cmp %%IA1, 9 + je %%_initial_num_blocks_is_9 + +%%_initial_blocks_is_1_7: + cmp %%IA1, 4 + je %%_initial_num_blocks_is_4 + jl %%_initial_blocks_is_1_3 + + ; Initial blocks 5-7 + cmp %%IA1, 7 + je %%_initial_num_blocks_is_7 + cmp %%IA1, 6 + je %%_initial_num_blocks_is_6 + cmp %%IA1, 5 + je %%_initial_num_blocks_is_5 + +%%_initial_blocks_is_1_3: + cmp %%IA1, 3 + je %%_initial_num_blocks_is_3 + cmp %%IA1, 2 + je %%_initial_num_blocks_is_2 + + jmp %%_initial_num_blocks_is_1 + +%%_initial_num_blocks_is_15: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 15, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_14: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 14, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_13: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 13, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + 
%%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_12: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 12, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_11: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 11, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_10: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 10, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_9: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 9, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_8: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 8, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_7: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 7, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_6: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 6, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_5: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 5, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_4: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 4, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, 
%%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_3: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 3, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_2: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 2, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_1: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 1, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_0: + INITIAL_BLOCKS %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%LENGTH, %%DATA_OFFSET, 0, %%CTR_BLOCKx, \ + %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, %%CTR_BLOCK_9_12, \ + %%CTR_BLOCK_13_16, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \ + %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, %%IA0, %%IA1, %%MASKREG, \ + %%SHUFREG, %%NROUNDS, %%CNTR_TYPE, %%RBITS + +%%_initial_blocks_encrypted: + or %%LENGTH, %%LENGTH + je %%_enc_dec_done + + vmovdqa64 %%ADD8REG, [rel ddq_add_16] + ;; Process 15 full blocks plus a partial block + cmp %%LENGTH, 256 + jl %%_encrypt_by_16_partial + +%%_encrypt_by_16: + ENCRYPT_16_PARALLEL %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%DATA_OFFSET, %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, \ + %%CTR_BLOCK_9_12, %%CTR_BLOCK_13_16, \ + full, %%IA0, %%IA1, %%LENGTH, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%MASKREG, %%SHUFREG, %%ADD8REG, %%NROUNDS, %%CNTR_TYPE, \ + %%RBITS + add %%DATA_OFFSET, 256 + sub %%LENGTH, 256 + cmp %%LENGTH, 256 + jge %%_encrypt_by_16 + +%%_encrypt_by_16_done: + ;; Test to see if we need a by 16 with partial block. At this point + ;; bytes remaining should be either zero or between 241-255. 
+ or %%LENGTH, %%LENGTH + je %%_enc_dec_done + +%%_encrypt_by_16_partial: + + ENCRYPT_16_PARALLEL %%KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \ + %%DATA_OFFSET, %%CTR_BLOCK_1_4, %%CTR_BLOCK_5_8, \ + %%CTR_BLOCK_9_12, %%CTR_BLOCK_13_16, \ + partial, %%IA0, %%IA1, %%LENGTH, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \ + %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%MASKREG, %%SHUFREG, %%ADD8REG, %%NROUNDS, %%CNTR_TYPE, \ + %%RBITS + +%%_enc_dec_done: + +%endmacro ; CNTR_ENC_DEC + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_cntr_128_submit_vaes_avx512 (JOB_AES_HMAC *job) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(aes_cntr_128_submit_vaes_avx512,function,internal) +aes_cntr_128_submit_vaes_avx512: + FUNC_SAVE CNTR + ;; arg1 - [in] job + ;; arg2 - [in] NROUNDS + ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + CNTR_ENC_DEC arg1, 9, CNTR + FUNC_RESTORE CNTR + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_cntr_192_submit_vaes_avx512 (JOB_AES_HMAC *job) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(aes_cntr_192_submit_vaes_avx512,function,internal) +aes_cntr_192_submit_vaes_avx512: + FUNC_SAVE CNTR + ;; arg1 - [in] job + ;; arg2 - [in] NROUNDS + ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + CNTR_ENC_DEC arg1, 11, CNTR + FUNC_RESTORE CNTR + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_cntr_256_submit_vaes_avx512 (JOB_AES_HMAC *job) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(aes_cntr_256_submit_vaes_avx512,function,internal) +aes_cntr_256_submit_vaes_avx512: + FUNC_SAVE CNTR + ;; arg1 - [in] job + ;; arg2 - [in] NROUNDS + ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + CNTR_ENC_DEC arg1, 13, CNTR + FUNC_RESTORE CNTR + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_cntr_bit_128_submit_vaes_avx512 (JOB_AES_HMAC *job) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(aes_cntr_bit_128_submit_vaes_avx512,function,internal) +aes_cntr_bit_128_submit_vaes_avx512: + FUNC_SAVE CNTR_BIT + ;; arg1 - [in] job + ;; arg2 - [in] NROUNDS + ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + CNTR_ENC_DEC arg1, 9, CNTR_BIT + FUNC_RESTORE CNTR_BIT + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_cntr_bit_192_submit_vaes_avx512 (JOB_AES_HMAC *job) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(aes_cntr_bit_192_submit_vaes_avx512,function,internal) +aes_cntr_bit_192_submit_vaes_avx512: + FUNC_SAVE CNTR_BIT + ;; arg1 - [in] job + ;; arg2 - [in] NROUNDS + ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + CNTR_ENC_DEC arg1, 11, CNTR_BIT + FUNC_RESTORE CNTR_BIT + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_cntr_bit_256_submit_vaes_avx512 (JOB_AES_HMAC *job) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(aes_cntr_bit_256_submit_vaes_avx512,function,internal) +aes_cntr_bit_256_submit_vaes_avx512: + FUNC_SAVE CNTR_BIT + ;; arg1 - [in] job + ;; arg2 - [in] NROUNDS + ;; arg3 - [in] Type of CNTR operation to do (CNTR/CNTR_BIT) + CNTR_ENC_DEC arg1, 13, CNTR_BIT + FUNC_RESTORE CNTR_BIT + + ret + 
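The six submit entry points above differ only in the NROUNDS value handed to CNTR_ENC_DEC (9, 11 or 13) and in the CNTR/CNTR_BIT selector. Because the per-block key loop runs NROUNDS + 2 steps (initial key XOR, NROUNDS middle rounds, one final round), the mapping to AES key sizes works out as in this small illustrative C helper (not part of the library):

#include <stdio.h>

/* NROUNDS as passed by the submit wrappers; NROUNDS + 2 round keys are consumed. */
static int nrounds_arg(int key_bits)
{
    switch (key_bits) {
    case 128: return 9;    /* 11 round keys */
    case 192: return 11;   /* 13 round keys */
    case 256: return 13;   /* 15 round keys */
    default:  return -1;
    }
}

int main(void)
{
    for (int bits = 128; bits <= 256; bits += 64)
        printf("AES-%d: NROUNDS=%d, expanded round keys=%d\n",
               bits, nrounds_arg(bits), nrounds_arg(bits) + 2);
    return 0;
}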
+%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif
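For the CNTR_BIT variant, PRESERVE_BITS ensures that any bits of the last byte beyond the bit-length of the message are left exactly as they already are in the output buffer. Below is a compact C model of that final-byte handling, assuming the MSB-first bit ordering implied by the mask computation (0xff >> rbits) in the macro; the function and parameter names are illustrative only.

#include <stdint.h>

/* rbits = number of message bits in the last byte (1-7).
 * Only the top rbits bits come from the ciphered block; the low
 * (8 - rbits) bits are preserved from the existing output buffer. */
uint8_t preserve_last_byte(uint8_t ciphered, uint8_t existing_out, unsigned rbits)
{
    uint8_t keep = (uint8_t)(0xff >> rbits);   /* e.g. rbits = 3 -> 0x1f */
    return (uint8_t)((ciphered & (uint8_t)~keep) | (existing_out & keep));
}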