;;
;; Copyright (c) 2019, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%use smartalign

%include "job_aes_hmac.asm"
%include "include/os.asm"
%include "include/memcpy.asm"

;;; This is an implementation of the stitched algorithms: AES128-CTR + CRC32 + BIP.
;;; This combination is required by the PON/xPON/gPON standards.
;;; Note: BIP is a running XOR of 32-bit words.
;;;
;;; Order of operations:
;;; - encrypt: HEC update (XGEM header), CRC32 (Ethernet FCS), AES-CTR and BIP
;;; - decrypt: BIP, AES-CTR and CRC32 (Ethernet FCS)

extern byteswap_const
extern ddq_add_1

section .data
default rel

;;; Precomputed constants for CRC32 (Ethernet FCS)
;;; Details of the CRC algorithm for a 4-byte buffer of
;;; {0x01, 0x02, 0x03, 0x04}:
;;;   Result     Poly       Init       RefIn RefOut XorOut
;;;   0xB63CFBCD 0x04C11DB7 0xFFFFFFFF true  true   0xFFFFFFFF
align 16
rk1:
        dq 0x00000000ccaa009e, 0x00000001751997d0

align 16
rk5:
        dq 0x00000000ccaa009e, 0x0000000163cd6124

align 16
rk7:
        dq 0x00000001f7011640, 0x00000001db710640

align 16
pshufb_shf_table:
        ;; use these values for shift registers with the pshufb instruction
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x000e0d0c0b0a0908

align 16
init_crc_value:
        dq 0x00000000FFFFFFFF, 0x0000000000000000

align 16
mask:
        dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000

align 16
mask2:
        dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF

align 16
mask3:
        dq 0x8080808080808080, 0x8080808080808080

align 16
mask_out_top_bytes:
        dq 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
        dq 0x0000000000000000, 0x0000000000000000

align 16
ddq_add_1_1:
        dq 0x1, 0x1

;; Precomputed constants for HEC calculation (XGEM header)
;; POLY 0x53900000:
;;         k1    = 0xf9800000
;;         k2    = 0xa0900000
;;         k3    = 0x7cc00000
;;         q     = 0x46b927ec
;;         p_res = 0x53900000

align 16
k3_q:
        dq 0x7cc00000, 0x46b927ec

align 16
p_res:
        dq 0x53900000, 0

align 16
mask_out_top_64bits:
        dq 0xffffffff_ffffffff, 0
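;; Note on the tables above: rk1 holds the (rk1, rk2) pair used for 128-bit
;; CRC folding (see CRC_CLMUL below), rk5 the 128-to-64 bit fold constants and
;; rk7 the Barrett reduction pair (see CRC32_REDUCE_128_TO_32). The
;; known-answer vector in the header comment ({0x01, 0x02, 0x03, 0x04} ->
;; 0xB63CFBCD) is a convenient sanity check whenever these tables are
;; regenerated.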
section .text

%define NUM_AES_ROUNDS 10

%define xcounter xmm0
%define xbip     xmm1
%define xcrc     xmm2
%define xcrckey  xmm3
%define xtmp1    xmm4
%define xtmp2    xmm5
%define xtmp3    xmm6
%define xtmp4    xmm7
%define xtmp5    xmm8
%define xtmp6    xmm9
%define xtmp7    xmm10
%define xtmp8    xmm11
%define xtmp9    xmm12
%define xtmp10   xmm13
%define xtmp11   xmm14

%ifdef LINUX
%define arg1    rdi
%define arg2    rsi
%define arg3    rdx
%define arg4    rcx
%define tmp_1   r8
%define tmp_2   r9
%define tmp_3   r10
%define tmp_4   r11
%define tmp_5   r12
%define tmp_6   r13
%define tmp_7   r14
%else
%define arg1    rcx
%define arg2    rdx
%define arg3    r8
%define arg4    r9
%define tmp_1   r10
%define tmp_2   r11
%define tmp_3   rax
%define tmp_4   r12
%define tmp_5   r13
%define tmp_6   r14
%define tmp_7   r15
%endif

%define job     arg1
%define p_in    arg2
%define p_keys  arg3
%define p_out   arg4

%define num_bytes       tmp_1   ; bytes to cipher
%define tmp             tmp_2
%define ctr_check       tmp_3   ; counter block overflow check
%define bytes_to_crc    tmp_4   ; number of bytes to crc ( < num_bytes)

%define ethernet_fcs    tmp_6   ; not used together with tmp3
%define tmp2            tmp_5
%define tmp3            tmp_6

%define write_back_crc   tmp_7
%define decrypt_not_done tmp_7

;;; ============================================================================
;;; Does all AES encryption rounds
%macro AES_ENC_ROUNDS 3
%define %%KP       %1   ; [in] pointer to expanded keys
%define %%N_ROUNDS %2   ; [in] max rounds (128bit: 10, 12, 14)
%define %%BLOCK    %3   ; [in/out] XMM with encrypted block

%assign round 0
        vpxor           %%BLOCK, %%BLOCK, [%%KP + (round * 16)]

%rep (%%N_ROUNDS - 1)
%assign round (round + 1)
        vaesenc         %%BLOCK, %%BLOCK, [%%KP + (round * 16)]
%endrep

%assign round (round + 1)
        vaesenclast     %%BLOCK, %%BLOCK, [%%KP + (round * 16)]
%endmacro

;;; ============================================================================
;;; Does all AES encryption rounds on 4 blocks
%macro AES_ENC_ROUNDS_4 7
%define %%KP       %1   ; [in] pointer to expanded keys
%define %%N_ROUNDS %2   ; [in] max rounds (128bit: 10, 12, 14)
%define %%BLOCK1   %3   ; [in/out] XMM with encrypted block
%define %%BLOCK2   %4   ; [in/out] XMM with encrypted block
%define %%BLOCK3   %5   ; [in/out] XMM with encrypted block
%define %%BLOCK4   %6   ; [in/out] XMM with encrypted block
%define %%XT1      %7   ; [clobbered] temporary XMM register

%assign round 0
        vmovdqa         %%XT1, [%%KP + (round * 16)]
        vpxor           %%BLOCK1, %%BLOCK1, %%XT1
        vpxor           %%BLOCK2, %%BLOCK2, %%XT1
        vpxor           %%BLOCK3, %%BLOCK3, %%XT1
        vpxor           %%BLOCK4, %%BLOCK4, %%XT1

%rep (%%N_ROUNDS - 1)
%assign round (round + 1)
        vmovdqa         %%XT1, [%%KP + (round * 16)]
        vaesenc         %%BLOCK1, %%BLOCK1, %%XT1
        vaesenc         %%BLOCK2, %%BLOCK2, %%XT1
        vaesenc         %%BLOCK3, %%BLOCK3, %%XT1
        vaesenc         %%BLOCK4, %%BLOCK4, %%XT1
%endrep

%assign round (round + 1)
        vmovdqa         %%XT1, [%%KP + (round * 16)]
        vaesenclast     %%BLOCK1, %%BLOCK1, %%XT1
        vaesenclast     %%BLOCK2, %%BLOCK2, %%XT1
        vaesenclast     %%BLOCK3, %%BLOCK3, %%XT1
        vaesenclast     %%BLOCK4, %%BLOCK4, %%XT1
%endmacro

;;; ============================================================================
;;; CRC multiply before XOR against data block
%macro CRC_CLMUL 3
%define %%XCRC_IN_OUT %1 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
%define %%XCRC_MUL    %2 ; [in] XMM with CRC constant (can be anything if "no_crc" below)
%define %%XTMP        %3 ; [clobbered] temporary XMM

        vpclmulqdq      %%XTMP, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
        vpclmulqdq      %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XTMP
%endmacro
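;; A minimal sketch of the fold above in C-like pseudocode (illustrative only;
;; rk1 sits in the low qword of the constant register, rk2 in the high qword):
;;
;;   fold128(crc, rk1:rk2):
;;       t   = clmul(crc >> 64, rk1)       ; imm8 0x01: high qword x rk1
;;       crc = clmul(crc & (2^64-1), rk2)  ; imm8 0x10: low qword x rk2
;;       return crc ^ t                    ; caller XORs in the next 16 bytes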
;;; ============================================================================
;;; PON stitched algorithm round on a single AES block (16 bytes):
;;;   AES-CTR (optional, depending on %%CIPH)
;;;   - prepares counter block
;;;   - encrypts counter block
;;;   - loads text
;;;   - xor's text against encrypted blocks
;;;   - stores cipher text
;;;   BIP
;;;   - BIP update on 4 x 32-bits
;;;   CRC32
;;;   - CRC32 calculation
;;; Note: via selection of no_crc, no_bip, no_load, no_store different macro
;;;       behaviour can be achieved to match needs of the overall algorithm.
%macro DO_PON 15
%define %%KP          %1  ; [in] GP, pointer to expanded keys
%define %%N_ROUNDS    %2  ; [in] number of AES rounds (10, 12 or 14)
%define %%CTR         %3  ; [in/out] XMM with counter block
%define %%INP         %4  ; [in/out] GP with input text pointer or "no_load"
%define %%OUTP        %5  ; [in/out] GP with output text pointer or "no_store"
%define %%XBIP_IN_OUT %6  ; [in/out] XMM with BIP value or "no_bip"
%define %%XCRC_IN_OUT %7  ; [in/out] XMM with CRC (can be anything if "no_crc" below)
%define %%XCRC_MUL    %8  ; [in] XMM with CRC constant (can be anything if "no_crc" below)
%define %%TXMM0       %9  ; [clobbered|out] XMM temporary or data out (no_store)
%define %%TXMM1       %10 ; [clobbered|in] XMM temporary or data in (no_load)
%define %%TXMM2       %11 ; [clobbered] XMM temporary
%define %%CRC_TYPE    %12 ; [in] "first_crc" or "next_crc" or "no_crc"
%define %%DIR         %13 ; [in] "ENC" or "DEC"
%define %%CIPH        %14 ; [in] "CTR" or "NO_CTR"
%define %%CTR_CHECK   %15 ; [in/out] GP with 64bit counter (to identify overflow)

%ifidn %%CIPH, CTR
        ;; prepare counter blocks for encryption
        vpshufb         %%TXMM0, %%CTR, [rel byteswap_const]
        ;; perform 1 increment on whole 128 bits
        add             %%CTR_CHECK, 1
        jc              %%_ctr_overflow
        vpaddq          %%CTR, %%CTR, [rel ddq_add_1]
        jmp             %%_ctr_overflow_done
%%_ctr_overflow:
        vpaddq          %%CTR, %%CTR, [rel ddq_add_1_1]
%%_ctr_overflow_done:
%endif

        ;; CRC calculation
%ifidn %%CRC_TYPE, next_crc
        ;; CRC_MUL macro could be used here but its xor affects
        ;; performance (blocks cipher xor's) so doing CLMUL
        ;; only here and xor is done after the cipher.
        vpclmulqdq      %%TXMM2, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
        vpclmulqdq      %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
%endif

%ifnidn %%INP, no_load
        vmovdqu         %%TXMM1, [%%INP]
%endif

%ifidn %%CIPH, CTR
        ;; AES rounds
        AES_ENC_ROUNDS  %%KP, %%N_ROUNDS, %%TXMM0

        ;; xor plaintext/ciphertext against encrypted counter blocks
        vpxor           %%TXMM0, %%TXMM0, %%TXMM1
%else                           ;; CIPH = NO_CTR
        ;; register copy is needed as no_load/no_store options need it
        vmovdqa         %%TXMM0, %%TXMM1
%endif                          ;; CIPH = CTR

%ifnidn %%CRC_TYPE, no_crc
%ifidn %%CRC_TYPE, next_crc
        ;; Finish split CRC_MUL() operation
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM2
%endif
%ifidn %%CIPH, CTR
        ;; CRC calculation for ENCRYPTION/DECRYPTION
        ;; - always XOR against plaintext block
%ifidn %%DIR, ENC
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM1
%else
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM0
%endif                          ; DECRYPT
%else                           ;; CIPH = NO_CTR
        ;; CRC calculation for NO CIPHER option
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM1
%endif                          ;; CIPH = CTR
%endif                          ;; CRC_TYPE != NO_CRC

        ;; store the result in the output buffer
%ifnidn %%OUTP, no_store
%ifidn %%CIPH, CTR
        vmovdqu         [%%OUTP], %%TXMM0
%else                           ;; CIPH = NO_CTR
        vmovdqu         [%%OUTP], %%TXMM1
%endif                          ;; CIPH = CTR
%endif

        ;; update BIP value - always use cipher text for BIP
%ifnidn %%XBIP_IN_OUT, no_bip
%ifidn %%CIPH, CTR
%ifidn %%DIR, ENC
        vpxor           %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM0
%else
        vpxor           %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM1
%endif                          ; DECRYPT
%else                           ;; CIPH = NO_CTR
        vpxor           %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM1
%endif                          ;; CIPH = CTR
%endif                          ;; !NO_BIP

        ;; increment in/out pointers
%ifnidn %%INP, no_load
        add             %%INP, 16
%endif
%ifnidn %%OUTP, no_store
        add             %%OUTP, 16
%endif
%endmacro                       ; DO_PON
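;; Note on the counter handling above: CTR_CHECK mirrors the low 64 bits of
;; the counter in a GP register so that the carry out of "add CTR_CHECK, 1"
;; detects the 2^64 wrap; on carry the block counter is bumped with
;; ddq_add_1_1 (adds 1 to both qwords, i.e. propagates the carry into the
;; upper half), otherwise with ddq_add_1 (low qword only).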
;;; ============================================================================
;;; PON stitched algorithm round on 4 AES blocks (4 x 16 bytes):
;;;   AES-CTR (optional, depending on %%CIPH)
;;;   - prepares counter blocks
;;;   - encrypts counter blocks
;;;   - loads text
;;;   - xor's text against encrypted blocks
;;;   - stores cipher text
;;;   BIP
;;;   - BIP update on 4 x 32-bits
;;;   CRC32
;;;   - CRC32 calculation
;;; Note: via selection of no_crc, no_bip, no_load, no_store different macro
;;;       behaviour can be achieved to match needs of the overall algorithm.
%macro DO_PON_4 23
%define %%KP          %1  ; [in] GP, pointer to expanded keys
%define %%N_ROUNDS    %2  ; [in] number of AES rounds (10, 12 or 14)
%define %%CTR         %3  ; [in/out] XMM with counter block
%define %%INP         %4  ; [in/out] GP with input text pointer or "no_load"
%define %%OUTP        %5  ; [in/out] GP with output text pointer or "no_store"
%define %%XBIP_IN_OUT %6  ; [in/out] XMM with BIP value or "no_bip"
%define %%XCRC_IN_OUT %7  ; [in/out] XMM with CRC (can be anything if "no_crc" below)
%define %%XCRC_MUL    %8  ; [in] XMM with CRC constant (can be anything if "no_crc" below)
%define %%T0          %9  ; [clobbered] XMM temporary
%define %%T1          %10 ; [clobbered] XMM temporary
%define %%T2          %11 ; [clobbered] XMM temporary
%define %%T3          %12 ; [clobbered] XMM temporary
%define %%T4          %13 ; [clobbered] XMM temporary
%define %%T5          %14 ; [clobbered] XMM temporary
%define %%T6          %15 ; [clobbered] XMM temporary
%define %%T7          %16 ; [clobbered] XMM temporary
%define %%T8          %17 ; [clobbered] XMM temporary
%define %%T9          %18 ; [clobbered] XMM temporary
%define %%T10         %19 ; [clobbered] XMM temporary
%define %%CRC_TYPE    %20 ; [in] "first_crc" or "next_crc" or "no_crc"
%define %%DIR         %21 ; [in] "ENC" or "DEC"
%define %%CIPH        %22 ; [in] "CTR" or "NO_CTR"
%define %%CTR_CHECK   %23 ; [in/out] GP with 64bit counter (to identify overflow)

%define %%CTR1 %%T3
%define %%CTR2 %%T4
%define %%CTR3 %%T5
%define %%CTR4 %%T6

%define %%TXT1 %%T7
%define %%TXT2 %%T8
%define %%TXT3 %%T9
%define %%TXT4 %%T10

%ifidn %%CIPH, CTR
        ;; prepare counter blocks for encryption
        vmovdqa         %%T0, [rel ddq_add_1]
        vmovdqa         %%T2, [rel byteswap_const]

        ;; CTR1: copy saved CTR value as CTR1
        vmovdqa         %%CTR1, %%CTR

        cmp             %%CTR_CHECK, 0xffff_ffff_ffff_ffff - 4
        ja              %%_ctr_will_overflow

        ;; case in which 64-bit counter will not overflow
        vpaddq          %%CTR2, %%CTR1, %%T0
        vpaddq          %%CTR3, %%CTR2, %%T0
        vpaddq          %%CTR4, %%CTR3, %%T0
        vpaddq          %%CTR, %%CTR4, %%T0
        vpshufb         %%CTR1, %%CTR1, %%T2
        vpshufb         %%CTR2, %%CTR2, %%T2
        vpshufb         %%CTR3, %%CTR3, %%T2
        vpshufb         %%CTR4, %%CTR4, %%T2
        add             %%CTR_CHECK, 4
        jmp             %%_ctr_update_done

%%_ctr_will_overflow:
        vmovdqa         %%T1, [rel ddq_add_1_1]

        ;; CTR2: perform 1 increment on whole 128 bits
        add             %%CTR_CHECK, 1
        jc              %%_ctr2_overflow
        vpaddq          %%CTR2, %%CTR1, %%T0
        jmp             %%_ctr2_overflow_done
%%_ctr2_overflow:
        vpaddq          %%CTR2, %%CTR1, %%T1
%%_ctr2_overflow_done:
        vpshufb         %%CTR1, %%CTR1, %%T2

        ;; CTR3: perform 1 increment on whole 128 bits
        add             %%CTR_CHECK, 1
        jc              %%_ctr3_overflow
        vpaddq          %%CTR3, %%CTR2, %%T0
        jmp             %%_ctr3_overflow_done
%%_ctr3_overflow:
        vpaddq          %%CTR3, %%CTR2, %%T1
%%_ctr3_overflow_done:
        vpshufb         %%CTR2, %%CTR2, %%T2

        ;; CTR4: perform 1 increment on whole 128 bits
        add             %%CTR_CHECK, 1
        jc              %%_ctr4_overflow
        vpaddq          %%CTR4, %%CTR3, %%T0
        jmp             %%_ctr4_overflow_done
%%_ctr4_overflow:
        vpaddq          %%CTR4, %%CTR3, %%T1
%%_ctr4_overflow_done:
        vpshufb         %%CTR3, %%CTR3, %%T2

        ;; CTR: perform 1 increment on whole 128 bits (for the next iteration)
        add             %%CTR_CHECK, 1
        jc              %%_ctr_overflow
        vpaddq          %%CTR, %%CTR4, %%T0
        jmp             %%_ctr_overflow_done
%%_ctr_overflow:
        vpaddq          %%CTR, %%CTR4, %%T1
%%_ctr_overflow_done:
        vpshufb         %%CTR4, %%CTR4, %%T2

%%_ctr_update_done:
%endif
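        ;; Note: the fast path above runs while CTR_CHECK stays clear of the
        ;; 2^64 boundary - it increments only the low counter qword and adds 4
        ;; to CTR_CHECK in one go. The slow path is taken at most once per
        ;; 2^64 blocks and steps one block at a time so that each potential
        ;; carry propagates into the upper counter qword.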
%ifidn %%CRC_TYPE, next_crc
        ;; CRC_MUL macro could be used here but its xor affects
        ;; performance (blocks cipher xor's) so doing CLMUL
        ;; only here and xor is done after the cipher.
        vpclmulqdq      %%T2, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
        vpclmulqdq      %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
%endif

        ;; load plaintext/ciphertext
        vmovdqu         %%TXT1, [%%INP]
        vmovdqu         %%TXT2, [%%INP + 16]
        vmovdqu         %%TXT3, [%%INP + 32]
        vmovdqu         %%TXT4, [%%INP + 48]

%ifidn %%CIPH, CTR
        AES_ENC_ROUNDS_4 %%KP, %%N_ROUNDS, %%CTR1, %%CTR2, %%CTR3, %%CTR4, %%T0

        ;; xor plaintext/ciphertext against encrypted counter blocks
        vpxor           %%CTR1, %%CTR1, %%TXT1
        vpxor           %%CTR2, %%CTR2, %%TXT2
        vpxor           %%CTR3, %%CTR3, %%TXT3
        vpxor           %%CTR4, %%CTR4, %%TXT4
%endif                          ;; CIPH = CTR

%ifidn %%CRC_TYPE, next_crc
        ;; Finish split CRC_MUL() operation
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%T2
%endif

%ifidn %%CIPH, CTR
%ifidn %%DIR, ENC
        ;; CRC calculation for ENCRYPTION (blocks 1 & 2)
        ;; - XOR CRC against plaintext block
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT1
        CRC_CLMUL       %%XCRC_IN_OUT, %%XCRC_MUL, %%T2

        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT2
        CRC_CLMUL       %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
%else
        ;; CRC calculation for DECRYPTION (blocks 1 & 2)
        ;; - XOR CRC against plaintext block
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR1
        CRC_CLMUL       %%XCRC_IN_OUT, %%XCRC_MUL, %%T2

        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR2
        CRC_CLMUL       %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
%endif                          ; DECRYPT
%else                           ;; CIPH = NO_CTR
        ;; CRC calculation for NO CIPHER option (blocks 1 & 2)
        ;; - XOR CRC against plaintext block
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT1
        CRC_CLMUL       %%XCRC_IN_OUT, %%XCRC_MUL, %%T2

        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT2
        CRC_CLMUL       %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
%endif                          ;; CIPH = CTR

        ;; store ciphertext/plaintext
%ifidn %%CIPH, CTR
        vmovdqu         [%%OUTP], %%CTR1
        vmovdqu         [%%OUTP + 16], %%CTR2
        vmovdqu         [%%OUTP + 32], %%CTR3
        vmovdqu         [%%OUTP + 48], %%CTR4
%else                           ;; CIPH = NO_CTR
        vmovdqu         [%%OUTP], %%TXT1
        vmovdqu         [%%OUTP + 16], %%TXT2
        vmovdqu         [%%OUTP + 32], %%TXT3
        vmovdqu         [%%OUTP + 48], %%TXT4
%endif                          ;; CIPH = CTR

        ;; update BIP value
%ifidn %%CIPH, CTR
        ;; - always use ciphertext for BIP
%ifidn %%DIR, ENC
        vpxor           %%T0, %%CTR1, %%CTR2
        vpxor           %%T1, %%CTR3, %%CTR4
%else
        vpxor           %%T0, %%TXT1, %%TXT2
        vpxor           %%T1, %%TXT3, %%TXT4
%endif                          ; DECRYPT
%else                           ;; CIPH = NO_CTR
        vpxor           %%T0, %%TXT1, %%TXT2
        vpxor           %%T1, %%TXT3, %%TXT4
%endif                          ;; CIPH = CTR
        vpxor           %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%T0
        vpxor           %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%T1

        ;; increment in/out pointers
        add             %%INP, 64
        add             %%OUTP, 64

%ifidn %%CIPH, CTR
%ifidn %%DIR, ENC
        ;; CRC calculation for ENCRYPTION (blocks 3 & 4)
        ;; - XOR CRC against plaintext block
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT3
        CRC_CLMUL       %%XCRC_IN_OUT, %%XCRC_MUL, %%T2

        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT4
%else
        ;; CRC calculation for DECRYPTION (blocks 3 & 4)
        ;; - XOR CRC against plaintext block
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR3
        CRC_CLMUL       %%XCRC_IN_OUT, %%XCRC_MUL, %%T2

        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR4
%endif                          ; DECRYPT
%else                           ;; CIPH = NO_CTR
        ;; CRC calculation for NO CIPHER option (blocks 3 & 4)
        ;; - XOR CRC against plaintext block
        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT3
        CRC_CLMUL       %%XCRC_IN_OUT, %%XCRC_MUL, %%T2

        vpxor           %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT4
%endif                          ;; CIPH = CTR
%endmacro                       ; DO_PON_4
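;; Design note: the CRC folds for blocks 3 & 4 are placed after the stores and
;; pointer updates, presumably to overlap PCLMULQDQ latency with the store
;; traffic of the current 64-byte chunk.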
;;; ============================================================================
;;; CIPHER and BIP specified number of bytes
%macro CIPHER_BIP_REST 14
%define %%NUM_BYTES   %1  ; [in/clobbered] number of bytes to cipher
%define %%DIR         %2  ; [in] "ENC" or "DEC"
%define %%CIPH        %3  ; [in] "CTR" or "NO_CTR"
%define %%PTR_IN      %4  ; [in/clobbered] GPR pointer to input buffer
%define %%PTR_OUT     %5  ; [in/clobbered] GPR pointer to output buffer
%define %%PTR_KEYS    %6  ; [in] GPR pointer to expanded keys
%define %%XBIP_IN_OUT %7  ; [in/out] XMM 128-bit BIP state
%define %%XCTR_IN_OUT %8  ; [in/out] XMM 128-bit AES counter block
%define %%XMMT1       %9  ; [clobbered] temporary XMM
%define %%XMMT2       %10 ; [clobbered] temporary XMM
%define %%XMMT3       %11 ; [clobbered] temporary XMM
%define %%CTR_CHECK   %12 ; [in/out] GP with 64bit counter (to identify overflow)
%define %%GPT1        %13 ; [clobbered] temporary GP
%define %%GPT2        %14 ; [clobbered] temporary GP

align 16
%%_cipher_last_blocks:
        cmp             %%NUM_BYTES, 16
        jb              %%_partial_block_left

        DO_PON          %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, %%PTR_IN, %%PTR_OUT, %%XBIP_IN_OUT, \
                        no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
        sub             %%NUM_BYTES, 16
        jz              %%_bip_done
        jmp             %%_cipher_last_blocks

%%_partial_block_left:
        simd_load_avx_15_1 %%XMMT2, %%PTR_IN, %%NUM_BYTES

        ;; DO_PON() is not loading nor storing the data in this case:
        ;; XMMT2 = data in
        ;; XMMT1 = data out
        DO_PON          %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, no_load, no_store, no_bip, \
                        no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK

        ;; bip update for partial block (mask out bytes outside the message)
        lea             %%GPT1, [rel mask_out_top_bytes + 16]
        sub             %%GPT1, %%NUM_BYTES
        vmovdqu         %%XMMT3, [%%GPT1]
        ;; put masked cipher text into XMMT2 for BIP update
%ifidn %%DIR, ENC
        vpand           %%XMMT2, %%XMMT1, %%XMMT3
%else
        vpand           %%XMMT2, %%XMMT2, %%XMMT3
%endif
        vpxor           %%XBIP_IN_OUT, %%XMMT2

        ;; store partial bytes in the output buffer
        simd_store_avx_15 %%PTR_OUT, %%XMMT1, %%NUM_BYTES, %%GPT1, %%GPT2

%%_bip_done:
%endmacro                       ; CIPHER_BIP_REST

;; =============================================================================
;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial
%macro CRC32_REDUCE_128_TO_32 5
%define %%CRC  %1 ; [out] GP to store 32-bit Ethernet FCS value
%define %%XCRC %2 ; [in/clobbered] XMM with CRC
%define %%XT1  %3 ; [clobbered] temporary xmm register
%define %%XT2  %4 ; [clobbered] temporary xmm register
%define %%XT3  %5 ; [clobbered] temporary xmm register

%define %%XCRCKEY %%XT3

        ;; compute crc of a 128-bit value
        vmovdqa         %%XCRCKEY, [rel rk5]

        ;; 64b fold
        vpclmulqdq      %%XT1, %%XCRC, %%XCRCKEY, 0x00
        vpsrldq         %%XCRC, %%XCRC, 8
        vpxor           %%XCRC, %%XCRC, %%XT1

        ;; 32b fold
        vpslldq         %%XT1, %%XCRC, 4
        vpclmulqdq      %%XT1, %%XT1, %%XCRCKEY, 0x10
        vpxor           %%XCRC, %%XCRC, %%XT1

%%_crc_barrett:
        ;; Barrett reduction
        vpand           %%XCRC, [rel mask2]
        vmovdqa         %%XT1, %%XCRC
        vmovdqa         %%XT2, %%XCRC
        vmovdqa         %%XCRCKEY, [rel rk7]

        vpclmulqdq      %%XCRC, %%XCRCKEY, 0x00
        vpxor           %%XCRC, %%XT2
        vpand           %%XCRC, [rel mask]
        vmovdqa         %%XT2, %%XCRC
        vpclmulqdq      %%XCRC, %%XCRCKEY, 0x10
        vpxor           %%XCRC, %%XT2
        vpxor           %%XCRC, %%XT1
        vpextrd         DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value
        not             DWORD(%%CRC)
%endmacro
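;; Reduction outline for the macro above: rk5 folds the 128-bit remainder to
;; 64 bits, a second CLMUL folds it down to 32 bits plus overhang, and the
;; rk7 pair (Barrett constant and polynomial) reduces the result modulo the
;; Ethernet FCS polynomial; the final NOT applies XorOut = 0xFFFFFFFF.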
;; =============================================================================
;; Barrett reduction from 128-bits to 32-bits modulo 0x53900000 polynomial
%macro HEC_REDUCE_128_TO_32 4
%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out
%define %%XT1        %2 ; [clobbered] temporary xmm register
%define %%XT2        %3 ; [clobbered] temporary xmm register
%define %%XT3        %4 ; [clobbered] temporary xmm register

%define %%K3_Q  %%XT1
%define %%P_RES %%XT2
%define %%XTMP  %%XT3

        ;; 128 to 64 bit reduction
        vmovdqa         %%K3_Q, [k3_q]
        vmovdqa         %%P_RES, [p_res]

        vpclmulqdq      %%XTMP, %%XMM_IN_OUT, %%K3_Q, 0x01 ; K3
        vpxor           %%XTMP, %%XTMP, %%XMM_IN_OUT

        vpclmulqdq      %%XTMP, %%XTMP, %%K3_Q, 0x01 ; K3
        vpxor           %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT

        vpand           %%XMM_IN_OUT, [rel mask_out_top_64bits]

        ;; 64 to 32 bit reduction
        vpsrldq         %%XTMP, %%XMM_IN_OUT, 4
        vpclmulqdq      %%XTMP, %%XTMP, %%K3_Q, 0x10 ; Q
        vpxor           %%XTMP, %%XTMP, %%XMM_IN_OUT
        vpsrldq         %%XTMP, %%XTMP, 4

        vpclmulqdq      %%XTMP, %%XTMP, %%P_RES, 0x00 ; P
        vpxor           %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
%endmacro

;; =============================================================================
;; Barrett reduction from 64-bits to 32-bits modulo 0x53900000 polynomial
%macro HEC_REDUCE_64_TO_32 4
%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out
%define %%XT1        %2 ; [clobbered] temporary xmm register
%define %%XT2        %3 ; [clobbered] temporary xmm register
%define %%XT3        %4 ; [clobbered] temporary xmm register

%define %%K3_Q  %%XT1
%define %%P_RES %%XT2
%define %%XTMP  %%XT3

        vmovdqa         %%K3_Q, [k3_q]
        vmovdqa         %%P_RES, [p_res]

        ;; 64 to 32 bit reduction
        vpsrldq         %%XTMP, %%XMM_IN_OUT, 4
        vpclmulqdq      %%XTMP, %%XTMP, %%K3_Q, 0x10 ; Q
        vpxor           %%XTMP, %%XTMP, %%XMM_IN_OUT
        vpsrldq         %%XTMP, %%XTMP, 4

        vpclmulqdq      %%XTMP, %%XTMP, %%P_RES, 0x00 ; P
        vpxor           %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
%endmacro

;; =============================================================================
;; HEC compute and header update for 32-bit XGEM headers
%macro HEC_COMPUTE_32 6
%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format
%define %%GT1        %2 ; [clobbered] temporary GP register
%define %%XT1        %3 ; [clobbered] temporary xmm register
%define %%XT2        %4 ; [clobbered] temporary xmm register
%define %%XT3        %5 ; [clobbered] temporary xmm register
%define %%XT4        %6 ; [clobbered] temporary xmm register

        mov             DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
        ;; shift out 13 bits of HEC value for CRC computation
        shr             DWORD(%%GT1), 13

        ;; mask out current HEC value to merge with an updated HEC at the end
        and             DWORD(%%HEC_IN_OUT), 0xffff_e000

        ;; prepare the message for CRC computation
        vmovd           %%XT1, DWORD(%%GT1)
        vpslldq         %%XT1, 4        ; shift left by 32-bits

        HEC_REDUCE_64_TO_32 %%XT1, %%XT2, %%XT3, %%XT4

        ;; extract 32-bit value
        ;; - normally perform 20 bit shift right but bit 0 is a parity bit
        vmovd           DWORD(%%GT1), %%XT1
        shr             DWORD(%%GT1), (20 - 1)

        ;; merge header bytes with updated 12-bit CRC value and
        ;; compute parity
        or              DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
        popcnt          DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
        and             DWORD(%%HEC_IN_OUT), 1
        or              DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
%endmacro

;; =============================================================================
;; HEC compute and header update for 64-bit XGEM headers
%macro HEC_COMPUTE_64 6
%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format
%define %%GT1        %2 ; [clobbered] temporary GP register
%define %%XT1        %3 ; [clobbered] temporary xmm register
%define %%XT2        %4 ; [clobbered] temporary xmm register
%define %%XT3        %5 ; [clobbered] temporary xmm register
%define %%XT4        %6 ; [clobbered] temporary xmm register

        mov             %%GT1, %%HEC_IN_OUT
        ;; shift out 13 bits of HEC value for CRC computation
        shr             %%GT1, 13

        ;; mask out current HEC value to merge with an updated HEC at the end
        and             %%HEC_IN_OUT, 0xffff_ffff_ffff_e000

        ;; prepare the message for CRC computation
        vmovq           %%XT1, %%GT1
        vpslldq         %%XT1, 4        ; shift left by 32-bits

        HEC_REDUCE_128_TO_32 %%XT1, %%XT2, %%XT3, %%XT4

        ;; extract 32-bit value
        ;; - normally perform 20 bit shift right but bit 0 is a parity bit
        vmovd           DWORD(%%GT1), %%XT1
        shr             DWORD(%%GT1), (20 - 1)

        ;; merge header bytes with updated 12-bit CRC value and
        ;; compute parity
        or              %%GT1, %%HEC_IN_OUT
        popcnt          %%HEC_IN_OUT, %%GT1
        and             %%HEC_IN_OUT, 1
        or              %%HEC_IN_OUT, %%GT1
%endmacro
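;; XGEM header layout as used by the macros above and the code below
;; (8-byte header, big endian on the wire, handled in LE after bswap):
;;   bits 63:50  PLI (payload length indicator, 14 bits)
;;   bits 49:13  remaining header fields
;;   bits 12:1   12-bit HEC CRC
;;   bit  0      parity bit
;; which is why the CRC message is formed with "shr 13" and the PLI is
;; extracted with "shr (48 + 2)".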
;;; ============================================================================
;;; PON stitched algorithm of AES128-CTR, CRC and BIP
;;; - this is the master macro that implements the encrypt/decrypt API
;;; - calls other macros and directly uses registers
;;;   defined at the top of the file
%macro AES128_CTR_PON 2
%define %%DIR  %1 ; [in] direction "ENC" or "DEC"
%define %%CIPH %2 ; [in] cipher "CTR" or "NO_CTR"

        push            r12
        push            r13
        push            r14
%ifndef LINUX
        push            r15
%endif

%ifidn %%DIR, ENC
        ;; by default write back CRC for encryption
        mov             DWORD(write_back_crc), 1
%else
        ;; mark decryption as finished
        mov             DWORD(decrypt_not_done), 1
%endif
        ;; START BIP (and update HEC if encrypt direction)
        ;; - load XGEM header (8 bytes) for BIP (not part of encrypted payload)
        ;; - convert it into LE
        ;; - update HEC field in the header
        ;; - convert it into BE
        ;; - store back the header (with updated HEC)
        ;; - start BIP
        ;; (free to use tmp_1, tmp2 and tmp_3 at this stage)
        mov             tmp_2, [job + _src]
        add             tmp_2, [job + _hash_start_src_offset_in_bytes]
        mov             tmp_3, [tmp_2]
%ifidn %%DIR, ENC
        bswap           tmp_3                   ; go to LE
        HEC_COMPUTE_64  tmp_3, tmp_1, xtmp1, xtmp2, xtmp3, xtmp4
        mov             bytes_to_crc, tmp_3
        shr             bytes_to_crc, (48 + 2)  ; PLI = MSB 14 bits
        bswap           tmp_3                   ; go back to BE
        mov             [tmp_2], tmp_3
        vmovq           xbip, tmp_3
%else
        vmovq           xbip, tmp_3
        mov             bytes_to_crc, tmp_3
        bswap           bytes_to_crc            ; go to LE
        shr             bytes_to_crc, (48 + 2)  ; PLI = MSB 14 bits
%endif
        cmp             bytes_to_crc, 4
        ja              %%_crc_not_zero
        ;; XGEM payload shorter or equal to 4 bytes
%ifidn %%DIR, ENC
        ;; On encryption, do not write Ethernet FCS back into the message
        xor             DWORD(write_back_crc), DWORD(write_back_crc)
%else
        ;; Mark decryption as not finished
        ;; - Ethernet FCS is not computed
        ;; - decrypt + BIP to be done at the end
        xor             DWORD(decrypt_not_done), DWORD(decrypt_not_done)
%endif
        mov             DWORD(bytes_to_crc), 4  ; it will be zero after the next line (avoid jmp)
%%_crc_not_zero:
        sub             bytes_to_crc, 4         ; subtract size of the CRC itself

%ifidn %%CIPH, CTR
        ;; - read 16 bytes of IV
        ;; - convert to little endian format
        ;; - save least significant 8 bytes in GP register for overflow check
        mov             tmp, [job + _iv]
        vmovdqu         xcounter, [tmp]
        vpshufb         xcounter, [rel byteswap_const]
        vmovq           ctr_check, xcounter
%endif

        ;; get input buffer (after XGEM header)
        mov             p_in, [job + _src]
        add             p_in, [job + _cipher_start_src_offset_in_bytes]

        ;; get output buffer
        mov             p_out, [job + _dst]

%ifidn %%CIPH, CTR
        ;; get key pointers
        mov             p_keys, [job + _aes_enc_key_expanded]
%endif

        ;; initial CRC value
        vmovdqa         xcrc, [rel init_crc_value]

        ;; load CRC constants
        vmovdqa         xcrckey, [rel rk1]      ; rk1 and rk2 in xcrckey

        ;; get number of bytes to cipher
%ifidn %%CIPH, CTR
        mov             num_bytes, [job + _msg_len_to_cipher_in_bytes]
%else
        ;; Message length to cipher is 0
        ;; - length is obtained from message length to hash (BIP) minus XGEM header size
        mov             num_bytes, [job + _msg_len_to_hash_in_bytes]
        sub             num_bytes, 8
%endif
        or              bytes_to_crc, bytes_to_crc
        jz              %%_crc_done

        cmp             bytes_to_crc, 32
        jae             %%_at_least_32_bytes

%ifidn %%DIR, DEC
        ;; decrypt the buffer first
        mov             tmp, num_bytes
        CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
                        xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3

        ;; correct in/out pointers - go back to start of the buffers
        mov             tmp, num_bytes
        and             tmp, -16        ; partial block handler doesn't increment pointers
        sub             p_in, tmp
        sub             p_out, tmp
%endif                          ; DECRYPTION
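        ;; Note: on decryption of short buffers (less than 32 CRC bytes) the
        ;; whole message was just ciphered up front - CRC must run over the
        ;; plaintext - and the pointers were rewound so the tail code below
        ;; can re-read the decrypted data.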
        ;; less than 32 bytes
        cmp             bytes_to_crc, 16
        je              %%_exact_16_left
        jl              %%_less_than_16_left
        ;; load the plaintext
%ifidn %%DIR, ENC
        vmovdqu         xtmp1, [p_in]
%else
        vmovdqu         xtmp1, [p_out]
%endif
        vpxor           xcrc, xtmp1     ; xor the initial crc value
        jmp             %%_crc_two_xmms

%%_exact_16_left:
%ifidn %%DIR, ENC
        vmovdqu         xtmp1, [p_in]
%else
        vmovdqu         xtmp1, [p_out]
%endif
        vpxor           xcrc, xtmp1     ; xor the initial crc value
        jmp             %%_128_done

%%_less_than_16_left:
%ifidn %%DIR, ENC
        simd_load_avx_15_1 xtmp1, p_in, bytes_to_crc
%else
        simd_load_avx_15_1 xtmp1, p_out, bytes_to_crc
%endif
        vpxor           xcrc, xtmp1     ; xor the initial crc value

        lea             tmp, [rel pshufb_shf_table]
        vmovdqu         xtmp1, [tmp + bytes_to_crc]
        vpshufb         xcrc, xtmp1
        jmp             %%_128_done

%%_at_least_32_bytes:
        cmp             bytes_to_crc, 64
        jb              %%_crc_below_64_bytes

        DO_PON_4        p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
                        xcrc, xcrckey, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, xtmp6, \
                        xtmp7, xtmp8, xtmp9, xtmp10, xtmp11, first_crc, %%DIR, \
                        %%CIPH, ctr_check
        sub             num_bytes, 64
        sub             bytes_to_crc, 64
%ifidn %%DIR, ENC
        jz              %%_128_done
%endif

align 16
%%_main_loop_64:
        cmp             bytes_to_crc, 64
        jb              %%_main_loop
        DO_PON_4        p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
                        xcrc, xcrckey, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, xtmp6, \
                        xtmp7, xtmp8, xtmp9, xtmp10, xtmp11, next_crc, %%DIR, \
                        %%CIPH, ctr_check
        sub             num_bytes, 64
        sub             bytes_to_crc, 64
%ifidn %%DIR, ENC
        jz              %%_128_done
%endif
        jmp             %%_main_loop_64

%%_crc_below_64_bytes:
        DO_PON          p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
                        xcrc, xcrckey, xtmp1, xtmp2, xtmp3, first_crc, %%DIR, \
                        %%CIPH, ctr_check
        sub             num_bytes, 16
        sub             bytes_to_crc, 16

align 16
%%_main_loop:
        cmp             bytes_to_crc, 16
        jb              %%_exit_loop
        DO_PON          p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
                        xcrc, xcrckey, xtmp1, xtmp2, xtmp3, next_crc, %%DIR, \
                        %%CIPH, ctr_check
        sub             num_bytes, 16
        sub             bytes_to_crc, 16
%ifidn %%DIR, ENC
        jz              %%_128_done
%endif
        jmp             %%_main_loop

%%_exit_loop:

%ifidn %%DIR, DEC
        ;; decrypt rest of the message including CRC and optional padding
        mov             tmp, num_bytes
        CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
                        xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3

        mov             tmp, num_bytes  ; correct in/out pointers - to point before cipher & BIP
        and             tmp, -16        ; partial block handler doesn't increment pointers
        sub             p_in, tmp
        sub             p_out, tmp

        or              bytes_to_crc, bytes_to_crc
        jz              %%_128_done
%endif                          ; DECRYPTION
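        ;; The tail below completes the CRC over the 1-15 residual bytes:
        ;; the last 16 bytes of data are re-read, pshufb_shf_table realigns
        ;; the CRC state against them, a blend combines data with shifted-out
        ;; CRC bytes, and one final fold brings the state down to 128 bits
        ;; ready for CRC32_REDUCE_128_TO_32.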
        ;; Partial bytes left - complete CRC calculation
%%_crc_two_xmms:
        lea             tmp, [rel pshufb_shf_table]
        vmovdqu         xtmp2, [tmp + bytes_to_crc]
        ;; @note: in case of in-place operation (default) this load is
        ;;        creating store-to-load problem.
        ;;        However, there is no easy way to address it at the moment.
%ifidn %%DIR, ENC
        vmovdqu         xtmp1, [p_in - 16 + bytes_to_crc]  ; xtmp1 = data for CRC
%else
        vmovdqu         xtmp1, [p_out - 16 + bytes_to_crc] ; xtmp1 = data for CRC
%endif
        vmovdqa         xtmp3, xcrc
        vpshufb         xcrc, xtmp2     ; top num_bytes with LSB xcrc
        vpxor           xtmp2, [rel mask3]
        vpshufb         xtmp3, xtmp2    ; bottom (16 - num_bytes) with MSB xcrc

        ;; data bytes_to_crc (top) blended with MSB bytes of CRC (bottom)
        vpblendvb       xtmp3, xtmp1, xtmp2

        ;; final CRC calculation
        vpclmulqdq      xtmp1, xcrc, xcrckey, 0x01
        vpclmulqdq      xcrc, xcrc, xcrckey, 0x10
        vpxor           xcrc, xtmp3
        vpxor           xcrc, xtmp1

%%_128_done:
        CRC32_REDUCE_128_TO_32 ethernet_fcs, xcrc, xtmp1, xtmp2, xcrckey

%%_crc_done:
        ;; @todo - store-to-load problem in ENC case (to be fixed later)
        ;; - store CRC in input buffer and authentication tag output
        ;; - encrypt remaining bytes
%ifidn %%DIR, ENC
        or              DWORD(write_back_crc), DWORD(write_back_crc)
        jz              %%_skip_crc_write_back
        mov             [p_in + bytes_to_crc], DWORD(ethernet_fcs)
%%_skip_crc_write_back:
%endif
        mov             tmp, [job + _auth_tag_output]
        mov             [tmp + 4], DWORD(ethernet_fcs)

        or              num_bytes, num_bytes
        jz              %%_do_not_cipher_the_rest

        ;; encrypt rest of the message
        ;; - partial bytes including CRC and optional padding
        ;; decrypt rest of the message
        ;; - this may only happen when XGEM payload is short and padding is added
%ifidn %%DIR, DEC
        or              DWORD(decrypt_not_done), DWORD(decrypt_not_done)
        jnz             %%_do_not_cipher_the_rest
%endif
        CIPHER_BIP_REST num_bytes, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
                        xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
%%_do_not_cipher_the_rest:

        ;; finalize BIP
        vpsrldq         xtmp1, xbip, 4
        vpsrldq         xtmp2, xbip, 8
        vpsrldq         xtmp3, xbip, 12
        vpxor           xtmp1, xtmp1, xtmp2
        vpxor           xbip, xbip, xtmp3
        vpxor           xbip, xbip, xtmp1
        vmovd           [tmp], xbip     ; tmp already holds _auth_tag_output

        ;; set job status
        or              dword [job + _status], STS_COMPLETED

        ;; return job
        mov             rax, job

%ifndef LINUX
        pop             r15
%endif
        pop             r14
        pop             r13
        pop             r12
%endmacro                       ; AES128_CTR_PON

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; submit_job_pon_enc_avx(JOB_AES_HMAC *job)
align 64
MKGLOBAL(submit_job_pon_enc_avx,function,internal)
submit_job_pon_enc_avx:
        AES128_CTR_PON ENC, CTR
        ret

;;; submit_job_pon_dec_avx(JOB_AES_HMAC *job)
align 64
MKGLOBAL(submit_job_pon_dec_avx,function,internal)
submit_job_pon_dec_avx:
        AES128_CTR_PON DEC, CTR
        ret

;;; submit_job_pon_enc_no_ctr_avx(JOB_AES_HMAC *job)
align 64
MKGLOBAL(submit_job_pon_enc_no_ctr_avx,function,internal)
submit_job_pon_enc_no_ctr_avx:
        AES128_CTR_PON ENC, NO_CTR
        ret

;;; submit_job_pon_dec_no_ctr_avx(JOB_AES_HMAC *job)
align 64
MKGLOBAL(submit_job_pon_dec_no_ctr_avx,function,internal)
submit_job_pon_dec_no_ctr_avx:
        AES128_CTR_PON DEC, NO_CTR
        ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif