diff options
Diffstat (limited to '')
-rw-r--r-- | src/spdk/intel-ipsec-mb/sse/pon_sse.asm | 875 |
1 files changed, 875 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/sse/pon_sse.asm b/src/spdk/intel-ipsec-mb/sse/pon_sse.asm new file mode 100644 index 000000000..32585f5f8 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/pon_sse.asm @@ -0,0 +1,875 @@ +;; +;; Copyright (c) 2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "job_aes_hmac.asm" +%include "include/os.asm" +%include "include/memcpy.asm" + +;;; This is implementation of stitched algorithms: AES128-CTR + CRC32 + BIP +;;; This combination is required by PON/xPON/gPON standard. +;;; Note: BIP is running XOR of double words +;;; Order of operations: +;;; - encrypt: CRC32, AES-CTR and BIP +;;; - decrypt: BIP, AES-CTR and CRC32 + +%ifndef DEC_FN_NAME +%define DEC_FN_NAME submit_job_pon_dec_sse +%endif +%ifndef ENC_FN_NAME +%define ENC_FN_NAME submit_job_pon_enc_sse +%endif +%ifndef ENC_NO_CTR_FN_NAME +%define ENC_NO_CTR_FN_NAME submit_job_pon_enc_no_ctr_sse +%endif +%ifndef DEC_NO_CTR_FN_NAME +%define DEC_NO_CTR_FN_NAME submit_job_pon_dec_no_ctr_sse +%endif + +extern byteswap_const +extern ddq_add_1 + +section .data +default rel + +;;; Precomputed constants for CRC32 (Ethernet FCS) +;;; Details of the CRC algorithm and 4 byte buffer of +;;; {0x01, 0x02, 0x03, 0x04}: +;;; Result Poly Init RefIn RefOut XorOut +;;; 0xB63CFBCD 0x04C11DB7 0xFFFFFFFF true true 0xFFFFFFFF +align 16 +rk1: + dq 0x00000000ccaa009e, 0x00000001751997d0 + +align 16 +rk5: + dq 0x00000000ccaa009e, 0x0000000163cd6124 + +align 16 +rk7: + dq 0x00000001f7011640, 0x00000001db710640 + +align 16 +pshufb_shf_table: + ;; use these values for shift registers with the pshufb instruction + dq 0x8786858483828100, 0x8f8e8d8c8b8a8988 + dq 0x0706050403020100, 0x000e0d0c0b0a0908 + +align 16 +init_crc_value: + dq 0x00000000FFFFFFFF, 0x0000000000000000 + +align 16 +mask: + dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 + +align 16 +mask2: + dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF +align 16 +mask3: + dq 0x8080808080808080, 0x8080808080808080 + +align 16 +mask_out_top_bytes: + dq 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF + dq 0x0000000000000000, 0x0000000000000000 + +;; Precomputed constants for HEC calculation (XGEM header) +;; POLY 0x53900000: +;; k1 = 0xf9800000 +;; k2 = 0xa0900000 +;; k3 = 0x7cc00000 +;; q = 0x46b927ec +;; p_res = 0x53900000 + +align 16 +k3_q: + dq 0x7cc00000, 0x46b927ec + +align 16 +p_res: + dq 0x53900000, 0 + +align 16 +mask_out_top_64bits: + dq 0xffffffff_ffffffff, 0 + +section .text + +%define NUM_AES_ROUNDS 10 + +;; note: leave xmm0 free for implicit blend +%define xcounter xmm7 +%define xbip xmm1 +%define xcrc xmm2 +%define xcrckey xmm3 +%define xtmp1 xmm4 +%define xtmp2 xmm5 +%define xtmp3 xmm6 +%define xtmp4 xmm8 + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%define tmp_1 r8 +%define tmp_2 r9 +%define tmp_3 r10 +%define tmp_4 r11 +%define tmp_5 r12 +%define tmp_6 r13 +%define tmp_7 r14 +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%define tmp_1 r10 +%define tmp_2 r11 +%define tmp_3 rax +%define tmp_4 r12 +%define tmp_5 r13 +%define tmp_6 r14 +%define tmp_7 r15 +%endif + +%define job arg1 + +%define p_in arg2 +%define p_keys arg3 +%define p_out arg4 + +%define num_bytes tmp_1 ; bytes to cipher +%define tmp tmp_2 +%define ctr_check tmp_3 ; counter block overflow check +%define bytes_to_crc tmp_4 ; number of bytes to CRC ( < num_bytes) + +%define ethernet_fcs tmp_6 ; not used together with tmp3 +%define tmp2 tmp_5 +%define tmp3 tmp_6 + +%define write_back_crc tmp_7 +%define decrypt_not_done tmp_7 + +;;; ============================================================================ +;;; Does all AES encryption rounds +%macro AES_ENC_ROUNDS 3 +%define %%KP %1 ; [in] pointer to expanded keys +%define %%N_ROUNDS %2 ; [in] max rounds (128bit: 10, 12, 14) +%define %%BLOCK %3 ; [in/out] XMM with encrypted block + +%assign round 0 + pxor %%BLOCK, [%%KP + (round * 16)] + +%rep (%%N_ROUNDS - 1) +%assign round (round + 1) + aesenc %%BLOCK, [%%KP + (round * 16)] +%endrep + +%assign round (round + 1) + aesenclast %%BLOCK, [%%KP + (round * 16)] + +%endmacro + +;;; ============================================================================ +;;; PON stitched algorithm round on a single AES block (16 bytes): +;;; AES-CTR (optional, depending on %%CIPH) +;;; - prepares counter blocks +;;; - encrypts counter blocks +;;; - loads text +;;; - xor's text against encrypted blocks +;;; - stores cipher text +;;; BIP +;;; - BIP update on 4 x 32-bits +;;; CRC32 +;;; - CRC32 calculation +;;; Note: via selection of no_crc, no_bip, no_load, no_store different macro +;;; behaviour can be achieved to match needs of the overall algorithm. +%macro DO_PON 15 +%define %%KP %1 ; [in] GP, pointer to expanded keys +%define %%N_ROUNDS %2 ; [in] number of AES rounds (10, 12 or 14) +%define %%CTR %3 ; [in/out] XMM with counter block +%define %%INP %4 ; [in/out] GP with input text pointer or "no_load" +%define %%OUTP %5 ; [in/out] GP with output text pointer or "no_store" +%define %%XBIP_IN_OUT %6 ; [in/out] XMM with BIP value or "no_bip" +%define %%XCRC_IN_OUT %7 ; [in/out] XMM with CRC (can be anything if "no_crc" below) +%define %%XCRC_MUL %8 ; [in] XMM with CRC multiplier constant (can be anything if "no_crc" below) +%define %%TXMM0 %9 ; [clobbered|out] XMM temporary or data out (no_store) +%define %%TXMM1 %10 ; [clobbered|in] XMM temporary or data in (no_load) +%define %%TXMM2 %11 ; [clobbered] XMM temporary +%define %%CRC_TYPE %12 ; [in] "first_crc" or "next_crc" or "no_crc" +%define %%DIR %13 ; [in] "ENC" or "DEC" +%define %%CIPH %14 ; [in] "CTR" or "NO_CTR" +%define %%CTR_CHECK %15 ; [in/out] GP with 64bit counter (to identify overflow) + +%ifidn %%CIPH, CTR + ;; prepare counter blocks for encryption + movdqa %%TXMM0, %%CTR + pshufb %%TXMM0, [rel byteswap_const] + ;; perform 1 increment on whole 128 bits + movdqa %%TXMM2, [rel ddq_add_1] + paddq %%CTR, %%TXMM2 + add %%CTR_CHECK, 1 + jnc %%_no_ctr_overflow + ;; Add 1 to the top 64 bits. First shift left value 1 by 64 bits. + pslldq %%TXMM2, 8 + paddq %%CTR, %%TXMM2 +%%_no_ctr_overflow: +%endif + ;; CRC calculation +%ifidn %%CRC_TYPE, next_crc + movdqa %%TXMM2, %%XCRC_IN_OUT + pclmulqdq %%TXMM2, %%XCRC_MUL, 0x01 + pclmulqdq %%XCRC_IN_OUT, %%XCRC_MUL, 0x10 +%endif + +%ifnidn %%INP, no_load + movdqu %%TXMM1, [%%INP] +%endif + +%ifidn %%CIPH, CTR + ;; AES rounds + AES_ENC_ROUNDS %%KP, %%N_ROUNDS, %%TXMM0 + + ;; xor plaintext/ciphertext against encrypted counter blocks + pxor %%TXMM0, %%TXMM1 +%else ;; CIPH = NO_CTR + ;; if no encryption needs to be done, move from input to output reg + movdqa %%TXMM0, %%TXMM1 +%endif ;; CIPH = CTR + +%ifidn %%CIPH, CTR +%ifidn %%DIR, ENC + ;; CRC calculation for ENCRYPTION +%ifidn %%CRC_TYPE, first_crc + ;; in the first run just XOR initial CRC with the first block + pxor %%XCRC_IN_OUT, %%TXMM1 +%endif +%ifidn %%CRC_TYPE, next_crc + ;; - XOR results of CLMUL's together + ;; - then XOR against text block + pxor %%XCRC_IN_OUT, %%TXMM2 + pxor %%XCRC_IN_OUT, %%TXMM1 +%endif +%else + ;; CRC calculation for DECRYPTION +%ifidn %%CRC_TYPE, first_crc + ;; in the first run just XOR initial CRC with the first block + pxor %%XCRC_IN_OUT, %%TXMM0 +%endif +%ifidn %%CRC_TYPE, next_crc + ;; - XOR results of CLMUL's together + ;; - then XOR against text block + pxor %%XCRC_IN_OUT, %%TXMM2 + pxor %%XCRC_IN_OUT, %%TXMM0 +%endif +%endif ; DECRYPT +%else ;; CIPH = NO_CTR + ;; CRC calculation for DECRYPTION +%ifidn %%CRC_TYPE, first_crc + ;; in the first run just XOR initial CRC with the first block + pxor %%XCRC_IN_OUT, %%TXMM1 +%endif +%ifidn %%CRC_TYPE, next_crc + ;; - XOR results of CLMUL's together + ;; - then XOR against text block + pxor %%XCRC_IN_OUT, %%TXMM2 + pxor %%XCRC_IN_OUT, %%TXMM1 +%endif + +%endif ;; CIPH = CTR + + ;; store the result in the output buffer +%ifnidn %%OUTP, no_store + movdqu [%%OUTP], %%TXMM0 +%endif + + ;; update BIP value - always use cipher text for BIP +%ifidn %%DIR, ENC +%ifnidn %%XBIP_IN_OUT, no_bip + pxor %%XBIP_IN_OUT, %%TXMM0 +%endif +%else +%ifnidn %%XBIP_IN_OUT, no_bip + pxor %%XBIP_IN_OUT, %%TXMM1 +%endif +%endif ; DECRYPT + + ;; increment in/out pointers +%ifnidn %%INP, no_load + add %%INP, 16 +%endif +%ifnidn %%OUTP, no_store + add %%OUTP, 16 +%endif +%endmacro ; DO_PON + +;;; ============================================================================ +;;; CIPHER and BIP specified number of bytes +%macro CIPHER_BIP_REST 14 +%define %%NUM_BYTES %1 ; [in/clobbered] number of bytes to cipher +%define %%DIR %2 ; [in] "ENC" or "DEC" +%define %%CIPH %3 ; [in] "CTR" or "NO_CTR" +%define %%PTR_IN %4 ; [in/clobbered] GPR pointer to input buffer +%define %%PTR_OUT %5 ; [in/clobbered] GPR pointer to output buffer +%define %%PTR_KEYS %6 ; [in] GPR pointer to expanded keys +%define %%XBIP_IN_OUT %7 ; [in/out] XMM 128-bit BIP state +%define %%XCTR_IN_OUT %8 ; [in/out] XMM 128-bit AES counter block +%define %%XMMT1 %9 ; [clobbered] temporary XMM +%define %%XMMT2 %10 ; [clobbered] temporary XMM +%define %%XMMT3 %11 ; [clobbered] temporary XMM +%define %%CTR_CHECK %12 ; [in/out] GP with 64bit counter (to identify overflow) +%define %%GPT1 %13 ; [clobbered] temporary GP +%define %%GPT2 %14 ; [clobbered] temporary GP + +%%_cipher_last_blocks: + cmp %%NUM_BYTES, 16 + jb %%_partial_block_left + + DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, %%PTR_IN, %%PTR_OUT, %%XBIP_IN_OUT, \ + no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK + sub %%NUM_BYTES, 16 + jz %%_bip_done + jmp %%_cipher_last_blocks + +%%_partial_block_left: + simd_load_sse_15_1 %%XMMT2, %%PTR_IN, %%NUM_BYTES + + ;; DO_PON() is not loading nor storing the data in this case: + ;; XMMT2 = data in + ;; XMMT1 = data out + DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, no_load, no_store, no_bip, \ + no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK + + ;; BIP update for partial block (mask out bytes outside the message) + lea %%GPT1, [rel mask_out_top_bytes + 16] + sub %%GPT1, %%NUM_BYTES + movdqu %%XMMT3, [%%GPT1] + ;; put masked cipher text into XMMT2 for BIP update +%ifidn %%DIR, ENC + movdqa %%XMMT2, %%XMMT1 + pand %%XMMT2, %%XMMT3 +%else + pand %%XMMT2, %%XMMT3 +%endif + pxor %%XBIP_IN_OUT, %%XMMT2 + + ;; store partial bytes in the output buffer + simd_store_sse_15 %%PTR_OUT, %%XMMT1, %%NUM_BYTES, %%GPT1, %%GPT2 + +%%_bip_done: +%endmacro ; CIPHER_BIP_REST +;; ============================================================================= +;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial + +%macro CRC32_REDUCE_128_TO_32 5 +%define %%CRC %1 ; [out] GP to store 32-bit Ethernet FCS value +%define %%XCRC %2 ; [in/clobbered] XMM with CRC +%define %%XT1 %3 ; [clobbered] temporary xmm register +%define %%XT2 %4 ; [clobbered] temporary xmm register +%define %%XT3 %5 ; [clobbered] temporary xmm register + +%define %%XCRCKEY %%XT3 + + ;; compute CRC of a 128-bit value + movdqa %%XCRCKEY, [rel rk5] + + ;; 64b fold + movdqa %%XT1, %%XCRC + pclmulqdq %%XT1, %%XCRCKEY, 0x00 + psrldq %%XCRC, 8 + pxor %%XCRC, %%XT1 + + ;; 32b fold + movdqa %%XT1, %%XCRC + pslldq %%XT1, 4 + pclmulqdq %%XT1, %%XCRCKEY, 0x10 + pxor %%XCRC, %%XT1 + +%%_crc_barrett: + ;; Barrett reduction + pand %%XCRC, [rel mask2] + movdqa %%XT1, %%XCRC + movdqa %%XT2, %%XCRC + movdqa %%XCRCKEY, [rel rk7] + + pclmulqdq %%XCRC, %%XCRCKEY, 0x00 + pxor %%XCRC, %%XT2 + pand %%XCRC, [rel mask] + movdqa %%XT2, %%XCRC + pclmulqdq %%XCRC, %%XCRCKEY, 0x10 + pxor %%XCRC, %%XT2 + pxor %%XCRC, %%XT1 + pextrd DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value + not DWORD(%%CRC) +%endmacro + +;; ============================================================================= +;; Barrett reduction from 128-bits to 32-bits modulo 0x53900000 polynomial + +%macro HEC_REDUCE_128_TO_32 4 +%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out +%define %%XT1 %2 ; [clobbered] temporary xmm register +%define %%XT2 %3 ; [clobbered] temporary xmm register +%define %%XT3 %4 ; [clobbered] temporary xmm register + +%define %%K3_Q %%XT1 +%define %%P_RES %%XT2 +%define %%XTMP %%XT3 + + ;; 128 to 64 bit reduction + movdqa %%K3_Q, [k3_q] + movdqa %%P_RES, [p_res] + + movdqa %%XTMP, %%XMM_IN_OUT + pclmulqdq %%XTMP, %%K3_Q, 0x01 ; K3 + pxor %%XTMP, %%XMM_IN_OUT + + pclmulqdq %%XTMP, %%K3_Q, 0x01 ; K3 + pxor %%XMM_IN_OUT, %%XTMP + + pand %%XMM_IN_OUT, [rel mask_out_top_64bits] + + ;; 64 to 32 bit reduction + movdqa %%XTMP, %%XMM_IN_OUT + psrldq %%XTMP, 4 + pclmulqdq %%XTMP, %%K3_Q, 0x10 ; Q + pxor %%XTMP, %%XMM_IN_OUT + psrldq %%XTMP, 4 + + pclmulqdq %%XTMP, %%P_RES, 0x00 ; P + pxor %%XMM_IN_OUT, %%XTMP +%endmacro + +;; ============================================================================= +;; Barrett reduction from 64-bits to 32-bits modulo 0x53900000 polynomial + +%macro HEC_REDUCE_64_TO_32 4 +%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out +%define %%XT1 %2 ; [clobbered] temporary xmm register +%define %%XT2 %3 ; [clobbered] temporary xmm register +%define %%XT3 %4 ; [clobbered] temporary xmm register + +%define %%K3_Q %%XT1 +%define %%P_RES %%XT2 +%define %%XTMP %%XT3 + + movdqa %%K3_Q, [k3_q] + movdqa %%P_RES, [p_res] + + ;; 64 to 32 bit reduction + movdqa %%XTMP, %%XMM_IN_OUT + psrldq %%XTMP, 4 + pclmulqdq %%XTMP, %%K3_Q, 0x10 ; Q + pxor %%XTMP, %%XMM_IN_OUT + psrldq %%XTMP, 4 + + pclmulqdq %%XTMP, %%P_RES, 0x00 ; P + pxor %%XMM_IN_OUT, %%XTMP +%endmacro + +;; ============================================================================= +;; HEC compute and header update for 32-bit XGEM headers +%macro HEC_COMPUTE_32 6 +%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format +%define %%GT1 %2 ; [clobbered] temporary GP register +%define %%XT1 %4 ; [clobbered] temporary xmm register +%define %%XT2 %5 ; [clobbered] temporary xmm register +%define %%XT3 %6 ; [clobbered] temporary xmm register +%define %%XT4 %7 ; [clobbered] temporary xmm register + + mov DWORD(%%GT1), DWORD(%%HEC_IN_OUT) + ;; shift out 13 bits of HEC value for CRC computation + shr DWORD(%%GT1), 13 + + ;; mask out current HEC value to merge with an updated HEC at the end + and DWORD(%%HEC_IN_OUT), 0xffff_e000 + + ;; prepare the message for CRC computation + movd %%XT1, DWORD(%%GT1) + pslldq %%XT1, 4 ; shift left by 32-bits + + HEC_REDUCE_64_TO_32 %%XT1, %%XT2, %%XT3, %%XT4 + + ;; extract 32-bit value + ;; - normally perform 20 bit shift right but bit 0 is a parity bit + movd DWORD(%%GT1), %%XT1 + shr DWORD(%%GT1), (20 - 1) + + ;; merge header bytes with updated 12-bit CRC value and + ;; compute parity + or DWORD(%%GT1), DWORD(%%HEC_IN_OUT) + popcnt DWORD(%%HEC_IN_OUT), DWORD(%%GT1) + and DWORD(%%HEC_IN_OUT), 1 + or DWORD(%%HEC_IN_OUT), DWORD(%%GT1) +%endmacro + +;; ============================================================================= +;; HEC compute and header update for 64-bit XGEM headers +%macro HEC_COMPUTE_64 6 +%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format +%define %%GT1 %2 ; [clobbered] temporary GP register +%define %%XT1 %3 ; [clobbered] temporary xmm register +%define %%XT2 %4 ; [clobbered] temporary xmm register +%define %%XT3 %5 ; [clobbered] temporary xmm register +%define %%XT4 %6 ; [clobbered] temporary xmm register + + mov %%GT1, %%HEC_IN_OUT + ;; shift out 13 bits of HEC value for CRC computation + shr %%GT1, 13 + + ;; mask out current HEC value to merge with an updated HEC at the end + and %%HEC_IN_OUT, 0xffff_ffff_ffff_e000 + + ;; prepare the message for CRC computation + movq %%XT1, %%GT1 + pslldq %%XT1, 4 ; shift left by 32-bits + + HEC_REDUCE_128_TO_32 %%XT1, %%XT2, %%XT3, %%XT4 + + ;; extract 32-bit value + ;; - normally perform 20 bit shift right but bit 0 is a parity bit + movd DWORD(%%GT1), %%XT1 + shr DWORD(%%GT1), (20 - 1) + + ;; merge header bytes with updated 12-bit CRC value and + ;; compute parity + or %%GT1, %%HEC_IN_OUT + popcnt %%HEC_IN_OUT, %%GT1 + and %%HEC_IN_OUT, 1 + or %%HEC_IN_OUT, %%GT1 +%endmacro + +;;; ============================================================================ +;;; PON stitched algorithm of AES128-CTR, CRC and BIP +;;; - this is master macro that implements encrypt/decrypt API +;;; - calls other macros and directly uses registers +;;; defined at the top of the file +%macro AES128_CTR_PON 2 +%define %%DIR %1 ; [in] direction "ENC" or "DEC" +%define %%CIPH %2 ; [in] cipher "CTR" or "NO_CTR" + + push r12 + push r13 + push r14 +%ifndef LINUX + push r15 +%endif + +%ifidn %%DIR, ENC + ;; by default write back CRC for encryption + mov DWORD(write_back_crc), 1 +%else + ;; mark decryption as finished + mov DWORD(decrypt_not_done), 1 +%endif + ;; START BIP (and update HEC if encrypt direction) + ;; - load XGEM header (8 bytes) for BIP (not part of encrypted payload) + ;; - convert it into LE + ;; - update HEC field in the header + ;; - convert it into BE + ;; - store back the header (with updated HEC) + ;; - start BIP + ;; (free to use tmp_1, tmp_2 and tmp_3 at this stage) + mov tmp_2, [job + _src] + add tmp_2, [job + _hash_start_src_offset_in_bytes] + mov tmp_3, [tmp_2] +%ifidn %%DIR, ENC + bswap tmp_3 ; go to LE + HEC_COMPUTE_64 tmp_3, tmp_1, xtmp1, xtmp2, xtmp3, xtmp4 + mov bytes_to_crc, tmp_3 + shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits + bswap tmp_3 ; go back to BE + mov [tmp_2], tmp_3 + movq xbip, tmp_3 +%else + movq xbip, tmp_3 + mov bytes_to_crc, tmp_3 + bswap bytes_to_crc ; go to LE + shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits +%endif + cmp bytes_to_crc, 4 + ja %%_crc_not_zero + ;; XGEM payload shorter or equal to 4 bytes +%ifidn %%DIR, ENC + ;; Don't write Ethernet FCS on encryption + xor DWORD(write_back_crc), DWORD(write_back_crc) +%else + ;; Mark decryption as not finished + ;; - Ethernet FCS is not computed + ;; - decrypt + BIP to be done at the end + xor DWORD(decrypt_not_done), DWORD(decrypt_not_done) +%endif + mov DWORD(bytes_to_crc), 4 ; it will be zero after the sub (avoid jmp) +%%_crc_not_zero: + sub bytes_to_crc, 4 ; subtract size of the CRC itself + +%ifidn %%CIPH, CTR + ;; - read 16 bytes of IV + ;; - convert to little endian format + ;; - save least significant 8 bytes in GP register for overflow check + mov tmp, [job + _iv] + movdqu xcounter, [tmp] + pshufb xcounter, [rel byteswap_const] + movq ctr_check, xcounter +%endif + + ;; get input buffer (after XGEM header) + mov p_in, [job + _src] + add p_in, [job + _cipher_start_src_offset_in_bytes] + + ;; get output buffer + mov p_out, [job + _dst] + +%ifidn %%CIPH, CTR + ;; get key pointers + mov p_keys, [job + _aes_enc_key_expanded] +%endif + + ;; initial CRC value + movdqa xcrc, [rel init_crc_value] + + ;; load CRC constants + movdqa xcrckey, [rel rk1] ; rk1 and rk2 in xcrckey + + ;; get number of bytes to cipher +%ifidn %%CIPH, CTR + mov num_bytes, [job + _msg_len_to_cipher_in_bytes] +%else + ;; Message length to cipher is 0 + ;; - length is obtained from message length to hash (BIP) minus XGEM header size + mov num_bytes, [job + _msg_len_to_hash_in_bytes] + sub num_bytes, 8 +%endif + or bytes_to_crc, bytes_to_crc + jz %%_crc_done + + cmp bytes_to_crc, 32 + jae %%_at_least_32_bytes + +%ifidn %%DIR, DEC + ;; decrypt the buffer first + mov tmp, num_bytes + CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \ + xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3 + + ;; correct in/out pointers - go back to start of the buffers + mov tmp, num_bytes + and tmp, -16 ; partial block handler doesn't increment pointers + sub p_in, tmp + sub p_out, tmp +%endif ; DECRYPTION + + ;; less than 32 bytes + cmp bytes_to_crc, 16 + je %%_exact_16_left + jl %%_less_than_16_left + ;; load the plaintext +%ifidn %%DIR, ENC + movdqu xtmp1, [p_in] +%else + movdqu xtmp1, [p_out] +%endif + pxor xcrc, xtmp1 ; xor the initial crc value + jmp %%_crc_two_xmms + +%%_exact_16_left: +%ifidn %%DIR, ENC + movdqu xtmp1, [p_in] +%else + movdqu xtmp1, [p_out] +%endif + pxor xcrc, xtmp1 ; xor the initial CRC value + jmp %%_128_done + +%%_less_than_16_left: +%ifidn %%DIR, ENC + simd_load_sse_15_1 xtmp1, p_in, bytes_to_crc +%else + simd_load_sse_15_1 xtmp1, p_out, bytes_to_crc +%endif + pxor xcrc, xtmp1 ; xor the initial CRC value + + lea tmp, [rel pshufb_shf_table] + movdqu xtmp1, [tmp + bytes_to_crc] + pshufb xcrc, xtmp1 + jmp %%_128_done + +%%_at_least_32_bytes: + DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \ + xcrc, xcrckey, xtmp1, xtmp2, xtmp3, first_crc, %%DIR, %%CIPH, ctr_check + sub num_bytes, 16 + sub bytes_to_crc, 16 + +%%_main_loop: + cmp bytes_to_crc, 16 + jb %%_exit_loop + DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \ + xcrc, xcrckey, xtmp1, xtmp2, xtmp3, next_crc, %%DIR, %%CIPH, ctr_check + sub num_bytes, 16 + sub bytes_to_crc, 16 +%ifidn %%DIR, ENC + jz %%_128_done +%endif + jmp %%_main_loop + +%%_exit_loop: + +%ifidn %%DIR, DEC + ;; decrypt rest of the message including CRC and optional padding + mov tmp, num_bytes + + CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \ + xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3 + + mov tmp, num_bytes ; correct in/out pointers - to point before cipher & BIP + and tmp, -16 ; partial block handler doesn't increment pointers + sub p_in, tmp + sub p_out, tmp + + or bytes_to_crc, bytes_to_crc + jz %%_128_done +%endif ; DECRYPTION + + ;; Partial bytes left - complete CRC calculation +%%_crc_two_xmms: + lea tmp, [rel pshufb_shf_table] + movdqu xtmp2, [tmp + bytes_to_crc] +%ifidn %%DIR, ENC + movdqu xtmp1, [p_in - 16 + bytes_to_crc] ; xtmp1 = data for CRC +%else + movdqu xtmp1, [p_out - 16 + bytes_to_crc] ; xtmp1 = data for CRC +%endif + movdqa xtmp3, xcrc + pshufb xcrc, xtmp2 ; top num_bytes with LSB xcrc + pxor xtmp2, [rel mask3] + pshufb xtmp3, xtmp2 ; bottom (16 - num_bytes) with MSB xcrc + + ;; data num_bytes (top) blended with MSB bytes of CRC (bottom) + movdqa xmm0, xtmp2 + pblendvb xtmp3, xtmp1 ; xmm0 implicit + + ;; final CRC calculation + movdqa xtmp1, xcrc + pclmulqdq xtmp1, xcrckey, 0x01 + pclmulqdq xcrc, xcrckey, 0x10 + pxor xcrc, xtmp3 + pxor xcrc, xtmp1 + +%%_128_done: + CRC32_REDUCE_128_TO_32 ethernet_fcs, xcrc, xtmp1, xtmp2, xcrckey + +%%_crc_done: + ;; @todo - store-to-load problem in ENC case (to be fixed later) + ;; - store CRC in input buffer and authentication tag output + ;; - encrypt remaining bytes +%ifidn %%DIR, ENC + or DWORD(write_back_crc), DWORD(write_back_crc) + jz %%_skip_crc_write_back + mov [p_in + bytes_to_crc], DWORD(ethernet_fcs) +%%_skip_crc_write_back: +%endif + mov tmp, [job + _auth_tag_output] + mov [tmp + 4], DWORD(ethernet_fcs) + + or num_bytes, num_bytes + jz %%_do_not_cipher_the_rest + + ;; encrypt rest of the message + ;; - partial bytes including CRC and optional padding + ;; decrypt rest of the message + ;; - this may only happen when XGEM payload is short and padding is added +%ifidn %%DIR, DEC + or DWORD(decrypt_not_done), DWORD(decrypt_not_done) + jnz %%_do_not_cipher_the_rest +%endif + CIPHER_BIP_REST num_bytes, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \ + xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3 +%%_do_not_cipher_the_rest: + + ;; finalize BIP + movdqa xtmp1, xbip + movdqa xtmp2, xbip + movdqa xtmp3, xbip + psrldq xtmp1, 4 + psrldq xtmp2, 8 + psrldq xtmp3, 12 + pxor xtmp1, xtmp2 + pxor xbip, xtmp3 + pxor xbip, xtmp1 + movd [tmp], xbip + + ;; set job status + or dword [job + _status], STS_COMPLETED + + ;; return job + mov rax, job + +%ifndef LINUX + pop r15 +%endif + pop r14 + pop r13 + pop r12 +%endmacro ; AES128_CTR_PON + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;; aes_cntr_128_pon_enc_sse(JOB_AES_HMAC *job) +align 32 +MKGLOBAL(ENC_FN_NAME,function,internal) +ENC_FN_NAME: + AES128_CTR_PON ENC, CTR + ret + +;;; aes_cntr_128_pon_dec_sse(JOB_AES_HMAC *job) +align 32 +MKGLOBAL(DEC_FN_NAME,function,internal) +DEC_FN_NAME: + AES128_CTR_PON DEC, CTR + ret + +;;; aes_cntr_128_pon_enc_no_ctr_sse(JOB_AES_HMAC *job) +align 32 +MKGLOBAL(ENC_NO_CTR_FN_NAME,function,internal) +ENC_NO_CTR_FN_NAME: + AES128_CTR_PON ENC, NO_CTR + ret + +;;; aes_cntr_128_pon_dec_no_ctr_sse(JOB_AES_HMAC *job) +align 32 +MKGLOBAL(DEC_NO_CTR_FN_NAME,function,internal) +DEC_NO_CTR_FN_NAME: + AES128_CTR_PON DEC, NO_CTR + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif |