Diffstat (limited to 'src/spdk/intel-ipsec-mb/sse/pon_sse.asm')
-rw-r--r-- src/spdk/intel-ipsec-mb/sse/pon_sse.asm | 875
1 file changed, 875 insertions(+), 0 deletions(-)
diff --git a/src/spdk/intel-ipsec-mb/sse/pon_sse.asm b/src/spdk/intel-ipsec-mb/sse/pon_sse.asm
new file mode 100644
index 000000000..32585f5f8
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/sse/pon_sse.asm
@@ -0,0 +1,875 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "job_aes_hmac.asm"
+%include "include/os.asm"
+%include "include/memcpy.asm"
+
+;;; This is an implementation of the stitched algorithms: AES128-CTR + CRC32 + BIP.
+;;; This combination is required by the PON/xPON/gPON standards.
+;;; Note: BIP is a running XOR of 32-bit double words.
+;;; Order of operations:
+;;; - encrypt: CRC32, AES-CTR and BIP
+;;; - decrypt: BIP, AES-CTR and CRC32
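+;;;
+;;; Per 16-byte block this gives the following data flow (an illustrative
+;;; sketch only; the names below are descriptive, not part of the code):
+;;; - encrypt:  crc    = crc32_fold(crc, plaintext)
+;;;             cipher = plaintext XOR AES-ENC(counter++)
+;;;             bip    = bip XOR cipher
+;;; - decrypt:  bip    = bip XOR cipher
+;;;             plain  = cipher XOR AES-ENC(counter++)
+;;;             crc    = crc32_fold(crc, plain)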
+
+%ifndef DEC_FN_NAME
+%define DEC_FN_NAME submit_job_pon_dec_sse
+%endif
+%ifndef ENC_FN_NAME
+%define ENC_FN_NAME submit_job_pon_enc_sse
+%endif
+%ifndef ENC_NO_CTR_FN_NAME
+%define ENC_NO_CTR_FN_NAME submit_job_pon_enc_no_ctr_sse
+%endif
+%ifndef DEC_NO_CTR_FN_NAME
+%define DEC_NO_CTR_FN_NAME submit_job_pon_dec_no_ctr_sse
+%endif
+
+extern byteswap_const
+extern ddq_add_1
+
+section .data
+default rel
+
+;;; Precomputed constants for CRC32 (Ethernet FCS)
+;;; Details of the CRC algorithm and the expected result for the
+;;; 4-byte buffer {0x01, 0x02, 0x03, 0x04}:
+;;; Result Poly Init RefIn RefOut XorOut
+;;; 0xB63CFBCD 0x04C11DB7 0xFFFFFFFF true true 0xFFFFFFFF
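+;;; Bitwise reference for the vector above (illustrative pseudocode;
+;;; 0xEDB88320 is the bit-reflected form of the 0x04C11DB7 polynomial):
+;;;   crc = 0xFFFFFFFF
+;;;   for each message byte b:
+;;;       crc = crc XOR b
+;;;       repeat 8 times:
+;;;           lsb = crc AND 1
+;;;           crc = crc >> 1
+;;;           if lsb != 0: crc = crc XOR 0xEDB88320
+;;;   return NOT crc       ; 0xB63CFBCD for {0x01, 0x02, 0x03, 0x04}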
+align 16
+rk1:
+ dq 0x00000000ccaa009e, 0x00000001751997d0
+
+align 16
+rk5:
+ dq 0x00000000ccaa009e, 0x0000000163cd6124
+
+align 16
+rk7:
+ dq 0x00000001f7011640, 0x00000001db710640
+
+align 16
+pshufb_shf_table:
+ ;; use these values for shift registers with the pshufb instruction
+ dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+ dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+align 16
+init_crc_value:
+ dq 0x00000000FFFFFFFF, 0x0000000000000000
+
+align 16
+mask:
+ dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+
+align 16
+mask2:
+ dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+align 16
+mask3:
+ dq 0x8080808080808080, 0x8080808080808080
+
+align 16
+mask_out_top_bytes:
+ dq 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+ dq 0x0000000000000000, 0x0000000000000000
+
+;; Precomputed constants for HEC calculation (XGEM header)
+;; POLY 0x53900000:
+;; k1 = 0xf9800000
+;; k2 = 0xa0900000
+;; k3 = 0x7cc00000
+;; q = 0x46b927ec
+;; p_res = 0x53900000
+
+align 16
+k3_q:
+ dq 0x7cc00000, 0x46b927ec
+
+align 16
+p_res:
+ dq 0x53900000, 0
+
+align 16
+mask_out_top_64bits:
+ dq 0xffffffff_ffffffff, 0
+
+section .text
+
+%define NUM_AES_ROUNDS 10
+
+;; note: leave xmm0 free for implicit blend
+%define xcounter xmm7
+%define xbip xmm1
+%define xcrc xmm2
+%define xcrckey xmm3
+%define xtmp1 xmm4
+%define xtmp2 xmm5
+%define xtmp3 xmm6
+%define xtmp4 xmm8
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define tmp_1 r8
+%define tmp_2 r9
+%define tmp_3 r10
+%define tmp_4 r11
+%define tmp_5 r12
+%define tmp_6 r13
+%define tmp_7 r14
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define tmp_1 r10
+%define tmp_2 r11
+%define tmp_3 rax
+%define tmp_4 r12
+%define tmp_5 r13
+%define tmp_6 r14
+%define tmp_7 r15
+%endif
+
+%define job arg1
+
+%define p_in arg2
+%define p_keys arg3
+%define p_out arg4
+
+%define num_bytes tmp_1 ; bytes to cipher
+%define tmp tmp_2
+%define ctr_check tmp_3 ; counter block overflow check
+%define bytes_to_crc tmp_4 ; number of bytes to CRC ( < num_bytes)
+
+%define ethernet_fcs tmp_6 ; aliased with tmp3 below - never live at the same time
+%define tmp2 tmp_5
+%define tmp3 tmp_6
+
+%define write_back_crc tmp_7
+%define decrypt_not_done tmp_7
+
+;;; ============================================================================
+;;; Does all AES encryption rounds
+%macro AES_ENC_ROUNDS 3
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%N_ROUNDS %2 ; [in] number of AES rounds (AES128: 10, AES192: 12, AES256: 14)
+%define %%BLOCK %3 ; [in/out] XMM with encrypted block
+
+%assign round 0
+ pxor %%BLOCK, [%%KP + (round * 16)]
+
+%rep (%%N_ROUNDS - 1)
+%assign round (round + 1)
+ aesenc %%BLOCK, [%%KP + (round * 16)]
+%endrep
+
+%assign round (round + 1)
+ aesenclast %%BLOCK, [%%KP + (round * 16)]
+
+%endmacro
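+
+;; Example use (illustrative only; this mirrors the invocation in DO_PON
+;; below - the key pointer and block register are hypothetical):
+;;     AES_ENC_ROUNDS rdx, 10, xmm4 ; AES128 encrypt the block in xmm4 with
+;;                                  ; the expanded key schedule at rdx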
+
+;;; ============================================================================
+;;; PON stitched algorithm round on a single AES block (16 bytes):
+;;; AES-CTR (optional, depending on %%CIPH)
+;;; - prepares counter blocks
+;;; - encrypts counter blocks
+;;; - loads text
+;;; - XORs text against the encrypted counter blocks
+;;; - stores cipher text
+;;; BIP
+;;; - BIP update on 4 x 32-bits
+;;; CRC32
+;;; - CRC32 calculation
+;;; Note: selecting no_crc, no_bip, no_load or no_store changes the macro's
+;;; behaviour to match the needs of the overall algorithm.
+%macro DO_PON 15
+%define %%KP %1 ; [in] GP, pointer to expanded keys
+%define %%N_ROUNDS %2 ; [in] number of AES rounds (10, 12 or 14)
+%define %%CTR %3 ; [in/out] XMM with counter block
+%define %%INP %4 ; [in/out] GP with input text pointer or "no_load"
+%define %%OUTP %5 ; [in/out] GP with output text pointer or "no_store"
+%define %%XBIP_IN_OUT %6 ; [in/out] XMM with BIP value or "no_bip"
+%define %%XCRC_IN_OUT %7 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
+%define %%XCRC_MUL %8 ; [in] XMM with CRC multiplier constant (can be anything if "no_crc" below)
+%define %%TXMM0 %9 ; [clobbered|out] XMM temporary or data out (no_store)
+%define %%TXMM1 %10 ; [clobbered|in] XMM temporary or data in (no_load)
+%define %%TXMM2 %11 ; [clobbered] XMM temporary
+%define %%CRC_TYPE %12 ; [in] "first_crc" or "next_crc" or "no_crc"
+%define %%DIR %13 ; [in] "ENC" or "DEC"
+%define %%CIPH %14 ; [in] "CTR" or "NO_CTR"
+%define %%CTR_CHECK %15 ; [in/out] GP with 64bit counter (to identify overflow)
+
+%ifidn %%CIPH, CTR
+ ;; prepare counter blocks for encryption
+ movdqa %%TXMM0, %%CTR
+ pshufb %%TXMM0, [rel byteswap_const]
+ ;; perform 1 increment on whole 128 bits
+ movdqa %%TXMM2, [rel ddq_add_1]
+ paddq %%CTR, %%TXMM2
+ add %%CTR_CHECK, 1
+ jnc %%_no_ctr_overflow
+ ;; Add 1 to the top 64 bits. First, shift the value 1 left by 64 bits.
+ pslldq %%TXMM2, 8
+ paddq %%CTR, %%TXMM2
+%%_no_ctr_overflow:
+%endif
+ ;; CRC calculation
+%ifidn %%CRC_TYPE, next_crc
+ movdqa %%TXMM2, %%XCRC_IN_OUT
+ pclmulqdq %%TXMM2, %%XCRC_MUL, 0x01
+ pclmulqdq %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
+%endif
+
+%ifnidn %%INP, no_load
+ movdqu %%TXMM1, [%%INP]
+%endif
+
+%ifidn %%CIPH, CTR
+ ;; AES rounds
+ AES_ENC_ROUNDS %%KP, %%N_ROUNDS, %%TXMM0
+
+ ;; xor plaintext/ciphertext against encrypted counter blocks
+ pxor %%TXMM0, %%TXMM1
+%else ;; CIPH = NO_CTR
+ ;; if no encryption needs to be done, move from input to output reg
+ movdqa %%TXMM0, %%TXMM1
+%endif ;; CIPH = CTR
+
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+ ;; CRC calculation for ENCRYPTION
+%ifidn %%CRC_TYPE, first_crc
+ ;; in the first run just XOR initial CRC with the first block
+ pxor %%XCRC_IN_OUT, %%TXMM1
+%endif
+%ifidn %%CRC_TYPE, next_crc
+ ;; - XOR results of CLMUL's together
+ ;; - then XOR against text block
+ pxor %%XCRC_IN_OUT, %%TXMM2
+ pxor %%XCRC_IN_OUT, %%TXMM1
+%endif
+%else
+ ;; CRC calculation for DECRYPTION
+%ifidn %%CRC_TYPE, first_crc
+ ;; in the first run just XOR initial CRC with the first block
+ pxor %%XCRC_IN_OUT, %%TXMM0
+%endif
+%ifidn %%CRC_TYPE, next_crc
+ ;; - XOR results of CLMUL's together
+ ;; - then XOR against text block
+ pxor %%XCRC_IN_OUT, %%TXMM2
+ pxor %%XCRC_IN_OUT, %%TXMM0
+%endif
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ ;; CRC calculation for DECRYPTION
+%ifidn %%CRC_TYPE, first_crc
+ ;; in the first run just XOR initial CRC with the first block
+ pxor %%XCRC_IN_OUT, %%TXMM1
+%endif
+%ifidn %%CRC_TYPE, next_crc
+ ;; - XOR results of CLMUL's together
+ ;; - then XOR against text block
+ pxor %%XCRC_IN_OUT, %%TXMM2
+ pxor %%XCRC_IN_OUT, %%TXMM1
+%endif
+
+%endif ;; CIPH = CTR
+
+ ;; store the result in the output buffer
+%ifnidn %%OUTP, no_store
+ movdqu [%%OUTP], %%TXMM0
+%endif
+
+ ;; update BIP value - always use cipher text for BIP
+%ifidn %%DIR, ENC
+%ifnidn %%XBIP_IN_OUT, no_bip
+ pxor %%XBIP_IN_OUT, %%TXMM0
+%endif
+%else
+%ifnidn %%XBIP_IN_OUT, no_bip
+ pxor %%XBIP_IN_OUT, %%TXMM1
+%endif
+%endif ; DECRYPT
+
+ ;; increment in/out pointers
+%ifnidn %%INP, no_load
+ add %%INP, 16
+%endif
+%ifnidn %%OUTP, no_store
+ add %%OUTP, 16
+%endif
+%endmacro ; DO_PON
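+
+;; Example use (illustrative only; this is how the main CRC loop below
+;; invokes it for encryption):
+;;     DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+;;            xcrc, xcrckey, xtmp1, xtmp2, xtmp3, next_crc, ENC, CTR, ctr_check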
+
+;;; ============================================================================
+;;; CIPHER and BIP specified number of bytes
+%macro CIPHER_BIP_REST 14
+%define %%NUM_BYTES %1 ; [in/clobbered] number of bytes to cipher
+%define %%DIR %2 ; [in] "ENC" or "DEC"
+%define %%CIPH %3 ; [in] "CTR" or "NO_CTR"
+%define %%PTR_IN %4 ; [in/clobbered] GPR pointer to input buffer
+%define %%PTR_OUT %5 ; [in/clobbered] GPR pointer to output buffer
+%define %%PTR_KEYS %6 ; [in] GPR pointer to expanded keys
+%define %%XBIP_IN_OUT %7 ; [in/out] XMM 128-bit BIP state
+%define %%XCTR_IN_OUT %8 ; [in/out] XMM 128-bit AES counter block
+%define %%XMMT1 %9 ; [clobbered] temporary XMM
+%define %%XMMT2 %10 ; [clobbered] temporary XMM
+%define %%XMMT3 %11 ; [clobbered] temporary XMM
+%define %%CTR_CHECK %12 ; [in/out] GP with 64bit counter (to identify overflow)
+%define %%GPT1 %13 ; [clobbered] temporary GP
+%define %%GPT2 %14 ; [clobbered] temporary GP
+
+%%_cipher_last_blocks:
+ cmp %%NUM_BYTES, 16
+ jb %%_partial_block_left
+
+ DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, %%PTR_IN, %%PTR_OUT, %%XBIP_IN_OUT, \
+ no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
+ sub %%NUM_BYTES, 16
+ jz %%_bip_done
+ jmp %%_cipher_last_blocks
+
+%%_partial_block_left:
+ simd_load_sse_15_1 %%XMMT2, %%PTR_IN, %%NUM_BYTES
+
+ ;; DO_PON() neither loads nor stores the data in this case:
+ ;; XMMT2 = data in
+ ;; XMMT1 = data out
+ DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, no_load, no_store, no_bip, \
+ no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
+
+ ;; BIP update for partial block (mask out bytes outside the message)
+ lea %%GPT1, [rel mask_out_top_bytes + 16]
+ sub %%GPT1, %%NUM_BYTES
+ movdqu %%XMMT3, [%%GPT1]
+ ;; put masked cipher text into XMMT2 for BIP update
+%ifidn %%DIR, ENC
+ movdqa %%XMMT2, %%XMMT1
+ pand %%XMMT2, %%XMMT3
+%else
+ pand %%XMMT2, %%XMMT3
+%endif
+ pxor %%XBIP_IN_OUT, %%XMMT2
+
+ ;; store partial bytes in the output buffer
+ simd_store_sse_15 %%PTR_OUT, %%XMMT1, %%NUM_BYTES, %%GPT1, %%GPT2
+
+%%_bip_done:
+%endmacro ; CIPHER_BIP_REST
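+
+;; Example use (illustrative only; this matches the tail handling in
+;; AES128_CTR_PON further below):
+;;     CIPHER_BIP_REST num_bytes, ENC, CTR, p_in, p_out, p_keys, xbip, \
+;;                     xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+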
+;; =============================================================================
+;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial
+
+%macro CRC32_REDUCE_128_TO_32 5
+%define %%CRC %1 ; [out] GP to store 32-bit Ethernet FCS value
+%define %%XCRC %2 ; [in/clobbered] XMM with CRC
+%define %%XT1 %3 ; [clobbered] temporary xmm register
+%define %%XT2 %4 ; [clobbered] temporary xmm register
+%define %%XT3 %5 ; [clobbered] temporary xmm register
+
+%define %%XCRCKEY %%XT3
+
+ ;; compute CRC of a 128-bit value
+ movdqa %%XCRCKEY, [rel rk5]
+
+ ;; 64b fold
+ movdqa %%XT1, %%XCRC
+ pclmulqdq %%XT1, %%XCRCKEY, 0x00
+ psrldq %%XCRC, 8
+ pxor %%XCRC, %%XT1
+
+ ;; 32b fold
+ movdqa %%XT1, %%XCRC
+ pslldq %%XT1, 4
+ pclmulqdq %%XT1, %%XCRCKEY, 0x10
+ pxor %%XCRC, %%XT1
+
+%%_crc_barrett:
+ ;; Barrett reduction
+ pand %%XCRC, [rel mask2]
+ movdqa %%XT1, %%XCRC
+ movdqa %%XT2, %%XCRC
+ movdqa %%XCRCKEY, [rel rk7]
+
+ pclmulqdq %%XCRC, %%XCRCKEY, 0x00
+ pxor %%XCRC, %%XT2
+ pand %%XCRC, [rel mask]
+ movdqa %%XT2, %%XCRC
+ pclmulqdq %%XCRC, %%XCRCKEY, 0x10
+ pxor %%XCRC, %%XT2
+ pxor %%XCRC, %%XT1
+ pextrd DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value
+ not DWORD(%%CRC)
+%endmacro
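+
+;; Example use (illustrative only; mirrors the final reduction in
+;; AES128_CTR_PON below - rax is a hypothetical output register):
+;;     CRC32_REDUCE_128_TO_32 rax, xcrc, xtmp1, xtmp2, xcrckey
+;;     ;; eax now holds the 32-bit Ethernet FCS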
+
+;; =============================================================================
+;; Barrett reduction from 128-bits to 32-bits modulo 0x53900000 polynomial
+
+%macro HEC_REDUCE_128_TO_32 4
+%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out
+%define %%XT1 %2 ; [clobbered] temporary xmm register
+%define %%XT2 %3 ; [clobbered] temporary xmm register
+%define %%XT3 %4 ; [clobbered] temporary xmm register
+
+%define %%K3_Q %%XT1
+%define %%P_RES %%XT2
+%define %%XTMP %%XT3
+
+ ;; 128 to 64 bit reduction
+ movdqa %%K3_Q, [k3_q]
+ movdqa %%P_RES, [p_res]
+
+ movdqa %%XTMP, %%XMM_IN_OUT
+ pclmulqdq %%XTMP, %%K3_Q, 0x01 ; K3
+ pxor %%XTMP, %%XMM_IN_OUT
+
+ pclmulqdq %%XTMP, %%K3_Q, 0x01 ; K3
+ pxor %%XMM_IN_OUT, %%XTMP
+
+ pand %%XMM_IN_OUT, [rel mask_out_top_64bits]
+
+ ;; 64 to 32 bit reduction
+ movdqa %%XTMP, %%XMM_IN_OUT
+ psrldq %%XTMP, 4
+ pclmulqdq %%XTMP, %%K3_Q, 0x10 ; Q
+ pxor %%XTMP, %%XMM_IN_OUT
+ psrldq %%XTMP, 4
+
+ pclmulqdq %%XTMP, %%P_RES, 0x00 ; P
+ pxor %%XMM_IN_OUT, %%XTMP
+%endmacro
+
+;; =============================================================================
+;; Barrett reduction from 64-bits to 32-bits modulo 0x53900000 polynomial
+
+%macro HEC_REDUCE_64_TO_32 4
+%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out
+%define %%XT1 %2 ; [clobbered] temporary xmm register
+%define %%XT2 %3 ; [clobbered] temporary xmm register
+%define %%XT3 %4 ; [clobbered] temporary xmm register
+
+%define %%K3_Q %%XT1
+%define %%P_RES %%XT2
+%define %%XTMP %%XT3
+
+ movdqa %%K3_Q, [k3_q]
+ movdqa %%P_RES, [p_res]
+
+ ;; 64 to 32 bit reduction
+ movdqa %%XTMP, %%XMM_IN_OUT
+ psrldq %%XTMP, 4
+ pclmulqdq %%XTMP, %%K3_Q, 0x10 ; Q
+ pxor %%XTMP, %%XMM_IN_OUT
+ psrldq %%XTMP, 4
+
+ pclmulqdq %%XTMP, %%P_RES, 0x00 ; P
+ pxor %%XMM_IN_OUT, %%XTMP
+%endmacro
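+
+;; Example use (illustrative only; HEC_COMPUTE_32 below prepares its input
+;; exactly like this - the registers are hypothetical):
+;;     movd   xmm1, eax        ; message in eax (19 bits, HEC shifted out)
+;;     pslldq xmm1, 4          ; place the message in bits 63:32
+;;     HEC_REDUCE_64_TO_32 xmm1, xmm2, xmm3, xmm4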
+
+;; =============================================================================
+;; HEC compute and header update for 32-bit XGEM headers
+%macro HEC_COMPUTE_32 6
+%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format
+%define %%GT1 %2 ; [clobbered] temporary GP register
+%define %%XT1 %3 ; [clobbered] temporary xmm register
+%define %%XT2 %4 ; [clobbered] temporary xmm register
+%define %%XT3 %5 ; [clobbered] temporary xmm register
+%define %%XT4 %6 ; [clobbered] temporary xmm register
+
+ mov DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+ ;; shift out 13 bits of HEC value for CRC computation
+ shr DWORD(%%GT1), 13
+
+ ;; mask out current HEC value to merge with an updated HEC at the end
+ and DWORD(%%HEC_IN_OUT), 0xffff_e000
+
+ ;; prepare the message for CRC computation
+ movd %%XT1, DWORD(%%GT1)
+ pslldq %%XT1, 4 ; shift left by 32-bits
+
+ HEC_REDUCE_64_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+ ;; extract 32-bit value
+ ;; - normally perform 20 bit shift right but bit 0 is a parity bit
+ movd DWORD(%%GT1), %%XT1
+ shr DWORD(%%GT1), (20 - 1)
+
+ ;; merge header bytes with updated 12-bit CRC value and
+ ;; compute parity
+ or DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+ popcnt DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
+ and DWORD(%%HEC_IN_OUT), 1
+ or DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
+%endmacro
+
+;; =============================================================================
+;; HEC compute and header update for 64-bit XGEM headers
+%macro HEC_COMPUTE_64 6
+%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format
+%define %%GT1 %2 ; [clobbered] temporary GP register
+%define %%XT1 %3 ; [clobbered] temporary xmm register
+%define %%XT2 %4 ; [clobbered] temporary xmm register
+%define %%XT3 %5 ; [clobbered] temporary xmm register
+%define %%XT4 %6 ; [clobbered] temporary xmm register
+
+ mov %%GT1, %%HEC_IN_OUT
+ ;; shift out 13 bits of HEC value for CRC computation
+ shr %%GT1, 13
+
+ ;; mask out current HEC value to merge with an updated HEC at the end
+ and %%HEC_IN_OUT, 0xffff_ffff_ffff_e000
+
+ ;; prepare the message for CRC computation
+ movq %%XT1, %%GT1
+ pslldq %%XT1, 4 ; shift left by 32-bits
+
+ HEC_REDUCE_128_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+ ;; extract 32-bit value
+ ;; - normally perform 20 bit shift right but bit 0 is a parity bit
+ movd DWORD(%%GT1), %%XT1
+ shr DWORD(%%GT1), (20 - 1)
+
+ ;; merge header bytes with updated 12-bit CRC value and
+ ;; compute parity
+ or %%GT1, %%HEC_IN_OUT
+ popcnt %%HEC_IN_OUT, %%GT1
+ and %%HEC_IN_OUT, 1
+ or %%HEC_IN_OUT, %%GT1
+%endmacro
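+
+;; Example use (illustrative only; mirrors the encrypt path in
+;; AES128_CTR_PON below - rsi/rax/r10 are hypothetical registers):
+;;     mov    rax, [rsi]       ; load 8-byte XGEM header (big endian)
+;;     bswap  rax              ; convert to LE
+;;     HEC_COMPUTE_64 rax, r10, xmm1, xmm2, xmm3, xmm4
+;;     bswap  rax              ; back to BE with the HEC field updated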
+
+;;; ============================================================================
+;;; PON stitched algorithm of AES128-CTR, CRC and BIP
+;;; - this is the master macro that implements the encrypt/decrypt API
+;;; - calls other macros and directly uses registers
+;;; defined at the top of the file
+%macro AES128_CTR_PON 2
+%define %%DIR %1 ; [in] direction "ENC" or "DEC"
+%define %%CIPH %2 ; [in] cipher "CTR" or "NO_CTR"
+
+ push r12
+ push r13
+ push r14
+%ifndef LINUX
+ push r15
+%endif
+
+%ifidn %%DIR, ENC
+ ;; by default write back CRC for encryption
+ mov DWORD(write_back_crc), 1
+%else
+ ;; mark decryption as finished
+ mov DWORD(decrypt_not_done), 1
+%endif
+ ;; START BIP (and update HEC if encrypt direction)
+ ;; - load XGEM header (8 bytes) for BIP (not part of encrypted payload)
+ ;; - convert it into LE
+ ;; - update HEC field in the header
+ ;; - convert it into BE
+ ;; - store back the header (with updated HEC)
+ ;; - start BIP
+ ;; (free to use tmp_1, tmp_2 and tmp_3 at this stage)
+ mov tmp_2, [job + _src]
+ add tmp_2, [job + _hash_start_src_offset_in_bytes]
+ mov tmp_3, [tmp_2]
+%ifidn %%DIR, ENC
+ bswap tmp_3 ; go to LE
+ HEC_COMPUTE_64 tmp_3, tmp_1, xtmp1, xtmp2, xtmp3, xtmp4
+ mov bytes_to_crc, tmp_3
+ shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits
+ bswap tmp_3 ; go back to BE
+ mov [tmp_2], tmp_3
+ movq xbip, tmp_3
+%else
+ movq xbip, tmp_3
+ mov bytes_to_crc, tmp_3
+ bswap bytes_to_crc ; go to LE
+ shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits
+%endif
+ cmp bytes_to_crc, 4
+ ja %%_crc_not_zero
+ ;; XGEM payload shorter than or equal to 4 bytes
+%ifidn %%DIR, ENC
+ ;; Don't write Ethernet FCS on encryption
+ xor DWORD(write_back_crc), DWORD(write_back_crc)
+%else
+ ;; Mark decryption as not finished
+ ;; - Ethernet FCS is not computed
+ ;; - decrypt + BIP to be done at the end
+ xor DWORD(decrypt_not_done), DWORD(decrypt_not_done)
+%endif
+ mov DWORD(bytes_to_crc), 4 ; it will be zero after the sub (avoid jmp)
+%%_crc_not_zero:
+ sub bytes_to_crc, 4 ; subtract size of the CRC itself
+
+%ifidn %%CIPH, CTR
+ ;; - read 16 bytes of IV
+ ;; - convert to little endian format
+ ;; - save least significant 8 bytes in GP register for overflow check
+ mov tmp, [job + _iv]
+ movdqu xcounter, [tmp]
+ pshufb xcounter, [rel byteswap_const]
+ movq ctr_check, xcounter
+%endif
+
+ ;; get input buffer (after XGEM header)
+ mov p_in, [job + _src]
+ add p_in, [job + _cipher_start_src_offset_in_bytes]
+
+ ;; get output buffer
+ mov p_out, [job + _dst]
+
+%ifidn %%CIPH, CTR
+ ;; get key pointers
+ mov p_keys, [job + _aes_enc_key_expanded]
+%endif
+
+ ;; initial CRC value
+ movdqa xcrc, [rel init_crc_value]
+
+ ;; load CRC constants
+ movdqa xcrckey, [rel rk1] ; rk1 and rk2 in xcrckey
+
+ ;; get number of bytes to cipher
+%ifidn %%CIPH, CTR
+ mov num_bytes, [job + _msg_len_to_cipher_in_bytes]
+%else
+ ;; Message length to cipher is 0
+ ;; - length is obtained from message length to hash (BIP) minus XGEM header size
+ mov num_bytes, [job + _msg_len_to_hash_in_bytes]
+ sub num_bytes, 8
+%endif
+ or bytes_to_crc, bytes_to_crc
+ jz %%_crc_done
+
+ cmp bytes_to_crc, 32
+ jae %%_at_least_32_bytes
+
+%ifidn %%DIR, DEC
+ ;; decrypt the buffer first
+ mov tmp, num_bytes
+ CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+ ;; correct in/out pointers - go back to start of the buffers
+ mov tmp, num_bytes
+ and tmp, -16 ; partial block handler doesn't increment pointers
+ sub p_in, tmp
+ sub p_out, tmp
+%endif ; DECRYPTION
+
+ ;; less than 32 bytes
+ cmp bytes_to_crc, 16
+ je %%_exact_16_left
+ jl %%_less_than_16_left
+ ;; load the plaintext
+%ifidn %%DIR, ENC
+ movdqu xtmp1, [p_in]
+%else
+ movdqu xtmp1, [p_out]
+%endif
+ pxor xcrc, xtmp1 ; xor the initial crc value
+ jmp %%_crc_two_xmms
+
+%%_exact_16_left:
+%ifidn %%DIR, ENC
+ movdqu xtmp1, [p_in]
+%else
+ movdqu xtmp1, [p_out]
+%endif
+ pxor xcrc, xtmp1 ; xor the initial CRC value
+ jmp %%_128_done
+
+%%_less_than_16_left:
+%ifidn %%DIR, ENC
+ simd_load_sse_15_1 xtmp1, p_in, bytes_to_crc
+%else
+ simd_load_sse_15_1 xtmp1, p_out, bytes_to_crc
+%endif
+ pxor xcrc, xtmp1 ; xor the initial CRC value
+
+ lea tmp, [rel pshufb_shf_table]
+ movdqu xtmp1, [tmp + bytes_to_crc]
+ pshufb xcrc, xtmp1
+ jmp %%_128_done
+
+%%_at_least_32_bytes:
+ DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, first_crc, %%DIR, %%CIPH, ctr_check
+ sub num_bytes, 16
+ sub bytes_to_crc, 16
+
+%%_main_loop:
+ cmp bytes_to_crc, 16
+ jb %%_exit_loop
+ DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, next_crc, %%DIR, %%CIPH, ctr_check
+ sub num_bytes, 16
+ sub bytes_to_crc, 16
+%ifidn %%DIR, ENC
+ jz %%_128_done
+%endif
+ jmp %%_main_loop
+
+%%_exit_loop:
+
+%ifidn %%DIR, DEC
+ ;; decrypt rest of the message including CRC and optional padding
+ mov tmp, num_bytes
+
+ CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+ mov tmp, num_bytes ; correct in/out pointers - to point before cipher & BIP
+ and tmp, -16 ; partial block handler doesn't increment pointers
+ sub p_in, tmp
+ sub p_out, tmp
+
+ or bytes_to_crc, bytes_to_crc
+ jz %%_128_done
+%endif ; DECRYPTION
+
+ ;; Partial bytes left - complete CRC calculation
+%%_crc_two_xmms:
+ lea tmp, [rel pshufb_shf_table]
+ movdqu xtmp2, [tmp + bytes_to_crc]
+%ifidn %%DIR, ENC
+ movdqu xtmp1, [p_in - 16 + bytes_to_crc] ; xtmp1 = data for CRC
+%else
+ movdqu xtmp1, [p_out - 16 + bytes_to_crc] ; xtmp1 = data for CRC
+%endif
+ movdqa xtmp3, xcrc
+ pshufb xcrc, xtmp2 ; top num_bytes with LSB xcrc
+ pxor xtmp2, [rel mask3]
+ pshufb xtmp3, xtmp2 ; bottom (16 - num_bytes) with MSB xcrc
+
+ ;; data num_bytes (top) blended with MSB bytes of CRC (bottom)
+ movdqa xmm0, xtmp2
+ pblendvb xtmp3, xtmp1 ; xmm0 implicit
+
+ ;; final CRC calculation
+ movdqa xtmp1, xcrc
+ pclmulqdq xtmp1, xcrckey, 0x01
+ pclmulqdq xcrc, xcrckey, 0x10
+ pxor xcrc, xtmp3
+ pxor xcrc, xtmp1
+
+%%_128_done:
+ CRC32_REDUCE_128_TO_32 ethernet_fcs, xcrc, xtmp1, xtmp2, xcrckey
+
+%%_crc_done:
+ ;; @todo - store-to-load problem in ENC case (to be fixed later)
+ ;; - store CRC in input buffer and authentication tag output
+ ;; - encrypt remaining bytes
+%ifidn %%DIR, ENC
+ or DWORD(write_back_crc), DWORD(write_back_crc)
+ jz %%_skip_crc_write_back
+ mov [p_in + bytes_to_crc], DWORD(ethernet_fcs)
+%%_skip_crc_write_back:
+%endif
+ mov tmp, [job + _auth_tag_output]
+ mov [tmp + 4], DWORD(ethernet_fcs)
+
+ or num_bytes, num_bytes
+ jz %%_do_not_cipher_the_rest
+
+ ;; encrypt rest of the message
+ ;; - partial bytes including CRC and optional padding
+ ;; decrypt rest of the message
+ ;; - this may only happen when XGEM payload is short and padding is added
+%ifidn %%DIR, DEC
+ or DWORD(decrypt_not_done), DWORD(decrypt_not_done)
+ jnz %%_do_not_cipher_the_rest
+%endif
+ CIPHER_BIP_REST num_bytes, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+%%_do_not_cipher_the_rest:
+
+ ;; finalize BIP
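+ ;; - fold the four 32-bit BIP lanes into one: bip = b0 XOR b1 XOR b2 XOR b3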
+ movdqa xtmp1, xbip
+ movdqa xtmp2, xbip
+ movdqa xtmp3, xbip
+ psrldq xtmp1, 4
+ psrldq xtmp2, 8
+ psrldq xtmp3, 12
+ pxor xtmp1, xtmp2
+ pxor xbip, xtmp3
+ pxor xbip, xtmp1
+ movd [tmp], xbip
+
+ ;; set job status
+ or dword [job + _status], STS_COMPLETED
+
+ ;; return job
+ mov rax, job
+
+%ifndef LINUX
+ pop r15
+%endif
+ pop r14
+ pop r13
+ pop r12
+%endmacro ; AES128_CTR_PON
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; submit_job_pon_enc_sse(JOB_AES_HMAC *job)
+align 32
+MKGLOBAL(ENC_FN_NAME,function,internal)
+ENC_FN_NAME:
+ AES128_CTR_PON ENC, CTR
+ ret
+
+;;; submit_job_pon_dec_sse(JOB_AES_HMAC *job)
+align 32
+MKGLOBAL(DEC_FN_NAME,function,internal)
+DEC_FN_NAME:
+ AES128_CTR_PON DEC, CTR
+ ret
+
+;;; submit_job_pon_enc_no_ctr_sse(JOB_AES_HMAC *job)
+align 32
+MKGLOBAL(ENC_NO_CTR_FN_NAME,function,internal)
+ENC_NO_CTR_FN_NAME:
+ AES128_CTR_PON ENC, NO_CTR
+ ret
+
+;;; submit_job_pon_dec_no_ctr_sse(JOB_AES_HMAC *job)
+align 32
+MKGLOBAL(DEC_NO_CTR_FN_NAME,function,internal)
+DEC_NO_CTR_FN_NAME:
+ AES128_CTR_PON DEC, NO_CTR
+ ret
+
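+;;; Note: all four entry points take a JOB_AES_HMAC pointer as their only
+;;; argument and return it in rax. Illustrative C prototype (assuming the
+;;; default, non-overridden symbol names):
+;;;
+;;;     JOB_AES_HMAC *submit_job_pon_enc_sse(JOB_AES_HMAC *job);
+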
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif