Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx/pon_avx.asm')
-rw-r--r--  src/spdk/intel-ipsec-mb/avx/pon_avx.asm  1170
1 file changed, 1170 insertions(+), 0 deletions(-)
diff --git a/src/spdk/intel-ipsec-mb/avx/pon_avx.asm b/src/spdk/intel-ipsec-mb/avx/pon_avx.asm
new file mode 100644
index 000000000..8510dc4a3
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/pon_avx.asm
@@ -0,0 +1,1170 @@
+;;
+;; Copyright (c) 2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%use smartalign
+
+%include "job_aes_hmac.asm"
+%include "include/os.asm"
+%include "include/memcpy.asm"
+
+;;; This is an implementation of the stitched algorithms: AES128-CTR + CRC32 + BIP.
+;;; This combination is required by the PON/xPON/gPON standards.
+;;; Note: BIP is a running XOR of double words.
+;;; Order of operations:
+;;; - encrypt: HEC update (XGEM header), CRC32 (Ethernet FCS), AES-CTR and BIP
+;;; - decrypt: BIP, AES-CTR and CRC32 (Ethernet FCS)
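+;;;
+;;; Illustrative reference only (kept in comments, not assembled): a minimal
+;;; C sketch of the running BIP XOR over 32-bit words, assuming <stdint.h>
+;;; and <string.h>, and a covered region that is a multiple of 4 bytes
+;;; (the code below masks partial tail bytes before the BIP update):
+;;;
+;;;   uint32_t bip32(const uint8_t *p, size_t len)
+;;;   {
+;;;           uint32_t bip = 0, w;
+;;;           for (size_t i = 0; i < len; i += 4) {
+;;;                   memcpy(&w, p + i, 4); /* unaligned-safe 32-bit load */
+;;;                   bip ^= w;             /* running XOR of double words */
+;;;           }
+;;;           return bip;
+;;;   }
+;;; The vector code keeps four such lanes in an XMM register and folds them
+;;; into one 32-bit value at the very end; XOR associativity makes the two
+;;; formulations equivalent.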
+
+extern byteswap_const
+extern ddq_add_1
+
+section .data
+default rel
+
+;;; Precomputed constants for CRC32 (Ethernet FCS)
+;;; Details of the CRC algorithm and 4 byte buffer of
+;;; {0x01, 0x02, 0x03, 0x04}:
+;;; Result Poly Init RefIn RefOut XorOut
+;;; 0xB63CFBCD 0x04C11DB7 0xFFFFFFFF true true 0xFFFFFFFF
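+;;;
+;;; For reference (kept in comments, not assembled): a plain bitwise C sketch
+;;; of the same CRC, using the reflected form 0xEDB88320 of the 0x04C11DB7
+;;; polynomial; for the buffer {0x01, 0x02, 0x03, 0x04} it should reproduce
+;;; the 0xB63CFBCD result quoted above:
+;;;
+;;;   uint32_t crc32_eth_ref(const uint8_t *p, size_t len)
+;;;   {
+;;;           uint32_t crc = 0xffffffff;              /* Init */
+;;;           for (size_t i = 0; i < len; i++) {
+;;;                   crc ^= p[i];                    /* RefIn */
+;;;                   for (int b = 0; b < 8; b++)
+;;;                           crc = (crc >> 1) ^
+;;;                                 (0xedb88320 & (0 - (crc & 1)));
+;;;           }
+;;;           return ~crc;                            /* RefOut + XorOut */
+;;;   }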
+align 16
+rk1:
+ dq 0x00000000ccaa009e, 0x00000001751997d0
+
+align 16
+rk5:
+ dq 0x00000000ccaa009e, 0x0000000163cd6124
+
+align 16
+rk7:
+ dq 0x00000001f7011640, 0x00000001db710640
+
+align 16
+pshufb_shf_table:
+ ;; use these values for shift registers with the pshufb instruction
+ dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+ dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+align 16
+init_crc_value:
+ dq 0x00000000FFFFFFFF, 0x0000000000000000
+
+align 16
+mask:
+ dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
+
+align 16
+mask2:
+ dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
+align 16
+mask3:
+ dq 0x8080808080808080, 0x8080808080808080
+
+align 16
+mask_out_top_bytes:
+ dq 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+ dq 0x0000000000000000, 0x0000000000000000
+
+align 16
+ddq_add_1_1:
+ dq 0x1, 0x1
+
+;; Precomputed constants for HEC calculation (XGEM header)
+;; POLY 0x53900000:
+;; k1 = 0xf9800000
+;; k2 = 0xa0900000
+;; k3 = 0x7cc00000
+;; q = 0x46b927ec
+;; p_res = 0x53900000
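+;;
+;; These appear to follow the usual CLMUL folding/Barrett construction over
+;; GF(2), i.e. k values of the form x^n mod P and q = floor(x^64 / P); the
+;; exact exponents n depend on the fold distances used below.  A hedged C
+;; sketch (kept in comments, not assembled) of x^n mod P for a degree-32 P,
+;; with the x^32 term implicit and the lower coefficients in 'poly':
+;;
+;;   uint32_t xn_mod_p(unsigned n, uint32_t poly)
+;;   {
+;;           uint32_t r = 1; /* x^0 */
+;;           while (n--) /* multiply by x; reduce when degree reaches 32 */
+;;                   r = (r & 0x80000000) ? (r << 1) ^ poly : (r << 1);
+;;           return r;
+;;   }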
+
+align 16
+k3_q:
+ dq 0x7cc00000, 0x46b927ec
+
+align 16
+p_res:
+ dq 0x53900000, 0
+
+align 16
+mask_out_top_64bits:
+ dq 0xffffffff_ffffffff, 0
+
+section .text
+
+%define NUM_AES_ROUNDS 10
+
+%define xcounter xmm0
+%define xbip xmm1
+%define xcrc xmm2
+%define xcrckey xmm3
+%define xtmp1 xmm4
+%define xtmp2 xmm5
+%define xtmp3 xmm6
+%define xtmp4 xmm7
+%define xtmp5 xmm8
+%define xtmp6 xmm9
+%define xtmp7 xmm10
+%define xtmp8 xmm11
+%define xtmp9 xmm12
+%define xtmp10 xmm13
+%define xtmp11 xmm14
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define tmp_1 r8
+%define tmp_2 r9
+%define tmp_3 r10
+%define tmp_4 r11
+%define tmp_5 r12
+%define tmp_6 r13
+%define tmp_7 r14
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define tmp_1 r10
+%define tmp_2 r11
+%define tmp_3 rax
+%define tmp_4 r12
+%define tmp_5 r13
+%define tmp_6 r14
+%define tmp_7 r15
+%endif
+
+%define job arg1
+
+%define p_in arg2
+%define p_keys arg3
+%define p_out arg4
+
+%define num_bytes tmp_1 ; bytes to cipher
+%define tmp tmp_2
+%define ctr_check tmp_3 ; counter block overflow check
+%define bytes_to_crc tmp_4 ; number of bytes to crc ( < num_bytes)
+
+%define ethernet_fcs tmp_6 ; not used together with tmp3
+%define tmp2 tmp_5
+%define tmp3 tmp_6
+
+%define write_back_crc tmp_7
+%define decrypt_not_done tmp_7
+
+;;; ============================================================================
+;;; Does all AES encryption rounds
+%macro AES_ENC_ROUNDS 3
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%N_ROUNDS %2 ; [in] max rounds (128bit: 10, 12, 14)
+%define %%BLOCK %3 ; [in/out] XMM with encrypted block
+
+%assign round 0
+ vpxor %%BLOCK, %%BLOCK, [%%KP + (round * 16)]
+
+%rep (%%N_ROUNDS - 1)
+%assign round (round + 1)
+ vaesenc %%BLOCK, %%BLOCK, [%%KP + (round * 16)]
+%endrep
+
+%assign round (round + 1)
+ vaesenclast %%BLOCK, %%BLOCK, [%%KP + (round * 16)]
+
+%endmacro
+
+;;; ============================================================================
+;;; Does all AES encryption rounds on 4 blocks
+%macro AES_ENC_ROUNDS_4 7
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%N_ROUNDS %2 ; [in] max rounds (128bit: 10, 12, 14)
+%define %%BLOCK1 %3 ; [in/out] XMM with encrypted block
+%define %%BLOCK2 %4 ; [in/out] XMM with encrypted block
+%define %%BLOCK3 %5 ; [in/out] XMM with encrypted block
+%define %%BLOCK4 %6 ; [in/out] XMM with encrypted block
+%define %%XT1 %7 ; [clobbered] temporary XMM register
+
+%assign round 0
+ vmovdqa %%XT1, [%%KP + (round * 16)]
+ vpxor %%BLOCK1, %%BLOCK1, %%XT1
+ vpxor %%BLOCK2, %%BLOCK2, %%XT1
+ vpxor %%BLOCK3, %%BLOCK3, %%XT1
+ vpxor %%BLOCK4, %%BLOCK4, %%XT1
+
+%rep (%%N_ROUNDS - 1)
+%assign round (round + 1)
+ vmovdqa %%XT1, [%%KP + (round * 16)]
+ vaesenc %%BLOCK1, %%BLOCK1, %%XT1
+ vaesenc %%BLOCK2, %%BLOCK2, %%XT1
+ vaesenc %%BLOCK3, %%BLOCK3, %%XT1
+ vaesenc %%BLOCK4, %%BLOCK4, %%XT1
+%endrep
+
+%assign round (round + 1)
+ vmovdqa %%XT1, [%%KP + (round * 16)]
+ vaesenclast %%BLOCK1, %%BLOCK1, %%XT1
+ vaesenclast %%BLOCK2, %%BLOCK2, %%XT1
+ vaesenclast %%BLOCK3, %%BLOCK3, %%XT1
+ vaesenclast %%BLOCK4, %%BLOCK4, %%XT1
+%endmacro
+
+;;; ============================================================================
+;;; CRC multiply before XOR against data block
+%macro CRC_CLMUL 3
+%define %%XCRC_IN_OUT %1 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
+%define %%XCRC_MUL %2 ; [in] XMM with CRC constant (can be anything if "no_crc" below)
+%define %%XTMP %3 ; [clobbered] temporary XMM
+
+ vpclmulqdq %%XTMP, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
+ vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XTMP
+%endmacro
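+
+;;; With %%XCRC_IN_OUT = [H:L] (two 64-bit halves) and %%XCRC_MUL holding
+;;; [K_hi:K_lo], the 0x01/0x10 immediates above compute the standard fold:
+;;;   XCRC_IN_OUT' = clmul(H, K_lo) ^ clmul(L, K_hi)
+;;; which keeps the CRC state 128 bits wide while absorbing 16 data bytes
+;;; per iteration (the XOR with the data block itself is done by the caller).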
+
+;;; ============================================================================
+;;; PON stitched algorithm round on a single AES block (16 bytes):
+;;; AES-CTR (optional, depending on %%CIPH)
+;;; - prepares counter block
+;;; - encrypts counter block
+;;; - loads text
+;;; - xor's text against encrypted blocks
+;;; - stores cipher text
+;;; BIP
+;;; - BIP update on 4 x 32-bits
+;;; CRC32
+;;; - CRC32 calculation
+;;; Note: selecting no_crc, no_bip, no_load or no_store changes the macro
+;;; behaviour to match the needs of the overall algorithm.
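+;;; (For example, CIPHER_BIP_REST below invokes this macro with "no_crc" to
+;;; cipher and BIP the CRC/padding region, and with "no_load"/"no_store"
+;;; when the final partial block is handled entirely in XMM registers.)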
+%macro DO_PON 15
+%define %%KP %1 ; [in] GP, pointer to expanded keys
+%define %%N_ROUNDS %2 ; [in] number of AES rounds (10, 12 or 14)
+%define %%CTR %3 ; [in/out] XMM with counter block
+%define %%INP %4 ; [in/out] GP with input text pointer or "no_load"
+%define %%OUTP %5 ; [in/out] GP with output text pointer or "no_store"
+%define %%XBIP_IN_OUT %6 ; [in/out] XMM with BIP value or "no_bip"
+%define %%XCRC_IN_OUT %7 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
+%define %%XCRC_MUL %8 ; [in] XMM with CRC constant (can be anything if "no_crc" below)
+%define %%TXMM0 %9 ; [clobbered|out] XMM temporary or data out (no_store)
+%define %%TXMM1 %10 ; [clobbered|in] XMM temporary or data in (no_load)
+%define %%TXMM2 %11 ; [clobbered] XMM temporary
+%define %%CRC_TYPE %12 ; [in] "first_crc" or "next_crc" or "no_crc"
+%define %%DIR %13 ; [in] "ENC" or "DEC"
+%define %%CIPH %14 ; [in] "CTR" or "NO_CTR"
+%define %%CTR_CHECK %15 ; [in/out] GP with 64bit counter (to identify overflow)
+
+%ifidn %%CIPH, CTR
+ ;; prepare counter blocks for encryption
+ vpshufb %%TXMM0, %%CTR, [rel byteswap_const]
+ ;; perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr_overflow
+ vpaddq %%CTR, %%CTR, [rel ddq_add_1]
+ jmp %%_ctr_overflow_done
+%%_ctr_overflow:
+ vpaddq %%CTR, %%CTR, [rel ddq_add_1_1]
+%%_ctr_overflow_done:
+%endif
+
+ ;; CRC calculation
+%ifidn %%CRC_TYPE, next_crc
+	;; The CRC_CLMUL macro could be used here, but its final XOR would
+	;; stall the cipher XORs, so only the two CLMULs are issued here and
+	;; the XOR is deferred until after the cipher.
+ vpclmulqdq %%TXMM2, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
+ vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
+%endif
+
+%ifnidn %%INP, no_load
+ vmovdqu %%TXMM1, [%%INP]
+%endif
+
+%ifidn %%CIPH, CTR
+ ;; AES rounds
+ AES_ENC_ROUNDS %%KP, %%N_ROUNDS, %%TXMM0
+
+ ;; xor plaintext/ciphertext against encrypted counter blocks
+ vpxor %%TXMM0, %%TXMM0, %%TXMM1
+%else ;; CIPH = NO_CTR
+	;; a register copy is needed because the no_load/no_store cases use it
+ vmovdqa %%TXMM0, %%TXMM1
+%endif ;; CIPH = CTR
+
+%ifnidn %%CRC_TYPE, no_crc
+%ifidn %%CRC_TYPE, next_crc
+ ;; Finish split CRC_MUL() operation
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM2
+%endif
+%ifidn %%CIPH, CTR
+ ;; CRC calculation for ENCRYPTION/DECRYPTION
+ ;; - always XOR against plaintext block
+%ifidn %%DIR, ENC
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM1
+%else
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM0
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ ;; CRC calculation for NO CIPHER option
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXMM1
+%endif ;; CIPH = CTR
+%endif ;; CRC_TYPE != NO_CRC
+
+ ;; store the result in the output buffer
+%ifnidn %%OUTP, no_store
+%ifidn %%CIPH, CTR
+ vmovdqu [%%OUTP], %%TXMM0
+%else ;; CIPH = NO_CTR
+ vmovdqu [%%OUTP], %%TXMM1
+%endif ;; CIPH = CTR
+%endif
+
+ ;; update BIP value - always use cipher text for BIP
+%ifnidn %%XBIP_IN_OUT, no_bip
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM0
+%else
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM1
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%TXMM1
+%endif ;; CIPH = CTR
+%endif ;; !NO_BIP
+
+ ;; increment in/out pointers
+%ifnidn %%INP, no_load
+ add %%INP, 16
+%endif
+%ifnidn %%OUTP, no_store
+ add %%OUTP, 16
+%endif
+%endmacro ; DO_PON
+
+;;; ============================================================================
+;;; PON stitched algorithm round on four AES blocks (64 bytes):
+;;; AES-CTR (optional, depending on %%CIPH)
+;;; - prepares counter block
+;;; - encrypts counter block
+;;; - loads text
+;;; - xor's text against encrypted blocks
+;;; - stores cipher text
+;;; BIP
+;;; - BIP update on 4 x 32-bits
+;;; CRC32
+;;; - CRC32 calculation
+;;; Note: selecting no_crc, no_bip, no_load or no_store changes the macro
+;;; behaviour to match the needs of the overall algorithm.
+%macro DO_PON_4 23
+%define %%KP %1 ; [in] GP, pointer to expanded keys
+%define %%N_ROUNDS %2 ; [in] number of AES rounds (10, 12 or 14)
+%define %%CTR %3 ; [in/out] XMM with counter block
+%define %%INP %4 ; [in/out] GP with input text pointer or "no_load"
+%define %%OUTP %5 ; [in/out] GP with output text pointer or "no_store"
+%define %%XBIP_IN_OUT %6 ; [in/out] XMM with BIP value or "no_bip"
+%define %%XCRC_IN_OUT %7 ; [in/out] XMM with CRC (can be anything if "no_crc" below)
+%define %%XCRC_MUL %8 ; [in] XMM with CRC constant (can be anything if "no_crc" below)
+%define %%T0 %9 ; [clobbered] XMM temporary
+%define %%T1 %10 ; [clobbered] XMM temporary
+%define %%T2 %11 ; [clobbered] XMM temporary
+%define %%T3 %12 ; [clobbered] XMM temporary
+%define %%T4 %13 ; [clobbered] XMM temporary
+%define %%T5 %14 ; [clobbered] XMM temporary
+%define %%T6 %15 ; [clobbered] XMM temporary
+%define %%T7 %16 ; [clobbered] XMM temporary
+%define %%T8 %17 ; [clobbered] XMM temporary
+%define %%T9 %18 ; [clobbered] XMM temporary
+%define %%T10 %19 ; [clobbered] XMM temporary
+%define %%CRC_TYPE %20 ; [in] "first_crc" or "next_crc" or "no_crc"
+%define %%DIR %21 ; [in] "ENC" or "DEC"
+%define %%CIPH %22 ; [in] "CTR" or "NO_CTR"
+%define %%CTR_CHECK %23 ; [in/out] GP with 64bit counter (to identify overflow)
+
+%define %%CTR1 %%T3
+%define %%CTR2 %%T4
+%define %%CTR3 %%T5
+%define %%CTR4 %%T6
+
+%define %%TXT1 %%T7
+%define %%TXT2 %%T8
+%define %%TXT3 %%T9
+%define %%TXT4 %%T10
+
+%ifidn %%CIPH, CTR
+ ;; prepare counter blocks for encryption
+ vmovdqa %%T0, [rel ddq_add_1]
+ vmovdqa %%T2, [rel byteswap_const]
+
+ ;; CTR1: copy saved CTR value as CTR1
+ vmovdqa %%CTR1, %%CTR
+
+ cmp %%CTR_CHECK, 0xffff_ffff_ffff_ffff - 4
+ ja %%_ctr_will_overflow
+
+ ;; case in which 64-bit counter will not overflow
+ vpaddq %%CTR2, %%CTR1, %%T0
+ vpaddq %%CTR3, %%CTR2, %%T0
+ vpaddq %%CTR4, %%CTR3, %%T0
+ vpaddq %%CTR, %%CTR4, %%T0
+ vpshufb %%CTR1, %%CTR1, %%T2
+ vpshufb %%CTR2, %%CTR2, %%T2
+ vpshufb %%CTR3, %%CTR3, %%T2
+ vpshufb %%CTR4, %%CTR4, %%T2
+ add %%CTR_CHECK, 4
+ jmp %%_ctr_update_done
+
+%%_ctr_will_overflow:
+ vmovdqa %%T1, [rel ddq_add_1_1]
+ ;; CTR2: perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr2_overflow
+ vpaddq %%CTR2, %%CTR1, %%T0
+ jmp %%_ctr2_overflow_done
+%%_ctr2_overflow:
+ vpaddq %%CTR2, %%CTR1, %%T1
+%%_ctr2_overflow_done:
+ vpshufb %%CTR1, %%CTR1, %%T2
+
+ ;; CTR3: perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr3_overflow
+ vpaddq %%CTR3, %%CTR2, %%T0
+ jmp %%_ctr3_overflow_done
+%%_ctr3_overflow:
+ vpaddq %%CTR3, %%CTR2, %%T1
+%%_ctr3_overflow_done:
+ vpshufb %%CTR2, %%CTR2, %%T2
+
+ ;; CTR4: perform 1 increment on whole 128 bits
+ add %%CTR_CHECK, 1
+ jc %%_ctr4_overflow
+ vpaddq %%CTR4, %%CTR3, %%T0
+ jmp %%_ctr4_overflow_done
+%%_ctr4_overflow:
+ vpaddq %%CTR4, %%CTR3, %%T1
+%%_ctr4_overflow_done:
+ vpshufb %%CTR3, %%CTR3, %%T2
+
+ ;; CTR: perform 1 increment on whole 128 bits (for the next iteration)
+ add %%CTR_CHECK, 1
+ jc %%_ctr_overflow
+ vpaddq %%CTR, %%CTR4, %%T0
+ jmp %%_ctr_overflow_done
+%%_ctr_overflow:
+ vpaddq %%CTR, %%CTR4, %%T1
+%%_ctr_overflow_done:
+ vpshufb %%CTR4, %%CTR4, %%T2
+%%_ctr_update_done:
+%endif
+
+%ifidn %%CRC_TYPE, next_crc
+	;; The CRC_CLMUL macro could be used here, but its final XOR would
+	;; stall the cipher XORs, so only the two CLMULs are issued here and
+	;; the XOR is deferred until after the cipher.
+ vpclmulqdq %%T2, %%XCRC_IN_OUT, %%XCRC_MUL, 0x01
+ vpclmulqdq %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%XCRC_MUL, 0x10
+%endif
+
+ ;; load plaintext/ciphertext
+ vmovdqu %%TXT1, [%%INP]
+ vmovdqu %%TXT2, [%%INP + 16]
+ vmovdqu %%TXT3, [%%INP + 32]
+ vmovdqu %%TXT4, [%%INP + 48]
+
+%ifidn %%CIPH, CTR
+ AES_ENC_ROUNDS_4 %%KP, %%N_ROUNDS, %%CTR1, %%CTR2, %%CTR3, %%CTR4, %%T0
+
+ ;; xor plaintext/ciphertext against encrypted counter blocks
+ vpxor %%CTR1, %%CTR1, %%TXT1
+ vpxor %%CTR2, %%CTR2, %%TXT2
+ vpxor %%CTR3, %%CTR3, %%TXT3
+ vpxor %%CTR4, %%CTR4, %%TXT4
+%endif ;; CIPH = CTR
+
+%ifidn %%CRC_TYPE, next_crc
+ ;; Finish split CRC_MUL() operation
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%T2
+%endif
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+ ;; CRC calculation for ENCRYPTION (blocks 1 & 2)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT1
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT2
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+%else
+ ;; CRC calculation for DECRYPTION (blocks 1 & 2)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR1
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR2
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ ;; CRC calculation for NO CIPHER option (blocks 1 & 2)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT1
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT2
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+%endif ;; CIPH = CTR
+
+ ;; store ciphertext/plaintext
+%ifidn %%CIPH, CTR
+ vmovdqu [%%OUTP], %%CTR1
+ vmovdqu [%%OUTP + 16], %%CTR2
+ vmovdqu [%%OUTP + 32], %%CTR3
+ vmovdqu [%%OUTP + 48], %%CTR4
+%else ;; CIPH = NO_CTR
+ vmovdqu [%%OUTP], %%TXT1
+ vmovdqu [%%OUTP + 16], %%TXT2
+ vmovdqu [%%OUTP + 32], %%TXT3
+ vmovdqu [%%OUTP + 48], %%TXT4
+%endif ;; CIPH = CTR
+
+ ;; update BIP value
+%ifidn %%CIPH, CTR
+ ;; - always use ciphertext for BIP
+%ifidn %%DIR, ENC
+ vpxor %%T0, %%CTR1, %%CTR2
+ vpxor %%T1, %%CTR3, %%CTR4
+%else
+ vpxor %%T0, %%TXT1, %%TXT2
+ vpxor %%T1, %%TXT3, %%TXT4
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ vpxor %%T0, %%TXT1, %%TXT2
+ vpxor %%T1, %%TXT3, %%TXT4
+%endif ;; CIPH = CTR
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%T0
+ vpxor %%XBIP_IN_OUT, %%XBIP_IN_OUT, %%T1
+
+ ;; increment in/out pointers
+ add %%INP, 64
+ add %%OUTP, 64
+
+%ifidn %%CIPH, CTR
+%ifidn %%DIR, ENC
+ ;; CRC calculation for ENCRYPTION (blocks 3 & 4)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT3
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT4
+%else
+ ;; CRC calculation for DECRYPTION (blocks 3 & 4)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR3
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%CTR4
+%endif ; DECRYPT
+%else ;; CIPH = NO_CTR
+ ;; CRC calculation for NO CIPHER option (blocks 3 & 4)
+ ;; - XOR CRC against plaintext block
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT3
+
+ CRC_CLMUL %%XCRC_IN_OUT, %%XCRC_MUL, %%T2
+ vpxor %%XCRC_IN_OUT, %%XCRC_IN_OUT, %%TXT4
+%endif ;; CIPH = CTR
+
+%endmacro ; DO_PON_4
+
+;;; ============================================================================
+;;; Cipher and BIP the specified number of bytes
+%macro CIPHER_BIP_REST 14
+%define %%NUM_BYTES %1 ; [in/clobbered] number of bytes to cipher
+%define %%DIR %2 ; [in] "ENC" or "DEC"
+%define %%CIPH %3 ; [in] "CTR" or "NO_CTR"
+%define %%PTR_IN %4 ; [in/clobbered] GPR pointer to input buffer
+%define %%PTR_OUT %5 ; [in/clobbered] GPR pointer to output buffer
+%define %%PTR_KEYS %6 ; [in] GPR pointer to expanded keys
+%define %%XBIP_IN_OUT %7 ; [in/out] XMM 128-bit BIP state
+%define %%XCTR_IN_OUT %8 ; [in/out] XMM 128-bit AES counter block
+%define %%XMMT1 %9 ; [clobbered] temporary XMM
+%define %%XMMT2 %10 ; [clobbered] temporary XMM
+%define %%XMMT3 %11 ; [clobbered] temporary XMM
+%define %%CTR_CHECK %12 ; [in/out] GP with 64bit counter (to identify overflow)
+%define %%GPT1 %13 ; [clobbered] temporary GP
+%define %%GPT2 %14 ; [clobbered] temporary GP
+
+ align 16
+%%_cipher_last_blocks:
+ cmp %%NUM_BYTES, 16
+ jb %%_partial_block_left
+
+ DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, %%PTR_IN, %%PTR_OUT, %%XBIP_IN_OUT, \
+ no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
+ sub %%NUM_BYTES, 16
+ jz %%_bip_done
+ jmp %%_cipher_last_blocks
+
+%%_partial_block_left:
+ simd_load_avx_15_1 %%XMMT2, %%PTR_IN, %%NUM_BYTES
+
+	;; DO_PON() neither loads nor stores the data in this case:
+ ;; XMMT2 = data in
+ ;; XMMT1 = data out
+ DO_PON %%PTR_KEYS, NUM_AES_ROUNDS, %%XCTR_IN_OUT, no_load, no_store, no_bip, \
+ no_crc, no_crc, %%XMMT1, %%XMMT2, %%XMMT3, no_crc, %%DIR, %%CIPH, %%CTR_CHECK
+
+ ;; bip update for partial block (mask out bytes outside the message)
+ lea %%GPT1, [rel mask_out_top_bytes + 16]
+ sub %%GPT1, %%NUM_BYTES
+ vmovdqu %%XMMT3, [%%GPT1]
+ ;; put masked cipher text into XMMT2 for BIP update
+%ifidn %%DIR, ENC
+ vpand %%XMMT2, %%XMMT1, %%XMMT3
+%else
+ vpand %%XMMT2, %%XMMT2, %%XMMT3
+%endif
+ vpxor %%XBIP_IN_OUT, %%XMMT2
+
+ ;; store partial bytes in the output buffer
+ simd_store_avx_15 %%PTR_OUT, %%XMMT1, %%NUM_BYTES, %%GPT1, %%GPT2
+
+%%_bip_done:
+%endmacro ; CIPHER_BIP_REST
+
+;; =============================================================================
+;; Barrett reduction from 128-bits to 32-bits modulo Ethernet FCS polynomial
+
+%macro CRC32_REDUCE_128_TO_32 5
+%define %%CRC %1 ; [out] GP to store 32-bit Ethernet FCS value
+%define %%XCRC %2 ; [in/clobbered] XMM with CRC
+%define %%XT1 %3 ; [clobbered] temporary xmm register
+%define %%XT2 %4 ; [clobbered] temporary xmm register
+%define %%XT3 %5 ; [clobbered] temporary xmm register
+
+%define %%XCRCKEY %%XT3
+
+ ;; compute crc of a 128-bit value
+ vmovdqa %%XCRCKEY, [rel rk5]
+
+ ;; 64b fold
+ vpclmulqdq %%XT1, %%XCRC, %%XCRCKEY, 0x00
+ vpsrldq %%XCRC, %%XCRC, 8
+ vpxor %%XCRC, %%XCRC, %%XT1
+
+ ;; 32b fold
+ vpslldq %%XT1, %%XCRC, 4
+ vpclmulqdq %%XT1, %%XT1, %%XCRCKEY, 0x10
+ vpxor %%XCRC, %%XCRC, %%XT1
+
+%%_crc_barrett:
+ ;; Barrett reduction
+ vpand %%XCRC, [rel mask2]
+ vmovdqa %%XT1, %%XCRC
+ vmovdqa %%XT2, %%XCRC
+ vmovdqa %%XCRCKEY, [rel rk7]
+
+ vpclmulqdq %%XCRC, %%XCRCKEY, 0x00
+ vpxor %%XCRC, %%XT2
+ vpand %%XCRC, [rel mask]
+ vmovdqa %%XT2, %%XCRC
+ vpclmulqdq %%XCRC, %%XCRCKEY, 0x10
+ vpxor %%XCRC, %%XT2
+ vpxor %%XCRC, %%XT1
+ vpextrd DWORD(%%CRC), %%XCRC, 2 ; 32-bit CRC value
+ not DWORD(%%CRC)
+%endmacro
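+
+;; The CLMUL/XOR sequence above follows the classic Barrett step for CRC:
+;; with mu = floor(x^64 / P) and the polynomial P (rk7 holds both constants,
+;; in the bit-reflected form matching the CLMUL immediates used above), a
+;; 64-bit residue R is reduced as:
+;;   CRC = (R ^ (floor((floor(R / x^32) * mu) / x^32) * P)) mod x^32
+;; and the final NOT applies XorOut = 0xFFFFFFFF.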
+
+;; =============================================================================
+;; Barrett reduction from 128-bits to 32-bits modulo 0x53900000 polynomial
+
+%macro HEC_REDUCE_128_TO_32 4
+%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out
+%define %%XT1 %2 ; [clobbered] temporary xmm register
+%define %%XT2 %3 ; [clobbered] temporary xmm register
+%define %%XT3 %4 ; [clobbered] temporary xmm register
+
+%define %%K3_Q %%XT1
+%define %%P_RES %%XT2
+%define %%XTMP %%XT3
+
+ ;; 128 to 64 bit reduction
+ vmovdqa %%K3_Q, [k3_q]
+ vmovdqa %%P_RES, [p_res]
+
+ vpclmulqdq %%XTMP, %%XMM_IN_OUT, %%K3_Q, 0x01 ; K3
+ vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT
+
+ vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x01 ; K3
+ vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
+
+ vpand %%XMM_IN_OUT, [rel mask_out_top_64bits]
+
+ ;; 64 to 32 bit reduction
+ vpsrldq %%XTMP, %%XMM_IN_OUT, 4
+ vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x10 ; Q
+ vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT
+ vpsrldq %%XTMP, %%XTMP, 4
+
+ vpclmulqdq %%XTMP, %%XTMP, %%P_RES, 0x00 ; P
+ vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
+%endmacro
+
+;; =============================================================================
+;; Barrett reduction from 64-bits to 32-bits modulo 0x53900000 polynomial
+
+%macro HEC_REDUCE_64_TO_32 4
+%define %%XMM_IN_OUT %1 ; [in/out] xmm register with data in and out
+%define %%XT1 %2 ; [clobbered] temporary xmm register
+%define %%XT2 %3 ; [clobbered] temporary xmm register
+%define %%XT3 %4 ; [clobbered] temporary xmm register
+
+%define %%K3_Q %%XT1
+%define %%P_RES %%XT2
+%define %%XTMP %%XT3
+
+ vmovdqa %%K3_Q, [k3_q]
+ vmovdqa %%P_RES, [p_res]
+
+ ;; 64 to 32 bit reduction
+ vpsrldq %%XTMP, %%XMM_IN_OUT, 4
+ vpclmulqdq %%XTMP, %%XTMP, %%K3_Q, 0x10 ; Q
+ vpxor %%XTMP, %%XTMP, %%XMM_IN_OUT
+ vpsrldq %%XTMP, %%XTMP, 4
+
+ vpclmulqdq %%XTMP, %%XTMP, %%P_RES, 0x00 ; P
+ vpxor %%XMM_IN_OUT, %%XTMP, %%XMM_IN_OUT
+%endmacro
+
+;; =============================================================================
+;; HEC compute and header update for 32-bit XGEM headers
+%macro HEC_COMPUTE_32 6
+%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format
+%define %%GT1 %2 ; [clobbered] temporary GP register
+%define %%XT1        %3  ; [clobbered] temporary xmm register
+%define %%XT2        %4  ; [clobbered] temporary xmm register
+%define %%XT3        %5  ; [clobbered] temporary xmm register
+%define %%XT4        %6  ; [clobbered] temporary xmm register
+
+ mov DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+ ;; shift out 13 bits of HEC value for CRC computation
+ shr DWORD(%%GT1), 13
+
+ ;; mask out current HEC value to merge with an updated HEC at the end
+ and DWORD(%%HEC_IN_OUT), 0xffff_e000
+
+ ;; prepare the message for CRC computation
+ vmovd %%XT1, DWORD(%%GT1)
+ vpslldq %%XT1, 4 ; shift left by 32-bits
+
+ HEC_REDUCE_64_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+ ;; extract 32-bit value
+	;; - this would normally be a 20-bit right shift; one bit less as bit 0 is a parity bit
+ vmovd DWORD(%%GT1), %%XT1
+ shr DWORD(%%GT1), (20 - 1)
+
+ ;; merge header bytes with updated 12-bit CRC value and
+ ;; compute parity
+ or DWORD(%%GT1), DWORD(%%HEC_IN_OUT)
+ popcnt DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
+ and DWORD(%%HEC_IN_OUT), 1
+ or DWORD(%%HEC_IN_OUT), DWORD(%%GT1)
+%endmacro
+
+;; =============================================================================
+;; HEC compute and header update for 64-bit XGEM headers
+%macro HEC_COMPUTE_64 6
+%define %%HEC_IN_OUT %1 ; [in/out] GP register with HEC in LE format
+%define %%GT1 %2 ; [clobbered] temporary GP register
+%define %%XT1 %3 ; [clobbered] temporary xmm register
+%define %%XT2 %4 ; [clobbered] temporary xmm register
+%define %%XT3 %5 ; [clobbered] temporary xmm register
+%define %%XT4 %6 ; [clobbered] temporary xmm register
+
+ mov %%GT1, %%HEC_IN_OUT
+ ;; shift out 13 bits of HEC value for CRC computation
+ shr %%GT1, 13
+
+ ;; mask out current HEC value to merge with an updated HEC at the end
+ and %%HEC_IN_OUT, 0xffff_ffff_ffff_e000
+
+ ;; prepare the message for CRC computation
+ vmovq %%XT1, %%GT1
+ vpslldq %%XT1, 4 ; shift left by 32-bits
+
+ HEC_REDUCE_128_TO_32 %%XT1, %%XT2, %%XT3, %%XT4
+
+ ;; extract 32-bit value
+	;; - this would normally be a 20-bit right shift; one bit less as bit 0 is a parity bit
+ vmovd DWORD(%%GT1), %%XT1
+ shr DWORD(%%GT1), (20 - 1)
+
+ ;; merge header bytes with updated 12-bit CRC value and
+ ;; compute parity
+ or %%GT1, %%HEC_IN_OUT
+ popcnt %%HEC_IN_OUT, %%GT1
+ and %%HEC_IN_OUT, 1
+ or %%HEC_IN_OUT, %%GT1
+%endmacro
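+
+;; Hedged C sketch (kept in comments, not assembled) of the merge and parity
+;; step above, assuming - as the shifts and masks in this macro do - that
+;; the updated 12-bit CRC lands in bits 12:1 and bit 0 is the parity bit
+;; ('hdr' and 'crc12' are hypothetical names; the popcount builtin is
+;; GCC/Clang-specific):
+;;
+;;   uint64_t merged = (hdr & 0xffffffffffffe000) | (crc12 << 1);
+;;   merged |= (uint64_t)__builtin_popcountll(merged) & 1; /* even parity */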
+
+;;; ============================================================================
+;;; PON stitched algorithm of AES128-CTR, CRC and BIP
+;;; - this is the master macro implementing the encrypt/decrypt API
+;;; - calls other macros and directly uses registers
+;;; defined at the top of the file
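+;;; XGEM header layout assumed by the shifts and masks below (8 bytes,
+;;; big-endian on the wire, byte-swapped to little-endian for processing,
+;;; cf. ITU-T G.987.3):
+;;;   bits 63:50 - PLI, payload length indicator (MSB 14 bits)
+;;;   bits 12:0  - HEC, 12-bit CRC plus parity in bit 0
+;;; The remaining header fields are carried through unchanged.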
+%macro AES128_CTR_PON 2
+%define %%DIR %1 ; [in] direction "ENC" or "DEC"
+%define %%CIPH %2 ; [in] cipher "CTR" or "NO_CTR"
+
+ push r12
+ push r13
+ push r14
+%ifndef LINUX
+ push r15
+%endif
+
+%ifidn %%DIR, ENC
+ ;; by default write back CRC for encryption
+ mov DWORD(write_back_crc), 1
+%else
+ ;; mark decryption as finished
+ mov DWORD(decrypt_not_done), 1
+%endif
+ ;; START BIP (and update HEC if encrypt direction)
+ ;; - load XGEM header (8 bytes) for BIP (not part of encrypted payload)
+ ;; - convert it into LE
+ ;; - update HEC field in the header
+ ;; - convert it into BE
+ ;; - store back the header (with updated HEC)
+ ;; - start BIP
+	;; (free to use tmp_1, tmp_2 and tmp_3 at this stage)
+ mov tmp_2, [job + _src]
+ add tmp_2, [job + _hash_start_src_offset_in_bytes]
+ mov tmp_3, [tmp_2]
+%ifidn %%DIR, ENC
+ bswap tmp_3 ; go to LE
+ HEC_COMPUTE_64 tmp_3, tmp_1, xtmp1, xtmp2, xtmp3, xtmp4
+ mov bytes_to_crc, tmp_3
+ shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits
+ bswap tmp_3 ; go back to BE
+ mov [tmp_2], tmp_3
+ vmovq xbip, tmp_3
+%else
+ vmovq xbip, tmp_3
+ mov bytes_to_crc, tmp_3
+ bswap bytes_to_crc ; go to LE
+ shr bytes_to_crc, (48 + 2) ; PLI = MSB 14 bits
+%endif
+ cmp bytes_to_crc, 4
+ ja %%_crc_not_zero
+	;; XGEM payload is shorter than or equal to 4 bytes
+%ifidn %%DIR, ENC
+ ;; On encryption, do not write Ethernet FCS back into the message
+ xor DWORD(write_back_crc), DWORD(write_back_crc)
+%else
+ ;; Mark decryption as not finished
+ ;; - Ethernet FCS is not computed
+ ;; - decrypt + BIP to be done at the end
+ xor DWORD(decrypt_not_done), DWORD(decrypt_not_done)
+%endif
+ mov DWORD(bytes_to_crc), 4 ; it will be zero after the next line (avoid jmp)
+%%_crc_not_zero:
+ sub bytes_to_crc, 4 ; subtract size of the CRC itself
+
+%ifidn %%CIPH, CTR
+ ;; - read 16 bytes of IV
+ ;; - convert to little endian format
+ ;; - save least significant 8 bytes in GP register for overflow check
+ mov tmp, [job + _iv]
+ vmovdqu xcounter, [tmp]
+ vpshufb xcounter, [rel byteswap_const]
+ vmovq ctr_check, xcounter
+%endif
+
+ ;; get input buffer (after XGEM header)
+ mov p_in, [job + _src]
+ add p_in, [job + _cipher_start_src_offset_in_bytes]
+
+ ;; get output buffer
+ mov p_out, [job + _dst]
+
+%ifidn %%CIPH, CTR
+ ;; get key pointers
+ mov p_keys, [job + _aes_enc_key_expanded]
+%endif
+
+ ;; initial CRC value
+ vmovdqa xcrc, [rel init_crc_value]
+
+ ;; load CRC constants
+ vmovdqa xcrckey, [rel rk1] ; rk1 and rk2 in xcrckey
+
+ ;; get number of bytes to cipher
+%ifidn %%CIPH, CTR
+ mov num_bytes, [job + _msg_len_to_cipher_in_bytes]
+%else
+ ;; Message length to cipher is 0
+ ;; - length is obtained from message length to hash (BIP) minus XGEM header size
+ mov num_bytes, [job + _msg_len_to_hash_in_bytes]
+ sub num_bytes, 8
+%endif
+ or bytes_to_crc, bytes_to_crc
+ jz %%_crc_done
+
+ cmp bytes_to_crc, 32
+ jae %%_at_least_32_bytes
+
+%ifidn %%DIR, DEC
+ ;; decrypt the buffer first
+ mov tmp, num_bytes
+ CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+ ;; correct in/out pointers - go back to start of the buffers
+ mov tmp, num_bytes
+ and tmp, -16 ; partial block handler doesn't increment pointers
+ sub p_in, tmp
+ sub p_out, tmp
+%endif ; DECRYPTION
+
+ ;; less than 32 bytes
+ cmp bytes_to_crc, 16
+ je %%_exact_16_left
+ jl %%_less_than_16_left
+ ;; load the plaintext
+%ifidn %%DIR, ENC
+ vmovdqu xtmp1, [p_in]
+%else
+ vmovdqu xtmp1, [p_out]
+%endif
+ vpxor xcrc, xtmp1 ; xor the initial crc value
+ jmp %%_crc_two_xmms
+
+%%_exact_16_left:
+%ifidn %%DIR, ENC
+ vmovdqu xtmp1, [p_in]
+%else
+ vmovdqu xtmp1, [p_out]
+%endif
+ vpxor xcrc, xtmp1 ; xor the initial crc value
+ jmp %%_128_done
+
+%%_less_than_16_left:
+%ifidn %%DIR, ENC
+ simd_load_avx_15_1 xtmp1, p_in, bytes_to_crc
+%else
+ simd_load_avx_15_1 xtmp1, p_out, bytes_to_crc
+%endif
+ vpxor xcrc, xtmp1 ; xor the initial crc value
+
+ lea tmp, [rel pshufb_shf_table]
+ vmovdqu xtmp1, [tmp + bytes_to_crc]
+ vpshufb xcrc, xtmp1
+ jmp %%_128_done
+
+%%_at_least_32_bytes:
+ cmp bytes_to_crc, 64
+ jb %%_crc_below_64_bytes
+
+ DO_PON_4 p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, xtmp6, \
+ xtmp7, xtmp8, xtmp9, xtmp10, xtmp11, first_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 64
+ sub bytes_to_crc, 64
+%ifidn %%DIR, ENC
+ jz %%_128_done
+%endif
+
+ align 16
+%%_main_loop_64:
+ cmp bytes_to_crc, 64
+ jb %%_main_loop
+
+ DO_PON_4 p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, xtmp4, xtmp5, xtmp6, \
+ xtmp7, xtmp8, xtmp9, xtmp10, xtmp11, next_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 64
+ sub bytes_to_crc, 64
+%ifidn %%DIR, ENC
+ jz %%_128_done
+%endif
+ jmp %%_main_loop_64
+
+%%_crc_below_64_bytes:
+ DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, first_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 16
+ sub bytes_to_crc, 16
+
+ align 16
+%%_main_loop:
+ cmp bytes_to_crc, 16
+ jb %%_exit_loop
+ DO_PON p_keys, NUM_AES_ROUNDS, xcounter, p_in, p_out, xbip, \
+ xcrc, xcrckey, xtmp1, xtmp2, xtmp3, next_crc, %%DIR, \
+ %%CIPH, ctr_check
+ sub num_bytes, 16
+ sub bytes_to_crc, 16
+%ifidn %%DIR, ENC
+ jz %%_128_done
+%endif
+ jmp %%_main_loop
+
+%%_exit_loop:
+
+%ifidn %%DIR, DEC
+ ;; decrypt rest of the message including CRC and optional padding
+ mov tmp, num_bytes
+
+ CIPHER_BIP_REST tmp, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+ mov tmp, num_bytes ; correct in/out pointers - to point before cipher & BIP
+ and tmp, -16 ; partial block handler doesn't increment pointers
+ sub p_in, tmp
+ sub p_out, tmp
+
+ or bytes_to_crc, bytes_to_crc
+ jz %%_128_done
+%endif ; DECRYPTION
+
+ ;; Partial bytes left - complete CRC calculation
+%%_crc_two_xmms:
+ lea tmp, [rel pshufb_shf_table]
+ vmovdqu xtmp2, [tmp + bytes_to_crc]
+	;; @note: for an in-place operation (the default) this load creates
+	;; a store-to-load forwarding problem.
+	;; However, there is no easy way to address it at the moment.
+%ifidn %%DIR, ENC
+ vmovdqu xtmp1, [p_in - 16 + bytes_to_crc] ; xtmp1 = data for CRC
+%else
+ vmovdqu xtmp1, [p_out - 16 + bytes_to_crc] ; xtmp1 = data for CRC
+%endif
+ vmovdqa xtmp3, xcrc
+ vpshufb xcrc, xtmp2 ; top num_bytes with LSB xcrc
+ vpxor xtmp2, [rel mask3]
+ vpshufb xtmp3, xtmp2 ; bottom (16 - num_bytes) with MSB xcrc
+
+ ;; data bytes_to_crc (top) blended with MSB bytes of CRC (bottom)
+ vpblendvb xtmp3, xtmp1, xtmp2
+
+ ;; final CRC calculation
+ vpclmulqdq xtmp1, xcrc, xcrckey, 0x01
+ vpclmulqdq xcrc, xcrc, xcrckey, 0x10
+ vpxor xcrc, xtmp3
+ vpxor xcrc, xtmp1
+
+%%_128_done:
+ CRC32_REDUCE_128_TO_32 ethernet_fcs, xcrc, xtmp1, xtmp2, xcrckey
+
+%%_crc_done:
+ ;; @todo - store-to-load problem in ENC case (to be fixed later)
+ ;; - store CRC in input buffer and authentication tag output
+ ;; - encrypt remaining bytes
+%ifidn %%DIR, ENC
+ or DWORD(write_back_crc), DWORD(write_back_crc)
+ jz %%_skip_crc_write_back
+ mov [p_in + bytes_to_crc], DWORD(ethernet_fcs)
+%%_skip_crc_write_back:
+%endif
+ mov tmp, [job + _auth_tag_output]
+ mov [tmp + 4], DWORD(ethernet_fcs)
+
+ or num_bytes, num_bytes
+ jz %%_do_not_cipher_the_rest
+
+	;; ENC: cipher the rest of the message - partial bytes including
+	;;      CRC and optional padding
+	;; DEC: decrypt the rest of the message - this can only happen when
+	;;      the XGEM payload is short and padding was added
+%ifidn %%DIR, DEC
+ or DWORD(decrypt_not_done), DWORD(decrypt_not_done)
+ jnz %%_do_not_cipher_the_rest
+%endif
+ CIPHER_BIP_REST num_bytes, %%DIR, %%CIPH, p_in, p_out, p_keys, xbip, \
+ xcounter, xtmp1, xtmp2, xtmp3, ctr_check, tmp2, tmp3
+
+%%_do_not_cipher_the_rest:
+
+ ;; finalize BIP
+ vpsrldq xtmp1, xbip, 4
+ vpsrldq xtmp2, xbip, 8
+ vpsrldq xtmp3, xbip, 12
+ vpxor xtmp1, xtmp1, xtmp2
+ vpxor xbip, xbip, xtmp3
+ vpxor xbip, xbip, xtmp1
+ vmovd [tmp], xbip ; tmp already holds _auth_tag_output
+
+ ;; set job status
+ or dword [job + _status], STS_COMPLETED
+
+ ;; return job
+ mov rax, job
+
+%ifndef LINUX
+ pop r15
+%endif
+ pop r14
+ pop r13
+ pop r12
+%endmacro ; AES128_CTR_PON
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; submit_job_pon_enc_avx(JOB_AES_HMAC *job)
+align 64
+MKGLOBAL(submit_job_pon_enc_avx,function,internal)
+submit_job_pon_enc_avx:
+ AES128_CTR_PON ENC, CTR
+ ret
+
+;;; submit_job_pon_dec_avx(JOB_AES_HMAC *job)
+align 64
+MKGLOBAL(submit_job_pon_dec_avx,function,internal)
+submit_job_pon_dec_avx:
+ AES128_CTR_PON DEC, CTR
+ ret
+
+;;; submit_job_pon_enc_no_ctr_avx(JOB_AES_HMAC *job)
+align 64
+MKGLOBAL(submit_job_pon_enc_no_ctr_avx,function,internal)
+submit_job_pon_enc_no_ctr_avx:
+ AES128_CTR_PON ENC, NO_CTR
+ ret
+
+;;; submit_job_pon_dec_no_ctr_avx(JOB_AES_HMAC *job)
+align 64
+MKGLOBAL(submit_job_pon_dec_no_ctr_avx,function,internal)
+submit_job_pon_dec_no_ctr_avx:
+ AES128_CTR_PON DEC, NO_CTR
+ ret
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif