summaryrefslogtreecommitdiffstats
path: root/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm
diff options
context:
space:
mode:
Diffstat (limited to '')
-rwxr-xr-xsrc/spdk/intel-ipsec-mb/avx/zuc_avx.asm1146
1 files changed, 1146 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm b/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm
new file mode 100755
index 000000000..e7c6bad8a
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm
@@ -0,0 +1,1146 @@
+;;
+;; Copyright (c) 2009-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%include "include/os.asm"
+%include "include/reg_sizes.asm"
+
+extern lookup_8bit_avx
+
+section .data
+default rel
+align 64
+S0:
+db 0x3e,0x72,0x5b,0x47,0xca,0xe0,0x00,0x33,0x04,0xd1,0x54,0x98,0x09,0xb9,0x6d,0xcb
+db 0x7b,0x1b,0xf9,0x32,0xaf,0x9d,0x6a,0xa5,0xb8,0x2d,0xfc,0x1d,0x08,0x53,0x03,0x90
+db 0x4d,0x4e,0x84,0x99,0xe4,0xce,0xd9,0x91,0xdd,0xb6,0x85,0x48,0x8b,0x29,0x6e,0xac
+db 0xcd,0xc1,0xf8,0x1e,0x73,0x43,0x69,0xc6,0xb5,0xbd,0xfd,0x39,0x63,0x20,0xd4,0x38
+db 0x76,0x7d,0xb2,0xa7,0xcf,0xed,0x57,0xc5,0xf3,0x2c,0xbb,0x14,0x21,0x06,0x55,0x9b
+db 0xe3,0xef,0x5e,0x31,0x4f,0x7f,0x5a,0xa4,0x0d,0x82,0x51,0x49,0x5f,0xba,0x58,0x1c
+db 0x4a,0x16,0xd5,0x17,0xa8,0x92,0x24,0x1f,0x8c,0xff,0xd8,0xae,0x2e,0x01,0xd3,0xad
+db 0x3b,0x4b,0xda,0x46,0xeb,0xc9,0xde,0x9a,0x8f,0x87,0xd7,0x3a,0x80,0x6f,0x2f,0xc8
+db 0xb1,0xb4,0x37,0xf7,0x0a,0x22,0x13,0x28,0x7c,0xcc,0x3c,0x89,0xc7,0xc3,0x96,0x56
+db 0x07,0xbf,0x7e,0xf0,0x0b,0x2b,0x97,0x52,0x35,0x41,0x79,0x61,0xa6,0x4c,0x10,0xfe
+db 0xbc,0x26,0x95,0x88,0x8a,0xb0,0xa3,0xfb,0xc0,0x18,0x94,0xf2,0xe1,0xe5,0xe9,0x5d
+db 0xd0,0xdc,0x11,0x66,0x64,0x5c,0xec,0x59,0x42,0x75,0x12,0xf5,0x74,0x9c,0xaa,0x23
+db 0x0e,0x86,0xab,0xbe,0x2a,0x02,0xe7,0x67,0xe6,0x44,0xa2,0x6c,0xc2,0x93,0x9f,0xf1
+db 0xf6,0xfa,0x36,0xd2,0x50,0x68,0x9e,0x62,0x71,0x15,0x3d,0xd6,0x40,0xc4,0xe2,0x0f
+db 0x8e,0x83,0x77,0x6b,0x25,0x05,0x3f,0x0c,0x30,0xea,0x70,0xb7,0xa1,0xe8,0xa9,0x65
+db 0x8d,0x27,0x1a,0xdb,0x81,0xb3,0xa0,0xf4,0x45,0x7a,0x19,0xdf,0xee,0x78,0x34,0x60
+
+S1:
+db 0x55,0xc2,0x63,0x71,0x3b,0xc8,0x47,0x86,0x9f,0x3c,0xda,0x5b,0x29,0xaa,0xfd,0x77
+db 0x8c,0xc5,0x94,0x0c,0xa6,0x1a,0x13,0x00,0xe3,0xa8,0x16,0x72,0x40,0xf9,0xf8,0x42
+db 0x44,0x26,0x68,0x96,0x81,0xd9,0x45,0x3e,0x10,0x76,0xc6,0xa7,0x8b,0x39,0x43,0xe1
+db 0x3a,0xb5,0x56,0x2a,0xc0,0x6d,0xb3,0x05,0x22,0x66,0xbf,0xdc,0x0b,0xfa,0x62,0x48
+db 0xdd,0x20,0x11,0x06,0x36,0xc9,0xc1,0xcf,0xf6,0x27,0x52,0xbb,0x69,0xf5,0xd4,0x87
+db 0x7f,0x84,0x4c,0xd2,0x9c,0x57,0xa4,0xbc,0x4f,0x9a,0xdf,0xfe,0xd6,0x8d,0x7a,0xeb
+db 0x2b,0x53,0xd8,0x5c,0xa1,0x14,0x17,0xfb,0x23,0xd5,0x7d,0x30,0x67,0x73,0x08,0x09
+db 0xee,0xb7,0x70,0x3f,0x61,0xb2,0x19,0x8e,0x4e,0xe5,0x4b,0x93,0x8f,0x5d,0xdb,0xa9
+db 0xad,0xf1,0xae,0x2e,0xcb,0x0d,0xfc,0xf4,0x2d,0x46,0x6e,0x1d,0x97,0xe8,0xd1,0xe9
+db 0x4d,0x37,0xa5,0x75,0x5e,0x83,0x9e,0xab,0x82,0x9d,0xb9,0x1c,0xe0,0xcd,0x49,0x89
+db 0x01,0xb6,0xbd,0x58,0x24,0xa2,0x5f,0x38,0x78,0x99,0x15,0x90,0x50,0xb8,0x95,0xe4
+db 0xd0,0x91,0xc7,0xce,0xed,0x0f,0xb4,0x6f,0xa0,0xcc,0xf0,0x02,0x4a,0x79,0xc3,0xde
+db 0xa3,0xef,0xea,0x51,0xe6,0x6b,0x18,0xec,0x1b,0x2c,0x80,0xf7,0x74,0xe7,0xff,0x21
+db 0x5a,0x6a,0x54,0x1e,0x41,0x31,0x92,0x35,0xc4,0x33,0x07,0x0a,0xba,0x7e,0x0e,0x34
+db 0x88,0xb1,0x98,0x7c,0xf3,0x3d,0x60,0x6c,0x7b,0xca,0xd3,0x1f,0x32,0x65,0x04,0x28
+db 0x64,0xbe,0x85,0x9b,0x2f,0x59,0x8a,0xd7,0xb0,0x25,0xac,0xaf,0x12,0x03,0xe2,0xf2
+
+EK_d:
+dw 0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF,
+dw 0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC
+
+mask31:
+dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
+
+align 16
+bit_reverse_table_l:
+db 0x00, 0x08, 0x04, 0x0c, 0x02, 0x0a, 0x06, 0x0e, 0x01, 0x09, 0x05, 0x0d, 0x03, 0x0b, 0x07, 0x0f
+
+align 16
+bit_reverse_table_h:
+db 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0
+
+align 16
+bit_reverse_and_table:
+db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
+
+align 16
+data_mask_64bits:
+dd 0xffffffff, 0xffffffff, 0x00000000, 0x00000000
+
+bit_mask_table:
+db 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe
+
+
+section .text
+align 64
+
+%define OFFSET_FR1 (16*4)
+%define OFFSET_FR2 (17*4)
+%define OFFSET_BRC_X0 (18*4)
+%define OFFSET_BRC_X1 (19*4)
+%define OFFSET_BRC_X2 (20*4)
+%define OFFSET_BRC_X3 (21*4)
+
+%define MASK31 xmm12
+
+%define OFS_R1 (16*(4*4))
+%define OFS_R2 (OFS_R1 + (4*4))
+%define OFS_X0 (OFS_R2 + (4*4))
+%define OFS_X1 (OFS_X0 + (4*4))
+%define OFS_X2 (OFS_X1 + (4*4))
+%define OFS_X3 (OFS_X2 + (4*4))
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET XMM_STORAGE
+
+%macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+%ifidn __OUTPUT_FORMAT__, win64
+ push rdi
+ push rsi
+%endif
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm7
+ vmovdqu [rsp + 2*16],xmm8
+ vmovdqu [rsp + 3*16],xmm9
+ vmovdqu [rsp + 4*16],xmm10
+ vmovdqu [rsp + 5*16],xmm11
+ vmovdqu [rsp + 6*16],xmm12
+ vmovdqu [rsp + 7*16],xmm13
+ vmovdqu [rsp + 8*16],xmm14
+ vmovdqu [rsp + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 9*16]
+ vmovdqu xmm14, [rsp + 8*16]
+ vmovdqu xmm13, [rsp + 7*16]
+ vmovdqu xmm12, [rsp + 6*16]
+ vmovdqu xmm11, [rsp + 5*16]
+ vmovdqu xmm10, [rsp + 4*16]
+ vmovdqu xmm9, [rsp + 3*16]
+ vmovdqu xmm8, [rsp + 2*16]
+ vmovdqu xmm7, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+%endif
+ mov rsp, r14
+%ifidn __OUTPUT_FORMAT__, win64
+ pop rsi
+ pop rdi
+%endif
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;
+;; make_u31()
+;;
+%macro make_u31 4
+
+%define %%Rt %1
+%define %%Ke %2
+%define %%Ek %3
+%define %%Iv %4
+ xor %%Rt, %%Rt
+ shrd %%Rt, %%Iv, 8
+ shrd %%Rt, %%Ek, 15
+ shrd %%Rt, %%Ke, 9
+%endmacro
+
+
+;
+; bits_reorg4()
+;
+; params
+; %1 - round number
+; rax - LFSR pointer
+; uses
+;
+; return
+;
+%macro bits_reorg4 1
+ ;
+ ; xmm15 = LFSR_S15
+ ; xmm14 = LFSR_S14
+ ; xmm11 = LFSR_S11
+ ; xmm9 = LFSR_S9
+ ; xmm7 = LFSR_S7
+ ; xmm5 = LFSR_S5
+ ; xmm2 = LFSR_S2
+ ; xmm0 = LFSR_S0
+ ;
+ vmovdqa xmm15, [rax + ((15 + %1) % 16)*16]
+ vmovdqa xmm14, [rax + ((14 + %1) % 16)*16]
+ vmovdqa xmm11, [rax + ((11 + %1) % 16)*16]
+ vmovdqa xmm9, [rax + (( 9 + %1) % 16)*16]
+ vmovdqa xmm7, [rax + (( 7 + %1) % 16)*16]
+ vmovdqa xmm5, [rax + (( 5 + %1) % 16)*16]
+ vmovdqa xmm2, [rax + (( 2 + %1) % 16)*16]
+ vmovdqa xmm0, [rax + (( 0 + %1) % 16)*16]
+
+ vpxor xmm1, xmm1
+ vpslld xmm15, 1
+ vpblendw xmm3, xmm14, xmm1, 0xAA
+ vpblendw xmm15, xmm3, xmm15, 0xAA
+
+ vmovdqa [rax + OFS_X0], xmm15 ; BRC_X0
+ vpslld xmm11, 16
+ vpsrld xmm9, 15
+ vpor xmm11, xmm9
+ vmovdqa [rax + OFS_X1], xmm11 ; BRC_X1
+ vpslld xmm7, 16
+ vpsrld xmm5, 15
+ vpor xmm7, xmm5
+ vmovdqa [rax + OFS_X2], xmm7 ; BRC_X2
+ vpslld xmm2, 16
+ vpsrld xmm0, 15
+ vpor xmm2, xmm0
+ vmovdqa [rax + OFS_X3], xmm2 ; BRC_X3
+%endmacro
+
+%macro lookup_single_sbox 2
+%define %%table %1 ; [in] Pointer to table to look up
+%define %%idx_val %2 ; [in/out] Index to look up and returned value (rcx, rdx, r8, r9)
+
+%ifdef SAFE_LOOKUP
+ ;; Save all registers used in lookup_8bit (xmm0-5, r9,r10)
+ ;; and registers for param passing and return (4 regs, OS dependent)
+ ;; (6*16 + 6*8 = 144 bytes)
+ sub rsp, 144
+
+ vmovdqu [rsp], xmm0
+ vmovdqu [rsp + 16], xmm1
+ vmovdqu [rsp + 32], xmm2
+ vmovdqu [rsp + 48], xmm3
+ vmovdqu [rsp + 64], xmm4
+ vmovdqu [rsp + 80], xmm5
+ mov [rsp + 96], r9
+ mov [rsp + 104], r10
+
+%ifdef LINUX
+ mov [rsp + 112], rdi
+ mov [rsp + 120], rsi
+ mov [rsp + 128], rdx
+ mov rdi, %%table
+ mov rsi, %%idx_val
+ mov rdx, 256
+%else
+%ifnidni %%idx_val, rcx
+ mov [rsp + 112], rcx
+%endif
+%ifnidni %%idx_val, rdx
+ mov [rsp + 120], rdx
+%endif
+%ifnidni %%idx_val, r8
+ mov [rsp + 128], r8
+%endif
+
+ mov rdx, %%idx_val
+ mov rcx, %%table
+ mov r8, 256
+%endif
+ mov [rsp + 136], rax
+
+ call lookup_8bit_avx
+
+ ;; Restore all registers
+ vmovdqu xmm0, [rsp]
+ vmovdqu xmm1, [rsp + 16]
+ vmovdqu xmm2, [rsp + 32]
+ vmovdqu xmm3, [rsp + 48]
+ vmovdqu xmm4, [rsp + 64]
+ vmovdqu xmm5, [rsp + 80]
+ mov r9, [rsp + 96]
+ mov r10, [rsp + 104]
+
+%ifdef LINUX
+ mov rdi, [rsp + 112]
+ mov rsi, [rsp + 120]
+ mov rdx, [rsp + 128]
+%else
+%ifnidni %%idx_val, rcx
+ mov rcx, [rsp + 112]
+%endif
+%ifnidni %%idx_val, rdx
+ mov rdx, [rsp + 120]
+%endif
+%ifnidni %%idx_val, rcx
+ mov r8, [rsp + 128]
+%endif
+%endif
+
+ ;; Move returned value from lookup function, before restoring rax
+ mov DWORD(%%idx_val), eax
+ mov rax, [rsp + 136]
+
+ add rsp, 144
+
+%else ;; SAFE_LOOKUP
+
+ movzx DWORD(%%idx_val), BYTE [%%table + %%idx_val]
+
+%endif ;; SAFE_LOOKUP
+%endmacro
+
+;
+; sbox_lkup()
+;
+; params
+; %1 R1/R2 table offset
+; %2 R1/R2 entry offset
+; %3 xmm reg name
+; uses
+; rcx,rdx,r8,r9,r10,rsi
+; return
+;
+%macro sbox_lkup 3
+ vpextrb rcx, %3, (0 + (%2 * 4))
+ lookup_single_sbox rsi, rcx
+ vpextrb rdx, %3, (1 + (%2 * 4))
+ lookup_single_sbox rdi, rdx
+
+ xor r10, r10
+ vpextrb r8, %3, (2 + (%2 * 4))
+ lookup_single_sbox rsi, r8
+ vpextrb r9, %3, (3 + (%2 * 4))
+ lookup_single_sbox rdi, r9
+
+ shrd r10d, ecx, 8
+ shrd r10d, edx, 8
+ shrd r10d, r8d, 8
+ shrd r10d, r9d, 8
+ mov [rax + %1 + (%2 * 4)], r10d
+%endmacro
+
+
+;
+; rot_mod32()
+;
+; uses xmm7
+;
+%macro rot_mod32 3
+ vpslld %1, %2, %3
+ vpsrld xmm7, %2, (32 - %3)
+
+ vpor %1, xmm7
+%endmacro
+
+
+;
+; nonlin_fun4()
+;
+; params
+; %1 == 1, then calculate W
+; uses
+;
+; return
+; xmm0 = W value, updates F_R1[] / F_R2[]
+;
+%macro nonlin_fun4 1
+
+%if (%1 == 1)
+ vmovdqa xmm0, [rax + OFS_X0]
+ vpxor xmm0, [rax + OFS_R1]
+ vpaddd xmm0, [rax + OFS_R2] ; W = (BRC_X0 ^ F_R1) + F_R2
+%endif
+ ;
+ vmovdqa xmm1, [rax + OFS_R1]
+ vmovdqa xmm2, [rax + OFS_R2]
+ vpaddd xmm1, [rax + OFS_X1] ; W1 = F_R1 + BRC_X1
+ vpxor xmm2, [rax + OFS_X2] ; W2 = F_R2 ^ BRC_X2
+ ;
+
+ vpslld xmm3, xmm1, 16
+ vpsrld xmm4, xmm1, 16
+ vpslld xmm5, xmm2, 16
+ vpsrld xmm6, xmm2, 16
+ vpor xmm1, xmm3, xmm6
+ vpor xmm2, xmm4, xmm5
+
+ ;
+ rot_mod32 xmm3, xmm1, 2
+ rot_mod32 xmm4, xmm1, 10
+ rot_mod32 xmm5, xmm1, 18
+ rot_mod32 xmm6, xmm1, 24
+ vpxor xmm1, xmm3
+ vpxor xmm1, xmm4
+ vpxor xmm1, xmm5
+ vpxor xmm1, xmm6 ; XMM1 = U = L1(P)
+
+ sbox_lkup OFS_R1, 0, xmm1 ; F_R1[0]
+ sbox_lkup OFS_R1, 1, xmm1 ; F_R1[1]
+ sbox_lkup OFS_R1, 2, xmm1 ; F_R1[2]
+ sbox_lkup OFS_R1, 3, xmm1 ; F_R1[3]
+ ;
+ rot_mod32 xmm3, xmm2, 8
+ rot_mod32 xmm4, xmm2, 14
+ rot_mod32 xmm5, xmm2, 22
+ rot_mod32 xmm6, xmm2, 30
+ vpxor xmm2, xmm3
+ vpxor xmm2, xmm4
+ vpxor xmm2, xmm5
+ vpxor xmm2, xmm6 ; XMM2 = V = L2(Q)
+ ;
+
+ sbox_lkup OFS_R2, 0, xmm2 ; F_R2[0]
+ sbox_lkup OFS_R2, 1, xmm2 ; F_R2[1]
+ sbox_lkup OFS_R2, 2, xmm2 ; F_R2[2]
+ sbox_lkup OFS_R2, 3, xmm2 ; F_R2[3]
+%endmacro
+
+
+;
+; store_kstr4()
+;
+; params
+;
+; uses
+; xmm0 as input
+; return
+;
+%macro store_kstr4 0
+ vpxor xmm0, [rax + OFS_X3]
+ vpextrd r15d, xmm0, 3
+ pop r9 ; *pKeyStr4
+ vpextrd r14d, xmm0, 2
+ pop r8 ; *pKeyStr3
+ vpextrd r13d, xmm0, 1
+ pop rdx ; *pKeyStr2
+ vpextrd r12d, xmm0, 0
+ pop rcx ; *pKeyStr1
+ mov [r9], r15d
+ mov [r8], r14d
+ mov [rdx], r13d
+ mov [rcx], r12d
+ add rcx, 4
+ add rdx, 4
+ add r8, 4
+ add r9, 4
+ push rcx
+ push rdx
+ push r8
+ push r9
+%endmacro
+
+
+;
+; add_mod31()
+; add two 32-bit args and reduce mod (2^31-1)
+; params
+; %1 - arg1/res
+; %2 - arg2
+; uses
+; xmm2
+; return
+; %1
+%macro add_mod31 2
+ vpaddd %1, %2
+ vpsrld xmm2, %1, 31
+ vpand %1, MASK31
+ vpaddd %1, xmm2
+%endmacro
+
+
+;
+; rot_mod31()
+; rotate (mult by pow of 2) 32-bit arg and reduce mod (2^31-1)
+; params
+; %1 - arg
+; %2 - # of bits
+; uses
+; xmm2
+; return
+; %1
+%macro rot_mod31 2
+
+ vpslld xmm2, %1, %2
+ vpsrld %1, %1, (31 - %2)
+
+ vpor %1, xmm2
+ vpand %1, MASK31
+%endmacro
+
+
+;
+; lfsr_updt4()
+;
+; params
+; %1 - round number
+; uses
+; xmm0 as input (ZERO or W)
+; return
+;
+%macro lfsr_updt4 1
+ ;
+ ; xmm1 = LFSR_S0
+ ; xmm4 = LFSR_S4
+ ; xmm10 = LFSR_S10
+ ; xmm13 = LFSR_S13
+ ; xmm15 = LFSR_S15
+ ;
+ vpxor xmm3, xmm3
+ vmovdqa xmm1, [rax + (( 0 + %1) % 16)*16]
+ vmovdqa xmm4, [rax + (( 4 + %1) % 16)*16]
+ vmovdqa xmm10, [rax + ((10 + %1) % 16)*16]
+ vmovdqa xmm13, [rax + ((13 + %1) % 16)*16]
+ vmovdqa xmm15, [rax + ((15 + %1) % 16)*16]
+
+ ; Calculate LFSR feedback
+ add_mod31 xmm0, xmm1
+ rot_mod31 xmm1, 8
+ add_mod31 xmm0, xmm1
+ rot_mod31 xmm4, 20
+ add_mod31 xmm0, xmm4
+ rot_mod31 xmm10, 21
+ add_mod31 xmm0, xmm10
+ rot_mod31 xmm13, 17
+ add_mod31 xmm0, xmm13
+ rot_mod31 xmm15, 15
+ add_mod31 xmm0, xmm15
+
+
+
+ vmovdqa [rax + (( 0 + %1) % 16)*16], xmm0
+
+ ; LFSR_S16 = (LFSR_S15++) = eax
+%endmacro
+
+
+;
+; key_expand_4()
+;
+%macro key_expand_4 2
+ movzx r8d, byte [rdi + (%1 + 0)]
+ movzx r9d, word [rbx + ((%1 + 0)*2)]
+ movzx r10d, byte [rsi + (%1 + 0)]
+ make_u31 r11d, r8d, r9d, r10d
+ mov [rax + (((%1 + 0)*16)+(%2*4))], r11d
+
+ movzx r12d, byte [rdi + (%1 + 1)]
+ movzx r13d, word [rbx + ((%1 + 1)*2)]
+ movzx r14d, byte [rsi + (%1 + 1)]
+ make_u31 r15d, r12d, r13d, r14d
+ mov [rax + (((%1 + 1)*16)+(%2*4))], r15d
+%endmacro
+
+
+MKGLOBAL(asm_ZucInitialization_4_avx,function,internal)
+asm_ZucInitialization_4_avx:
+
+%ifdef LINUX
+ %define pKe rdi
+ %define pIv rsi
+ %define pState rdx
+%else
+ %define pKe rcx
+ %define pIv rdx
+ %define pState r8
+%endif
+
+ ; Save non-volatile registers
+ push rbx
+ push rdi
+ push rsi
+ push r12
+ push r13
+ push r14
+ push r15
+ push rdx
+
+ lea rax, [pState] ; load pointer to LFSR
+ push pState ; Save LFSR Pointer to stack
+
+ ; setup the key pointer for first buffer key expand
+ mov rbx, [pKe] ; load the pointer to the array of keys into rbx
+
+ push pKe ; save rdi (key pointer) to the stack
+ lea rdi, [rbx] ; load the pointer to the first key into rdi
+
+
+ ; setup the IV pointer for first buffer key expand
+ mov rcx, [pIv] ; load the pointer to the array of IV's
+ push pIv ; save the IV pointer to the stack
+ lea rsi, [rcx] ; load the first IV pointer
+
+ lea rbx, [EK_d] ; load D variables
+
+ ; Expand key packet 1
+ key_expand_4 0, 0
+ key_expand_4 2, 0
+ key_expand_4 4, 0
+ key_expand_4 6, 0
+ key_expand_4 8, 0
+ key_expand_4 10, 0
+ key_expand_4 12, 0
+ key_expand_4 14, 0
+
+
+ ;second packet key expand here - reset pointers
+ pop rdx ; get IV array pointer from Stack
+ mov rcx, [rdx+8] ; load offset to IV 2 in array
+ lea rsi, [rcx] ; load pointer to IV2
+
+ pop rbx ; get Key array pointer from Stack
+ mov rcx, [rbx+8] ; load offset to key 2 in array
+ lea rdi, [rcx] ; load pointer to Key 2
+
+ push rbx ; save Key pointer
+ push rdx ; save IV pointer
+
+ lea rbx, [EK_d]
+
+ ; Expand key packet 2
+ key_expand_4 0, 1
+ key_expand_4 2, 1
+ key_expand_4 4, 1
+ key_expand_4 6, 1
+ key_expand_4 8, 1
+ key_expand_4 10, 1
+ key_expand_4 12, 1
+ key_expand_4 14, 1
+
+
+
+ ;Third packet key expand here - reset pointers
+ pop rdx ; get IV array pointer from Stack
+ mov rcx, [rdx+16] ; load offset to IV 3 in array
+ lea rsi, [rcx] ; load pointer to IV3
+
+ pop rbx ; get Key array pointer from Stack
+ mov rcx, [rbx+16] ; load offset to key 3 in array
+ lea rdi, [rcx] ; load pointer to Key 3
+
+ push rbx ; save Key pointer
+ push rdx ; save IV pointer
+ lea rbx, [EK_d]
+ ; Expand key packet 3
+ key_expand_4 0, 2
+ key_expand_4 2, 2
+ key_expand_4 4, 2
+ key_expand_4 6, 2
+ key_expand_4 8, 2
+ key_expand_4 10, 2
+ key_expand_4 12, 2
+ key_expand_4 14, 2
+
+
+
+ ;fourth packet key expand here - reset pointers
+ pop rdx ; get IV array pointer from Stack
+ mov rcx, [rdx+24] ; load offset to IV 4 in array
+ lea rsi, [rcx] ; load pointer to IV4
+
+ pop rbx ; get Key array pointer from Stack
+ mov rcx, [rbx+24] ; load offset to key 2 in array
+ lea rdi, [rcx] ; load pointer to Key 2
+ lea rbx, [EK_d]
+ ; Expand key packet 4
+ key_expand_4 0, 3
+ key_expand_4 2, 3
+ key_expand_4 4, 3
+ key_expand_4 6, 3
+ key_expand_4 8, 3
+ key_expand_4 10, 3
+ key_expand_4 12, 3
+ key_expand_4 14, 3
+
+ ; Set R1 and R2 to zero
+ ;xor r10, r10
+ ;xor r11, r11
+
+
+
+ ; Load read-only registers
+ lea rdi, [S0] ; used by sbox_lkup() macro
+ lea rsi, [S1]
+ vmovdqa xmm12, [mask31]
+
+ ; Shift LFSR 32-times, update state variables
+%assign N 0
+%rep 32
+ pop rdx
+ lea rax, [rdx]
+ push rdx
+
+ bits_reorg4 N
+ nonlin_fun4 1
+ vpsrld xmm0,1 ; Shift out LSB of W
+
+ pop rdx
+ lea rax, [rdx]
+ push rdx
+
+ lfsr_updt4 N ; W (xmm0) used in LFSR update - not set to zero
+%assign N N+1
+%endrep
+
+ ; And once more, initial round from keygen phase = 33 times
+ pop rdx
+ lea rax, [rdx]
+ push rdx
+
+ bits_reorg4 0
+ nonlin_fun4 0
+
+ pop rdx
+ lea rax, [rdx]
+
+ vpxor xmm0, xmm0
+ lfsr_updt4 0
+
+
+
+ ; Restore non-volatile registers
+ pop rdx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rsi
+ pop rdi
+ pop rbx
+
+ ret
+;
+;
+;
+;;
+;; void asm_ZucGenKeystream64B_4_avx(state4_t *pSta, u32* pKeyStr1, u32* pKeyStr2, u32* pKeyStr3, u32* pKeyStr4);
+;;
+;; WIN64
+;; RCX - pSta
+;; RDX - pKeyStr1
+;; R8 - pKeyStr2
+;; R9 - pKeyStr3
+;; Stack - pKeyStr4
+;;
+;; LIN64
+;; RDI - pSta
+;; RSI - pKeyStr1
+;; RDX - pKeyStr2
+;; RCX - pKeyStr3
+;; R8 - pKeyStr4
+;;
+MKGLOBAL(asm_ZucGenKeystream64B_4_avx,function,internal)
+asm_ZucGenKeystream64B_4_avx:
+
+%ifdef LINUX
+ %define pState rdi
+ %define pKS1 rsi
+ %define pKS2 rdx
+ %define pKS3 rcx
+ %define pKS4 r8
+%else
+ %define pState rcx
+ %define pKS1 rdx
+ %define pKS2 r8
+ %define pKS3 r9
+ %define pKS4 rax
+%endif
+
+%ifndef LINUX
+ mov rax, [rsp + 8*5] ; 5th parameter from stack
+%endif
+
+ ; Save non-volatile registers
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+
+%ifndef LINUX
+ push rdi
+ push rsi
+%endif
+ ; Store 4 keystream pointers on the stack
+
+ push pKS1
+ push pKS2
+ push pKS3
+ push pKS4
+
+
+ ; Load state pointer in RAX
+ mov rax, pState
+
+
+ ; Load read-only registers
+ lea rdi, [S0] ; used by sbox_lkup() macro
+ lea rsi, [S1]
+ vmovdqa xmm12, [mask31]
+
+ ; Generate 64B of keystream in 16 rounds
+%assign N 1
+%rep 16
+ bits_reorg4 N
+ nonlin_fun4 1
+ store_kstr4
+ vpxor xmm0, xmm0
+ lfsr_updt4 N
+%assign N N+1
+%endrep
+
+ ; Take keystream pointers off (#push = #pops)
+ pop rax
+ pop rax
+ pop rax
+ pop rax
+
+%ifndef LINUX
+ pop rsi
+ pop rdi
+%endif
+
+ ; Restore non-volatile registers
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
+
+
+;;
+;; extern uint32_t asm_Eia3RemainderAVX(const void *ks, const void *data, uint64_t n_bits)
+;;
+;; Returns authentication update value to be XOR'ed with current authentication tag
+;;
+;; WIN64
+;; RCX - KS (key stream pointer)
+;; RDX - DATA (data pointer)
+;; R8 - N_BITS (number data bits to process)
+;; LIN64
+;; RDI - KS (key stream pointer)
+;; RSI - DATA (data pointer)
+;; RDX - N_BITS (number data bits to process)
+;;
+align 64
+MKGLOBAL(asm_Eia3RemainderAVX,function,internal)
+asm_Eia3RemainderAVX:
+
+%ifdef LINUX
+ %define KS rdi
+ %define DATA rsi
+ %define N_BITS rdx
+%else
+ %define KS rcx
+ %define DATA rdx
+ %define N_BITS r8
+%endif
+ FUNC_SAVE
+
+ vmovdqa xmm5, [bit_reverse_table_l]
+ vmovdqa xmm6, [bit_reverse_table_h]
+ vmovdqa xmm7, [bit_reverse_and_table]
+ vmovdqa xmm10, [data_mask_64bits]
+ vpxor xmm9, xmm9
+
+%rep 3
+ cmp N_BITS, 128
+ jb Eia3RoundsAVX_dq_end
+
+ ;; read 16 bytes and reverse bits
+ vmovdqu xmm0, [DATA]
+ vmovdqa xmm1, xmm0
+ vpand xmm1, xmm7
+
+ vmovdqa xmm2, xmm7
+ vpandn xmm2, xmm0
+ vpsrld xmm2, 4
+
+ vmovdqa xmm8, xmm6 ; bit reverse low nibbles (use high table)
+ vpshufb xmm8, xmm1
+
+ vmovdqa xmm4, xmm5 ; bit reverse high nibbles (use low table)
+ vpshufb xmm4, xmm2
+
+ vpor xmm8, xmm4
+ ; xmm8 - bit reversed data bytes
+
+ ;; ZUC authentication part
+ ;; - 4x32 data bits
+ ;; - set up KS
+ vmovdqu xmm3, [KS + (0*4)]
+ vmovdqu xmm4, [KS + (2*4)]
+ vpshufd xmm0, xmm3, 0x61
+ vpshufd xmm1, xmm4, 0x61
+
+ ;; - set up DATA
+ vmovdqa xmm2, xmm8
+ vpand xmm2, xmm10
+ vpshufd xmm3, xmm2, 0xdc
+ vmovdqa xmm4, xmm3
+
+ vpsrldq xmm8, 8
+ vpshufd xmm13, xmm8, 0xdc
+ vmovdqa xmm14, xmm13
+
+ ;; - clmul
+ ;; - xor the results from 4 32-bit words together
+ vpclmulqdq xmm3, xmm0, 0x00
+ vpclmulqdq xmm4, xmm0, 0x11
+ vpclmulqdq xmm13, xmm1, 0x00
+ vpclmulqdq xmm14, xmm1, 0x11
+
+ vpxor xmm3, xmm4
+ vpxor xmm13, xmm14
+ vpxor xmm9, xmm3
+ vpxor xmm9, xmm13
+ lea DATA, [DATA + 16]
+ lea KS, [KS + 16]
+ sub N_BITS, 128
+%endrep
+Eia3RoundsAVX_dq_end:
+
+%rep 3
+ cmp N_BITS, 32
+ jb Eia3RoundsAVX_dw_end
+
+ ;; swap dwords in KS
+ vmovq xmm1, [KS]
+ vpshufd xmm4, xmm1, 0xf1
+
+ ;; bit-reverse 4 bytes of data
+ vmovdqa xmm2, xmm7
+ vmovd xmm0, [DATA]
+ vmovdqa xmm1, xmm0
+ vpand xmm1, xmm2
+
+ vpandn xmm2, xmm0
+ vpsrld xmm2, 4
+
+ vmovdqa xmm0, xmm6 ; bit reverse low nibbles (use high table)
+ vpshufb xmm0, xmm1
+
+ vmovdqa xmm3, xmm5 ; bit reverse high nibbles (use low table)
+ vpshufb xmm3, xmm2
+
+ vpor xmm0, xmm3
+
+ ;; rol & xor
+ vpclmulqdq xmm0, xmm4, 0
+ vpxor xmm9, xmm0
+
+ lea DATA, [DATA + 4]
+ lea KS, [KS + 4]
+ sub N_BITS, 32
+%endrep
+
+Eia3RoundsAVX_dw_end:
+ vmovq rax, xmm9
+ shr rax, 32
+
+ or N_BITS, N_BITS
+ jz Eia3RoundsAVX_byte_loop_end
+
+ ;; get 64-bit key stream for the last data bits (less than 32)
+ mov KS, [KS]
+
+ ;; process remaining data bytes and bits
+Eia3RoundsAVX_byte_loop:
+ or N_BITS, N_BITS
+ jz Eia3RoundsAVX_byte_loop_end
+
+ cmp N_BITS, 8
+ jb Eia3RoundsAVX_byte_partial
+
+ movzx r11, byte [DATA]
+ sub N_BITS, 8
+ jmp Eia3RoundsAVX_byte_read
+
+Eia3RoundsAVX_byte_partial:
+ ;; process remaining bits (up to 7)
+ lea r11, [bit_mask_table]
+ movzx r10, byte [r11 + N_BITS]
+ movzx r11, byte [DATA]
+ and r11, r10
+ xor N_BITS, N_BITS
+Eia3RoundsAVX_byte_read:
+
+%assign DATATEST 0x80
+%rep 8
+ xor r10, r10
+ test r11, DATATEST
+ cmovne r10, KS
+ xor rax, r10
+ rol KS, 1
+%assign DATATEST (DATATEST >> 1)
+%endrep ; byte boundary
+ lea DATA, [DATA + 1]
+ jmp Eia3RoundsAVX_byte_loop
+
+Eia3RoundsAVX_byte_loop_end:
+
+ ;; eax - holds the return value at this stage
+ FUNC_RESTORE
+
+ ret
+
+;;
+;;extern uint32_t asm_Eia3Round64BAVX(uint32_t T, const void *KS, const void *DATA)
+;;
+;; Updates authentication tag T based on keystream KS and DATA.
+;; - it processes 64 bytes of DATA
+;; - reads data in 16 byte chunks and bit reverses them
+;; - reads and re-arranges KS
+;; - employs clmul for the XOR & ROL part
+;; - copies top 64 butes of KS to bottom (for the next round)
+;;
+;; WIN64
+;; RCX - T
+;; RDX - KS pointer to key stream (2 x 64 bytes)
+;;; R8 - DATA pointer to data
+;; LIN64
+;; RDI - T
+;; RSI - KS pointer to key stream (2 x 64 bytes)
+;; RDX - DATA pointer to data
+;;
+align 64
+MKGLOBAL(asm_Eia3Round64BAVX,function,internal)
+asm_Eia3Round64BAVX:
+
+%ifdef LINUX
+ %define T edi
+ %define KS rsi
+ %define DATA rdx
+%else
+ %define T ecx
+ %define KS rdx
+ %define DATA r8
+%endif
+
+ FUNC_SAVE
+
+ vmovdqa xmm5, [bit_reverse_table_l]
+ vmovdqa xmm6, [bit_reverse_table_h]
+ vmovdqa xmm7, [bit_reverse_and_table]
+ vmovdqa xmm10, [data_mask_64bits]
+
+ vpxor xmm9, xmm9
+%assign I 0
+%rep 4
+ ;; read 16 bytes and reverse bits
+ vmovdqu xmm0, [DATA + 16*I]
+ vpand xmm1, xmm0, xmm7
+
+ vpandn xmm2, xmm7, xmm0
+ vpsrld xmm2, 4
+
+ vpshufb xmm8, xmm6, xmm1 ; bit reverse low nibbles (use high table)
+ vpshufb xmm4, xmm5, xmm2 ; bit reverse high nibbles (use low table)
+
+ vpor xmm8, xmm4
+ ; xmm8 - bit reversed data bytes
+
+ ;; ZUC authentication part
+ ;; - 4x32 data bits
+ ;; - set up KS
+%if I != 0
+ vmovdqa xmm11, xmm12
+ vmovdqu xmm12, [KS + (I*16) + (4*4)]
+%else
+ vmovdqu xmm11, [KS + (I*16) + (0*4)]
+ vmovdqu xmm12, [KS + (I*16) + (4*4)]
+%endif
+ vpalignr xmm13, xmm12, xmm11, 8
+ vpshufd xmm2, xmm11, 0x61
+ vpshufd xmm3, xmm13, 0x61
+
+ ;; - set up DATA
+ vpand xmm13, xmm10, xmm8
+ vpshufd xmm0, xmm13, 0xdc
+
+ vpsrldq xmm8, 8
+ vpshufd xmm1, xmm8, 0xdc
+
+ ;; - clmul
+ ;; - xor the results from 4 32-bit words together
+%if I != 0
+ vpclmulqdq xmm13, xmm0, xmm2, 0x00
+ vpclmulqdq xmm14, xmm0, xmm2, 0x11
+ vpclmulqdq xmm15, xmm1, xmm3, 0x00
+ vpclmulqdq xmm8, xmm1, xmm3, 0x11
+
+ vpxor xmm13, xmm14
+ vpxor xmm15, xmm8
+ vpxor xmm9, xmm13
+ vpxor xmm9, xmm15
+%else
+ vpclmulqdq xmm9, xmm0, xmm2, 0x00
+ vpclmulqdq xmm13, xmm0, xmm2, 0x11
+ vpclmulqdq xmm14, xmm1, xmm3, 0x00
+ vpclmulqdq xmm15, xmm1, xmm3, 0x11
+
+ vpxor xmm14, xmm15
+ vpxor xmm9, xmm13
+ vpxor xmm9, xmm14
+%endif
+
+
+%assign I (I + 1)
+%endrep
+
+ ;; - update T
+ vmovq rax, xmm9
+ shr rax, 32
+ xor eax, T
+
+ FUNC_RESTORE
+
+ ret
+
+
+;----------------------------------------------------------------------------------------
+;----------------------------------------------------------------------------------------
+
+%ifdef LINUX
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif