diff options
Diffstat (limited to 'src/spdk/intel-ipsec-mb/avx/zuc_avx.asm')
-rwxr-xr-x | src/spdk/intel-ipsec-mb/avx/zuc_avx.asm | 1146 |
1 files changed, 1146 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm b/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm new file mode 100755 index 000000000..e7c6bad8a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/avx/zuc_avx.asm @@ -0,0 +1,1146 @@ +;; +;; Copyright (c) 2009-2019, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.asm" +%include "include/reg_sizes.asm" + +extern lookup_8bit_avx + +section .data +default rel +align 64 +S0: +db 0x3e,0x72,0x5b,0x47,0xca,0xe0,0x00,0x33,0x04,0xd1,0x54,0x98,0x09,0xb9,0x6d,0xcb +db 0x7b,0x1b,0xf9,0x32,0xaf,0x9d,0x6a,0xa5,0xb8,0x2d,0xfc,0x1d,0x08,0x53,0x03,0x90 +db 0x4d,0x4e,0x84,0x99,0xe4,0xce,0xd9,0x91,0xdd,0xb6,0x85,0x48,0x8b,0x29,0x6e,0xac +db 0xcd,0xc1,0xf8,0x1e,0x73,0x43,0x69,0xc6,0xb5,0xbd,0xfd,0x39,0x63,0x20,0xd4,0x38 +db 0x76,0x7d,0xb2,0xa7,0xcf,0xed,0x57,0xc5,0xf3,0x2c,0xbb,0x14,0x21,0x06,0x55,0x9b +db 0xe3,0xef,0x5e,0x31,0x4f,0x7f,0x5a,0xa4,0x0d,0x82,0x51,0x49,0x5f,0xba,0x58,0x1c +db 0x4a,0x16,0xd5,0x17,0xa8,0x92,0x24,0x1f,0x8c,0xff,0xd8,0xae,0x2e,0x01,0xd3,0xad +db 0x3b,0x4b,0xda,0x46,0xeb,0xc9,0xde,0x9a,0x8f,0x87,0xd7,0x3a,0x80,0x6f,0x2f,0xc8 +db 0xb1,0xb4,0x37,0xf7,0x0a,0x22,0x13,0x28,0x7c,0xcc,0x3c,0x89,0xc7,0xc3,0x96,0x56 +db 0x07,0xbf,0x7e,0xf0,0x0b,0x2b,0x97,0x52,0x35,0x41,0x79,0x61,0xa6,0x4c,0x10,0xfe +db 0xbc,0x26,0x95,0x88,0x8a,0xb0,0xa3,0xfb,0xc0,0x18,0x94,0xf2,0xe1,0xe5,0xe9,0x5d +db 0xd0,0xdc,0x11,0x66,0x64,0x5c,0xec,0x59,0x42,0x75,0x12,0xf5,0x74,0x9c,0xaa,0x23 +db 0x0e,0x86,0xab,0xbe,0x2a,0x02,0xe7,0x67,0xe6,0x44,0xa2,0x6c,0xc2,0x93,0x9f,0xf1 +db 0xf6,0xfa,0x36,0xd2,0x50,0x68,0x9e,0x62,0x71,0x15,0x3d,0xd6,0x40,0xc4,0xe2,0x0f +db 0x8e,0x83,0x77,0x6b,0x25,0x05,0x3f,0x0c,0x30,0xea,0x70,0xb7,0xa1,0xe8,0xa9,0x65 +db 0x8d,0x27,0x1a,0xdb,0x81,0xb3,0xa0,0xf4,0x45,0x7a,0x19,0xdf,0xee,0x78,0x34,0x60 + +S1: +db 0x55,0xc2,0x63,0x71,0x3b,0xc8,0x47,0x86,0x9f,0x3c,0xda,0x5b,0x29,0xaa,0xfd,0x77 +db 0x8c,0xc5,0x94,0x0c,0xa6,0x1a,0x13,0x00,0xe3,0xa8,0x16,0x72,0x40,0xf9,0xf8,0x42 +db 0x44,0x26,0x68,0x96,0x81,0xd9,0x45,0x3e,0x10,0x76,0xc6,0xa7,0x8b,0x39,0x43,0xe1 +db 0x3a,0xb5,0x56,0x2a,0xc0,0x6d,0xb3,0x05,0x22,0x66,0xbf,0xdc,0x0b,0xfa,0x62,0x48 +db 0xdd,0x20,0x11,0x06,0x36,0xc9,0xc1,0xcf,0xf6,0x27,0x52,0xbb,0x69,0xf5,0xd4,0x87 +db 0x7f,0x84,0x4c,0xd2,0x9c,0x57,0xa4,0xbc,0x4f,0x9a,0xdf,0xfe,0xd6,0x8d,0x7a,0xeb +db 0x2b,0x53,0xd8,0x5c,0xa1,0x14,0x17,0xfb,0x23,0xd5,0x7d,0x30,0x67,0x73,0x08,0x09 +db 0xee,0xb7,0x70,0x3f,0x61,0xb2,0x19,0x8e,0x4e,0xe5,0x4b,0x93,0x8f,0x5d,0xdb,0xa9 +db 0xad,0xf1,0xae,0x2e,0xcb,0x0d,0xfc,0xf4,0x2d,0x46,0x6e,0x1d,0x97,0xe8,0xd1,0xe9 +db 0x4d,0x37,0xa5,0x75,0x5e,0x83,0x9e,0xab,0x82,0x9d,0xb9,0x1c,0xe0,0xcd,0x49,0x89 +db 0x01,0xb6,0xbd,0x58,0x24,0xa2,0x5f,0x38,0x78,0x99,0x15,0x90,0x50,0xb8,0x95,0xe4 +db 0xd0,0x91,0xc7,0xce,0xed,0x0f,0xb4,0x6f,0xa0,0xcc,0xf0,0x02,0x4a,0x79,0xc3,0xde +db 0xa3,0xef,0xea,0x51,0xe6,0x6b,0x18,0xec,0x1b,0x2c,0x80,0xf7,0x74,0xe7,0xff,0x21 +db 0x5a,0x6a,0x54,0x1e,0x41,0x31,0x92,0x35,0xc4,0x33,0x07,0x0a,0xba,0x7e,0x0e,0x34 +db 0x88,0xb1,0x98,0x7c,0xf3,0x3d,0x60,0x6c,0x7b,0xca,0xd3,0x1f,0x32,0x65,0x04,0x28 +db 0x64,0xbe,0x85,0x9b,0x2f,0x59,0x8a,0xd7,0xb0,0x25,0xac,0xaf,0x12,0x03,0xe2,0xf2 + +EK_d: +dw 0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF, +dw 0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC + +mask31: +dd 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF + +align 16 +bit_reverse_table_l: +db 0x00, 0x08, 0x04, 0x0c, 0x02, 0x0a, 0x06, 0x0e, 0x01, 0x09, 0x05, 0x0d, 0x03, 0x0b, 0x07, 0x0f + +align 16 +bit_reverse_table_h: +db 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0 + +align 16 +bit_reverse_and_table: +db 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f + +align 16 +data_mask_64bits: +dd 0xffffffff, 0xffffffff, 0x00000000, 0x00000000 + +bit_mask_table: +db 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe + + +section .text +align 64 + +%define OFFSET_FR1 (16*4) +%define OFFSET_FR2 (17*4) +%define OFFSET_BRC_X0 (18*4) +%define OFFSET_BRC_X1 (19*4) +%define OFFSET_BRC_X2 (20*4) +%define OFFSET_BRC_X3 (21*4) + +%define MASK31 xmm12 + +%define OFS_R1 (16*(4*4)) +%define OFS_R2 (OFS_R1 + (4*4)) +%define OFS_X0 (OFS_R2 + (4*4)) +%define OFS_X1 (OFS_X0 + (4*4)) +%define OFS_X2 (OFS_X1 + (4*4)) +%define OFS_X3 (OFS_X2 + (4*4)) + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET XMM_STORAGE + +%macro FUNC_SAVE 0 + push r12 + push r13 + push r14 + push r15 +%ifidn __OUTPUT_FORMAT__, win64 + push rdi + push rsi +%endif + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + vmovdqu [rsp + 0*16],xmm6 + vmovdqu [rsp + 1*16],xmm7 + vmovdqu [rsp + 2*16],xmm8 + vmovdqu [rsp + 3*16],xmm9 + vmovdqu [rsp + 4*16],xmm10 + vmovdqu [rsp + 5*16],xmm11 + vmovdqu [rsp + 6*16],xmm12 + vmovdqu [rsp + 7*16],xmm13 + vmovdqu [rsp + 8*16],xmm14 + vmovdqu [rsp + 9*16],xmm15 +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + 9*16] + vmovdqu xmm14, [rsp + 8*16] + vmovdqu xmm13, [rsp + 7*16] + vmovdqu xmm12, [rsp + 6*16] + vmovdqu xmm11, [rsp + 5*16] + vmovdqu xmm10, [rsp + 4*16] + vmovdqu xmm9, [rsp + 3*16] + vmovdqu xmm8, [rsp + 2*16] + vmovdqu xmm7, [rsp + 1*16] + vmovdqu xmm6, [rsp + 0*16] +%endif + mov rsp, r14 +%ifidn __OUTPUT_FORMAT__, win64 + pop rsi + pop rdi +%endif + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;; +;; make_u31() +;; +%macro make_u31 4 + +%define %%Rt %1 +%define %%Ke %2 +%define %%Ek %3 +%define %%Iv %4 + xor %%Rt, %%Rt + shrd %%Rt, %%Iv, 8 + shrd %%Rt, %%Ek, 15 + shrd %%Rt, %%Ke, 9 +%endmacro + + +; +; bits_reorg4() +; +; params +; %1 - round number +; rax - LFSR pointer +; uses +; +; return +; +%macro bits_reorg4 1 + ; + ; xmm15 = LFSR_S15 + ; xmm14 = LFSR_S14 + ; xmm11 = LFSR_S11 + ; xmm9 = LFSR_S9 + ; xmm7 = LFSR_S7 + ; xmm5 = LFSR_S5 + ; xmm2 = LFSR_S2 + ; xmm0 = LFSR_S0 + ; + vmovdqa xmm15, [rax + ((15 + %1) % 16)*16] + vmovdqa xmm14, [rax + ((14 + %1) % 16)*16] + vmovdqa xmm11, [rax + ((11 + %1) % 16)*16] + vmovdqa xmm9, [rax + (( 9 + %1) % 16)*16] + vmovdqa xmm7, [rax + (( 7 + %1) % 16)*16] + vmovdqa xmm5, [rax + (( 5 + %1) % 16)*16] + vmovdqa xmm2, [rax + (( 2 + %1) % 16)*16] + vmovdqa xmm0, [rax + (( 0 + %1) % 16)*16] + + vpxor xmm1, xmm1 + vpslld xmm15, 1 + vpblendw xmm3, xmm14, xmm1, 0xAA + vpblendw xmm15, xmm3, xmm15, 0xAA + + vmovdqa [rax + OFS_X0], xmm15 ; BRC_X0 + vpslld xmm11, 16 + vpsrld xmm9, 15 + vpor xmm11, xmm9 + vmovdqa [rax + OFS_X1], xmm11 ; BRC_X1 + vpslld xmm7, 16 + vpsrld xmm5, 15 + vpor xmm7, xmm5 + vmovdqa [rax + OFS_X2], xmm7 ; BRC_X2 + vpslld xmm2, 16 + vpsrld xmm0, 15 + vpor xmm2, xmm0 + vmovdqa [rax + OFS_X3], xmm2 ; BRC_X3 +%endmacro + +%macro lookup_single_sbox 2 +%define %%table %1 ; [in] Pointer to table to look up +%define %%idx_val %2 ; [in/out] Index to look up and returned value (rcx, rdx, r8, r9) + +%ifdef SAFE_LOOKUP + ;; Save all registers used in lookup_8bit (xmm0-5, r9,r10) + ;; and registers for param passing and return (4 regs, OS dependent) + ;; (6*16 + 6*8 = 144 bytes) + sub rsp, 144 + + vmovdqu [rsp], xmm0 + vmovdqu [rsp + 16], xmm1 + vmovdqu [rsp + 32], xmm2 + vmovdqu [rsp + 48], xmm3 + vmovdqu [rsp + 64], xmm4 + vmovdqu [rsp + 80], xmm5 + mov [rsp + 96], r9 + mov [rsp + 104], r10 + +%ifdef LINUX + mov [rsp + 112], rdi + mov [rsp + 120], rsi + mov [rsp + 128], rdx + mov rdi, %%table + mov rsi, %%idx_val + mov rdx, 256 +%else +%ifnidni %%idx_val, rcx + mov [rsp + 112], rcx +%endif +%ifnidni %%idx_val, rdx + mov [rsp + 120], rdx +%endif +%ifnidni %%idx_val, r8 + mov [rsp + 128], r8 +%endif + + mov rdx, %%idx_val + mov rcx, %%table + mov r8, 256 +%endif + mov [rsp + 136], rax + + call lookup_8bit_avx + + ;; Restore all registers + vmovdqu xmm0, [rsp] + vmovdqu xmm1, [rsp + 16] + vmovdqu xmm2, [rsp + 32] + vmovdqu xmm3, [rsp + 48] + vmovdqu xmm4, [rsp + 64] + vmovdqu xmm5, [rsp + 80] + mov r9, [rsp + 96] + mov r10, [rsp + 104] + +%ifdef LINUX + mov rdi, [rsp + 112] + mov rsi, [rsp + 120] + mov rdx, [rsp + 128] +%else +%ifnidni %%idx_val, rcx + mov rcx, [rsp + 112] +%endif +%ifnidni %%idx_val, rdx + mov rdx, [rsp + 120] +%endif +%ifnidni %%idx_val, rcx + mov r8, [rsp + 128] +%endif +%endif + + ;; Move returned value from lookup function, before restoring rax + mov DWORD(%%idx_val), eax + mov rax, [rsp + 136] + + add rsp, 144 + +%else ;; SAFE_LOOKUP + + movzx DWORD(%%idx_val), BYTE [%%table + %%idx_val] + +%endif ;; SAFE_LOOKUP +%endmacro + +; +; sbox_lkup() +; +; params +; %1 R1/R2 table offset +; %2 R1/R2 entry offset +; %3 xmm reg name +; uses +; rcx,rdx,r8,r9,r10,rsi +; return +; +%macro sbox_lkup 3 + vpextrb rcx, %3, (0 + (%2 * 4)) + lookup_single_sbox rsi, rcx + vpextrb rdx, %3, (1 + (%2 * 4)) + lookup_single_sbox rdi, rdx + + xor r10, r10 + vpextrb r8, %3, (2 + (%2 * 4)) + lookup_single_sbox rsi, r8 + vpextrb r9, %3, (3 + (%2 * 4)) + lookup_single_sbox rdi, r9 + + shrd r10d, ecx, 8 + shrd r10d, edx, 8 + shrd r10d, r8d, 8 + shrd r10d, r9d, 8 + mov [rax + %1 + (%2 * 4)], r10d +%endmacro + + +; +; rot_mod32() +; +; uses xmm7 +; +%macro rot_mod32 3 + vpslld %1, %2, %3 + vpsrld xmm7, %2, (32 - %3) + + vpor %1, xmm7 +%endmacro + + +; +; nonlin_fun4() +; +; params +; %1 == 1, then calculate W +; uses +; +; return +; xmm0 = W value, updates F_R1[] / F_R2[] +; +%macro nonlin_fun4 1 + +%if (%1 == 1) + vmovdqa xmm0, [rax + OFS_X0] + vpxor xmm0, [rax + OFS_R1] + vpaddd xmm0, [rax + OFS_R2] ; W = (BRC_X0 ^ F_R1) + F_R2 +%endif + ; + vmovdqa xmm1, [rax + OFS_R1] + vmovdqa xmm2, [rax + OFS_R2] + vpaddd xmm1, [rax + OFS_X1] ; W1 = F_R1 + BRC_X1 + vpxor xmm2, [rax + OFS_X2] ; W2 = F_R2 ^ BRC_X2 + ; + + vpslld xmm3, xmm1, 16 + vpsrld xmm4, xmm1, 16 + vpslld xmm5, xmm2, 16 + vpsrld xmm6, xmm2, 16 + vpor xmm1, xmm3, xmm6 + vpor xmm2, xmm4, xmm5 + + ; + rot_mod32 xmm3, xmm1, 2 + rot_mod32 xmm4, xmm1, 10 + rot_mod32 xmm5, xmm1, 18 + rot_mod32 xmm6, xmm1, 24 + vpxor xmm1, xmm3 + vpxor xmm1, xmm4 + vpxor xmm1, xmm5 + vpxor xmm1, xmm6 ; XMM1 = U = L1(P) + + sbox_lkup OFS_R1, 0, xmm1 ; F_R1[0] + sbox_lkup OFS_R1, 1, xmm1 ; F_R1[1] + sbox_lkup OFS_R1, 2, xmm1 ; F_R1[2] + sbox_lkup OFS_R1, 3, xmm1 ; F_R1[3] + ; + rot_mod32 xmm3, xmm2, 8 + rot_mod32 xmm4, xmm2, 14 + rot_mod32 xmm5, xmm2, 22 + rot_mod32 xmm6, xmm2, 30 + vpxor xmm2, xmm3 + vpxor xmm2, xmm4 + vpxor xmm2, xmm5 + vpxor xmm2, xmm6 ; XMM2 = V = L2(Q) + ; + + sbox_lkup OFS_R2, 0, xmm2 ; F_R2[0] + sbox_lkup OFS_R2, 1, xmm2 ; F_R2[1] + sbox_lkup OFS_R2, 2, xmm2 ; F_R2[2] + sbox_lkup OFS_R2, 3, xmm2 ; F_R2[3] +%endmacro + + +; +; store_kstr4() +; +; params +; +; uses +; xmm0 as input +; return +; +%macro store_kstr4 0 + vpxor xmm0, [rax + OFS_X3] + vpextrd r15d, xmm0, 3 + pop r9 ; *pKeyStr4 + vpextrd r14d, xmm0, 2 + pop r8 ; *pKeyStr3 + vpextrd r13d, xmm0, 1 + pop rdx ; *pKeyStr2 + vpextrd r12d, xmm0, 0 + pop rcx ; *pKeyStr1 + mov [r9], r15d + mov [r8], r14d + mov [rdx], r13d + mov [rcx], r12d + add rcx, 4 + add rdx, 4 + add r8, 4 + add r9, 4 + push rcx + push rdx + push r8 + push r9 +%endmacro + + +; +; add_mod31() +; add two 32-bit args and reduce mod (2^31-1) +; params +; %1 - arg1/res +; %2 - arg2 +; uses +; xmm2 +; return +; %1 +%macro add_mod31 2 + vpaddd %1, %2 + vpsrld xmm2, %1, 31 + vpand %1, MASK31 + vpaddd %1, xmm2 +%endmacro + + +; +; rot_mod31() +; rotate (mult by pow of 2) 32-bit arg and reduce mod (2^31-1) +; params +; %1 - arg +; %2 - # of bits +; uses +; xmm2 +; return +; %1 +%macro rot_mod31 2 + + vpslld xmm2, %1, %2 + vpsrld %1, %1, (31 - %2) + + vpor %1, xmm2 + vpand %1, MASK31 +%endmacro + + +; +; lfsr_updt4() +; +; params +; %1 - round number +; uses +; xmm0 as input (ZERO or W) +; return +; +%macro lfsr_updt4 1 + ; + ; xmm1 = LFSR_S0 + ; xmm4 = LFSR_S4 + ; xmm10 = LFSR_S10 + ; xmm13 = LFSR_S13 + ; xmm15 = LFSR_S15 + ; + vpxor xmm3, xmm3 + vmovdqa xmm1, [rax + (( 0 + %1) % 16)*16] + vmovdqa xmm4, [rax + (( 4 + %1) % 16)*16] + vmovdqa xmm10, [rax + ((10 + %1) % 16)*16] + vmovdqa xmm13, [rax + ((13 + %1) % 16)*16] + vmovdqa xmm15, [rax + ((15 + %1) % 16)*16] + + ; Calculate LFSR feedback + add_mod31 xmm0, xmm1 + rot_mod31 xmm1, 8 + add_mod31 xmm0, xmm1 + rot_mod31 xmm4, 20 + add_mod31 xmm0, xmm4 + rot_mod31 xmm10, 21 + add_mod31 xmm0, xmm10 + rot_mod31 xmm13, 17 + add_mod31 xmm0, xmm13 + rot_mod31 xmm15, 15 + add_mod31 xmm0, xmm15 + + + + vmovdqa [rax + (( 0 + %1) % 16)*16], xmm0 + + ; LFSR_S16 = (LFSR_S15++) = eax +%endmacro + + +; +; key_expand_4() +; +%macro key_expand_4 2 + movzx r8d, byte [rdi + (%1 + 0)] + movzx r9d, word [rbx + ((%1 + 0)*2)] + movzx r10d, byte [rsi + (%1 + 0)] + make_u31 r11d, r8d, r9d, r10d + mov [rax + (((%1 + 0)*16)+(%2*4))], r11d + + movzx r12d, byte [rdi + (%1 + 1)] + movzx r13d, word [rbx + ((%1 + 1)*2)] + movzx r14d, byte [rsi + (%1 + 1)] + make_u31 r15d, r12d, r13d, r14d + mov [rax + (((%1 + 1)*16)+(%2*4))], r15d +%endmacro + + +MKGLOBAL(asm_ZucInitialization_4_avx,function,internal) +asm_ZucInitialization_4_avx: + +%ifdef LINUX + %define pKe rdi + %define pIv rsi + %define pState rdx +%else + %define pKe rcx + %define pIv rdx + %define pState r8 +%endif + + ; Save non-volatile registers + push rbx + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + push rdx + + lea rax, [pState] ; load pointer to LFSR + push pState ; Save LFSR Pointer to stack + + ; setup the key pointer for first buffer key expand + mov rbx, [pKe] ; load the pointer to the array of keys into rbx + + push pKe ; save rdi (key pointer) to the stack + lea rdi, [rbx] ; load the pointer to the first key into rdi + + + ; setup the IV pointer for first buffer key expand + mov rcx, [pIv] ; load the pointer to the array of IV's + push pIv ; save the IV pointer to the stack + lea rsi, [rcx] ; load the first IV pointer + + lea rbx, [EK_d] ; load D variables + + ; Expand key packet 1 + key_expand_4 0, 0 + key_expand_4 2, 0 + key_expand_4 4, 0 + key_expand_4 6, 0 + key_expand_4 8, 0 + key_expand_4 10, 0 + key_expand_4 12, 0 + key_expand_4 14, 0 + + + ;second packet key expand here - reset pointers + pop rdx ; get IV array pointer from Stack + mov rcx, [rdx+8] ; load offset to IV 2 in array + lea rsi, [rcx] ; load pointer to IV2 + + pop rbx ; get Key array pointer from Stack + mov rcx, [rbx+8] ; load offset to key 2 in array + lea rdi, [rcx] ; load pointer to Key 2 + + push rbx ; save Key pointer + push rdx ; save IV pointer + + lea rbx, [EK_d] + + ; Expand key packet 2 + key_expand_4 0, 1 + key_expand_4 2, 1 + key_expand_4 4, 1 + key_expand_4 6, 1 + key_expand_4 8, 1 + key_expand_4 10, 1 + key_expand_4 12, 1 + key_expand_4 14, 1 + + + + ;Third packet key expand here - reset pointers + pop rdx ; get IV array pointer from Stack + mov rcx, [rdx+16] ; load offset to IV 3 in array + lea rsi, [rcx] ; load pointer to IV3 + + pop rbx ; get Key array pointer from Stack + mov rcx, [rbx+16] ; load offset to key 3 in array + lea rdi, [rcx] ; load pointer to Key 3 + + push rbx ; save Key pointer + push rdx ; save IV pointer + lea rbx, [EK_d] + ; Expand key packet 3 + key_expand_4 0, 2 + key_expand_4 2, 2 + key_expand_4 4, 2 + key_expand_4 6, 2 + key_expand_4 8, 2 + key_expand_4 10, 2 + key_expand_4 12, 2 + key_expand_4 14, 2 + + + + ;fourth packet key expand here - reset pointers + pop rdx ; get IV array pointer from Stack + mov rcx, [rdx+24] ; load offset to IV 4 in array + lea rsi, [rcx] ; load pointer to IV4 + + pop rbx ; get Key array pointer from Stack + mov rcx, [rbx+24] ; load offset to key 2 in array + lea rdi, [rcx] ; load pointer to Key 2 + lea rbx, [EK_d] + ; Expand key packet 4 + key_expand_4 0, 3 + key_expand_4 2, 3 + key_expand_4 4, 3 + key_expand_4 6, 3 + key_expand_4 8, 3 + key_expand_4 10, 3 + key_expand_4 12, 3 + key_expand_4 14, 3 + + ; Set R1 and R2 to zero + ;xor r10, r10 + ;xor r11, r11 + + + + ; Load read-only registers + lea rdi, [S0] ; used by sbox_lkup() macro + lea rsi, [S1] + vmovdqa xmm12, [mask31] + + ; Shift LFSR 32-times, update state variables +%assign N 0 +%rep 32 + pop rdx + lea rax, [rdx] + push rdx + + bits_reorg4 N + nonlin_fun4 1 + vpsrld xmm0,1 ; Shift out LSB of W + + pop rdx + lea rax, [rdx] + push rdx + + lfsr_updt4 N ; W (xmm0) used in LFSR update - not set to zero +%assign N N+1 +%endrep + + ; And once more, initial round from keygen phase = 33 times + pop rdx + lea rax, [rdx] + push rdx + + bits_reorg4 0 + nonlin_fun4 0 + + pop rdx + lea rax, [rdx] + + vpxor xmm0, xmm0 + lfsr_updt4 0 + + + + ; Restore non-volatile registers + pop rdx + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbx + + ret +; +; +; +;; +;; void asm_ZucGenKeystream64B_4_avx(state4_t *pSta, u32* pKeyStr1, u32* pKeyStr2, u32* pKeyStr3, u32* pKeyStr4); +;; +;; WIN64 +;; RCX - pSta +;; RDX - pKeyStr1 +;; R8 - pKeyStr2 +;; R9 - pKeyStr3 +;; Stack - pKeyStr4 +;; +;; LIN64 +;; RDI - pSta +;; RSI - pKeyStr1 +;; RDX - pKeyStr2 +;; RCX - pKeyStr3 +;; R8 - pKeyStr4 +;; +MKGLOBAL(asm_ZucGenKeystream64B_4_avx,function,internal) +asm_ZucGenKeystream64B_4_avx: + +%ifdef LINUX + %define pState rdi + %define pKS1 rsi + %define pKS2 rdx + %define pKS3 rcx + %define pKS4 r8 +%else + %define pState rcx + %define pKS1 rdx + %define pKS2 r8 + %define pKS3 r9 + %define pKS4 rax +%endif + +%ifndef LINUX + mov rax, [rsp + 8*5] ; 5th parameter from stack +%endif + + ; Save non-volatile registers + push rbx + push r12 + push r13 + push r14 + push r15 + +%ifndef LINUX + push rdi + push rsi +%endif + ; Store 4 keystream pointers on the stack + + push pKS1 + push pKS2 + push pKS3 + push pKS4 + + + ; Load state pointer in RAX + mov rax, pState + + + ; Load read-only registers + lea rdi, [S0] ; used by sbox_lkup() macro + lea rsi, [S1] + vmovdqa xmm12, [mask31] + + ; Generate 64B of keystream in 16 rounds +%assign N 1 +%rep 16 + bits_reorg4 N + nonlin_fun4 1 + store_kstr4 + vpxor xmm0, xmm0 + lfsr_updt4 N +%assign N N+1 +%endrep + + ; Take keystream pointers off (#push = #pops) + pop rax + pop rax + pop rax + pop rax + +%ifndef LINUX + pop rsi + pop rdi +%endif + + ; Restore non-volatile registers + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret + + +;; +;; extern uint32_t asm_Eia3RemainderAVX(const void *ks, const void *data, uint64_t n_bits) +;; +;; Returns authentication update value to be XOR'ed with current authentication tag +;; +;; WIN64 +;; RCX - KS (key stream pointer) +;; RDX - DATA (data pointer) +;; R8 - N_BITS (number data bits to process) +;; LIN64 +;; RDI - KS (key stream pointer) +;; RSI - DATA (data pointer) +;; RDX - N_BITS (number data bits to process) +;; +align 64 +MKGLOBAL(asm_Eia3RemainderAVX,function,internal) +asm_Eia3RemainderAVX: + +%ifdef LINUX + %define KS rdi + %define DATA rsi + %define N_BITS rdx +%else + %define KS rcx + %define DATA rdx + %define N_BITS r8 +%endif + FUNC_SAVE + + vmovdqa xmm5, [bit_reverse_table_l] + vmovdqa xmm6, [bit_reverse_table_h] + vmovdqa xmm7, [bit_reverse_and_table] + vmovdqa xmm10, [data_mask_64bits] + vpxor xmm9, xmm9 + +%rep 3 + cmp N_BITS, 128 + jb Eia3RoundsAVX_dq_end + + ;; read 16 bytes and reverse bits + vmovdqu xmm0, [DATA] + vmovdqa xmm1, xmm0 + vpand xmm1, xmm7 + + vmovdqa xmm2, xmm7 + vpandn xmm2, xmm0 + vpsrld xmm2, 4 + + vmovdqa xmm8, xmm6 ; bit reverse low nibbles (use high table) + vpshufb xmm8, xmm1 + + vmovdqa xmm4, xmm5 ; bit reverse high nibbles (use low table) + vpshufb xmm4, xmm2 + + vpor xmm8, xmm4 + ; xmm8 - bit reversed data bytes + + ;; ZUC authentication part + ;; - 4x32 data bits + ;; - set up KS + vmovdqu xmm3, [KS + (0*4)] + vmovdqu xmm4, [KS + (2*4)] + vpshufd xmm0, xmm3, 0x61 + vpshufd xmm1, xmm4, 0x61 + + ;; - set up DATA + vmovdqa xmm2, xmm8 + vpand xmm2, xmm10 + vpshufd xmm3, xmm2, 0xdc + vmovdqa xmm4, xmm3 + + vpsrldq xmm8, 8 + vpshufd xmm13, xmm8, 0xdc + vmovdqa xmm14, xmm13 + + ;; - clmul + ;; - xor the results from 4 32-bit words together + vpclmulqdq xmm3, xmm0, 0x00 + vpclmulqdq xmm4, xmm0, 0x11 + vpclmulqdq xmm13, xmm1, 0x00 + vpclmulqdq xmm14, xmm1, 0x11 + + vpxor xmm3, xmm4 + vpxor xmm13, xmm14 + vpxor xmm9, xmm3 + vpxor xmm9, xmm13 + lea DATA, [DATA + 16] + lea KS, [KS + 16] + sub N_BITS, 128 +%endrep +Eia3RoundsAVX_dq_end: + +%rep 3 + cmp N_BITS, 32 + jb Eia3RoundsAVX_dw_end + + ;; swap dwords in KS + vmovq xmm1, [KS] + vpshufd xmm4, xmm1, 0xf1 + + ;; bit-reverse 4 bytes of data + vmovdqa xmm2, xmm7 + vmovd xmm0, [DATA] + vmovdqa xmm1, xmm0 + vpand xmm1, xmm2 + + vpandn xmm2, xmm0 + vpsrld xmm2, 4 + + vmovdqa xmm0, xmm6 ; bit reverse low nibbles (use high table) + vpshufb xmm0, xmm1 + + vmovdqa xmm3, xmm5 ; bit reverse high nibbles (use low table) + vpshufb xmm3, xmm2 + + vpor xmm0, xmm3 + + ;; rol & xor + vpclmulqdq xmm0, xmm4, 0 + vpxor xmm9, xmm0 + + lea DATA, [DATA + 4] + lea KS, [KS + 4] + sub N_BITS, 32 +%endrep + +Eia3RoundsAVX_dw_end: + vmovq rax, xmm9 + shr rax, 32 + + or N_BITS, N_BITS + jz Eia3RoundsAVX_byte_loop_end + + ;; get 64-bit key stream for the last data bits (less than 32) + mov KS, [KS] + + ;; process remaining data bytes and bits +Eia3RoundsAVX_byte_loop: + or N_BITS, N_BITS + jz Eia3RoundsAVX_byte_loop_end + + cmp N_BITS, 8 + jb Eia3RoundsAVX_byte_partial + + movzx r11, byte [DATA] + sub N_BITS, 8 + jmp Eia3RoundsAVX_byte_read + +Eia3RoundsAVX_byte_partial: + ;; process remaining bits (up to 7) + lea r11, [bit_mask_table] + movzx r10, byte [r11 + N_BITS] + movzx r11, byte [DATA] + and r11, r10 + xor N_BITS, N_BITS +Eia3RoundsAVX_byte_read: + +%assign DATATEST 0x80 +%rep 8 + xor r10, r10 + test r11, DATATEST + cmovne r10, KS + xor rax, r10 + rol KS, 1 +%assign DATATEST (DATATEST >> 1) +%endrep ; byte boundary + lea DATA, [DATA + 1] + jmp Eia3RoundsAVX_byte_loop + +Eia3RoundsAVX_byte_loop_end: + + ;; eax - holds the return value at this stage + FUNC_RESTORE + + ret + +;; +;;extern uint32_t asm_Eia3Round64BAVX(uint32_t T, const void *KS, const void *DATA) +;; +;; Updates authentication tag T based on keystream KS and DATA. +;; - it processes 64 bytes of DATA +;; - reads data in 16 byte chunks and bit reverses them +;; - reads and re-arranges KS +;; - employs clmul for the XOR & ROL part +;; - copies top 64 butes of KS to bottom (for the next round) +;; +;; WIN64 +;; RCX - T +;; RDX - KS pointer to key stream (2 x 64 bytes) +;;; R8 - DATA pointer to data +;; LIN64 +;; RDI - T +;; RSI - KS pointer to key stream (2 x 64 bytes) +;; RDX - DATA pointer to data +;; +align 64 +MKGLOBAL(asm_Eia3Round64BAVX,function,internal) +asm_Eia3Round64BAVX: + +%ifdef LINUX + %define T edi + %define KS rsi + %define DATA rdx +%else + %define T ecx + %define KS rdx + %define DATA r8 +%endif + + FUNC_SAVE + + vmovdqa xmm5, [bit_reverse_table_l] + vmovdqa xmm6, [bit_reverse_table_h] + vmovdqa xmm7, [bit_reverse_and_table] + vmovdqa xmm10, [data_mask_64bits] + + vpxor xmm9, xmm9 +%assign I 0 +%rep 4 + ;; read 16 bytes and reverse bits + vmovdqu xmm0, [DATA + 16*I] + vpand xmm1, xmm0, xmm7 + + vpandn xmm2, xmm7, xmm0 + vpsrld xmm2, 4 + + vpshufb xmm8, xmm6, xmm1 ; bit reverse low nibbles (use high table) + vpshufb xmm4, xmm5, xmm2 ; bit reverse high nibbles (use low table) + + vpor xmm8, xmm4 + ; xmm8 - bit reversed data bytes + + ;; ZUC authentication part + ;; - 4x32 data bits + ;; - set up KS +%if I != 0 + vmovdqa xmm11, xmm12 + vmovdqu xmm12, [KS + (I*16) + (4*4)] +%else + vmovdqu xmm11, [KS + (I*16) + (0*4)] + vmovdqu xmm12, [KS + (I*16) + (4*4)] +%endif + vpalignr xmm13, xmm12, xmm11, 8 + vpshufd xmm2, xmm11, 0x61 + vpshufd xmm3, xmm13, 0x61 + + ;; - set up DATA + vpand xmm13, xmm10, xmm8 + vpshufd xmm0, xmm13, 0xdc + + vpsrldq xmm8, 8 + vpshufd xmm1, xmm8, 0xdc + + ;; - clmul + ;; - xor the results from 4 32-bit words together +%if I != 0 + vpclmulqdq xmm13, xmm0, xmm2, 0x00 + vpclmulqdq xmm14, xmm0, xmm2, 0x11 + vpclmulqdq xmm15, xmm1, xmm3, 0x00 + vpclmulqdq xmm8, xmm1, xmm3, 0x11 + + vpxor xmm13, xmm14 + vpxor xmm15, xmm8 + vpxor xmm9, xmm13 + vpxor xmm9, xmm15 +%else + vpclmulqdq xmm9, xmm0, xmm2, 0x00 + vpclmulqdq xmm13, xmm0, xmm2, 0x11 + vpclmulqdq xmm14, xmm1, xmm3, 0x00 + vpclmulqdq xmm15, xmm1, xmm3, 0x11 + + vpxor xmm14, xmm15 + vpxor xmm9, xmm13 + vpxor xmm9, xmm14 +%endif + + +%assign I (I + 1) +%endrep + + ;; - update T + vmovq rax, xmm9 + shr rax, 32 + xor eax, T + + FUNC_RESTORE + + ret + + +;---------------------------------------------------------------------------------------- +;---------------------------------------------------------------------------------------- + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif |