diff options
Diffstat (limited to 'src/spdk/intel-ipsec-mb/sse')
54 files changed, 16720 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm new file mode 100644 index 00000000..3226adee --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_dec_by4_sse.asm @@ -0,0 +1,528 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES cbc decrypt on 16n bytes doing AES by 4 + +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +; void aes_cbc_dec_128_sse(void *in, +; UINT128 *IV, +; UINT128 keys[11], +; void *out, +; UINT64 len_bytes); +; +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +; +%include "os.asm" + +%define MOVDQ movdqu + +%ifdef LINUX +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%else +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%endif + +%define IDX rax +%define TMP IDX +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XKEY0 xmm4 +%define XKEY2 xmm5 +%define XKEY4 xmm6 +%define XKEY6 xmm7 +%define XKEY8 xmm8 +%define XKEY10 xmm9 +%define XIV xmm10 +%define XSAVED0 xmm11 +%define XSAVED1 xmm12 +%define XSAVED2 xmm13 +%define XSAVED3 xmm14 +%define XKEY xmm15 + +%define IV_TMP XSAVED3 + +section .text + +MKGLOBAL(aes_cbc_dec_128_sse,function,internal) +aes_cbc_dec_128_sse: +%ifndef LINUX + mov LEN, [rsp + 8*5] +%endif + + mov TMP, LEN + and TMP, 3*16 + jz initial_4 + cmp TMP, 2*16 + jb initial_1 + ja initial_3 + +initial_2: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XIV, XDATA1 + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + aesdec XDATA1, [KEYS + 1*16] + + mov IDX, 2*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + aesdec XDATA1, [KEYS + 3*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + aesdec XDATA1, [KEYS + 5*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + aesdec XDATA1, [KEYS + 7*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + aesdec XDATA1, [KEYS + 9*16] + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + + cmp LEN, 2*16 + je done + jmp main_loop + + + align 16 +initial_1: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XIV, XDATA0 + + pxor XDATA0, XKEY0 ; 0. ARK + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + + mov IDX, 1*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + + aesdec XDATA0, XKEY6 ; 6. DEC + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + + aesdec XDATA0, XKEY8 ; 8. DEC + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + + aesdeclast XDATA0, XKEY10 ; 10. DEC + + pxor XDATA0, IV_TMP + + movdqu [OUT + 0*16], XDATA0 + + cmp LEN, 1*16 + je done + jmp main_loop + + +initial_3: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XIV, XDATA2 + + movdqa XKEY, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY ; 1. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 3*16] + + mov IDX, 3*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY ; 3. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 5*16] + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY ; 5. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, XKEY ; 7. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + movdqa XKEY, [KEYS + 9*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + aesdec XDATA2, XKEY8 + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY ; 9. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + aesdeclast XDATA2, XKEY10 + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + + cmp LEN, 3*16 + je done + jmp main_loop + + + align 16 +initial_4: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + movdqu XDATA3, [IN + 3*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XIV, XDATA3 + + movdqa XKEY, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY ; 1. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 3*16] + + mov IDX, 4*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY ; 3. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 5*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY ; 5. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY8, [KEYS + 8*16] + + aesdec XDATA0, XKEY ; 7. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 9*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + aesdec XDATA2, XKEY8 + aesdec XDATA3, XKEY8 + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY ; 9. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + aesdeclast XDATA2, XKEY10 + aesdeclast XDATA3, XKEY10 + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + movdqu [OUT + 3*16], XDATA3 + + cmp LEN, 4*16 + jz done + jmp main_loop + + align 16 +main_loop: + ; load cipher text + movdqu XDATA0, [IN + IDX + 0*16] + movdqu XDATA1, [IN + IDX + 1*16] + movdqu XDATA2, [IN + IDX + 2*16] + movdqu XDATA3, [IN + IDX + 3*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XSAVED3, XDATA3 + + movdqa XKEY, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + add IDX, 4*16 + + aesdec XDATA0, XKEY ; 1. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 3*16] + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + aesdec XDATA0, XKEY ; 3. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 5*16] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + aesdec XDATA0, XKEY ; 5. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + aesdec XDATA0, XKEY ; 7. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + movdqa XKEY, [KEYS + 9*16] + + aesdec XDATA0, XKEY8 ; 8. DEC + aesdec XDATA1, XKEY8 + aesdec XDATA2, XKEY8 + aesdec XDATA3, XKEY8 + + aesdec XDATA0, XKEY ; 9. DEC + aesdec XDATA1, XKEY + aesdec XDATA2, XKEY + aesdec XDATA3, XKEY + + aesdeclast XDATA0, XKEY10 ; 10. DEC + aesdeclast XDATA1, XKEY10 + aesdeclast XDATA2, XKEY10 + aesdeclast XDATA3, XKEY10 + + pxor XDATA0, XIV + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 + movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 + movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 + movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 + + movdqa XIV, XSAVED3 + + CMP IDX, LEN + jne main_loop + +done: +; Don't write back IV +; movdqu [IV], XIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm new file mode 100644 index 00000000..46c6cb3a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes128_cbc_mac_x4.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2017-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; Routine to compute CBC-MAC based on 128 bit CBC AES encryptionk code + +%define CBC_MAC +%include "aes_cbc_enc_128_x4.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm new file mode 100644 index 00000000..3df1f74f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes128_cntr_by4_sse.asm @@ -0,0 +1,331 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "memcpy.asm" + +; routine to do AES128 CNTR enc/decrypt "by4" +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 + +%define CONCAT(a,b) a %+ b +%define MOVDQ movdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xbyteswap xmm9 +%define xkey0 xmm10 +%define xkey3 xmm11 +%define xkey6 xmm12 +%define xkey9 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define p_ivlen r9 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define p_ivlen qword [rsp + 8*6] +%endif + +%define p_tmp rsp + _buffer +%define tmp r11 + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + movdqa xkey0, [p_keys + 0*16] +%endif + + movdqa xdata0, xcounter + pshufb xdata0, xbyteswap +%assign i 1 +%rep (%%by - 1) + movdqa CONCAT(xdata,i), xcounter + paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)] + pshufb CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + movdqa xkeyA, [p_keys + 1*16] + + pxor xdata0, xkey0 + paddd xcounter, [rel CONCAT(ddq_add_,%%by)] +%assign i 1 +%rep (%%by - 1) + pxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + movdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey3, [p_keys + 3*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + + movdqa xkeyB, [p_keys + 4*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey3 ; key 3 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 4 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey6, [p_keys + 6*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey6 ; key 6 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 8*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey9, [p_keys + 9*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 8 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey9 ; key 9 +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + aesenclast CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + MOVDQ xkeyB, [p_in + j*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA + pxor CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA +%endif + +%assign i 0 +%rep %%by + MOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +struc STACK +_buffer: resq 2 +_rsp_save: resq 1 +endstruc + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +section .text + +;; aes_cntr_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) +align 32 +MKGLOBAL(aes_cntr_128_sse,function,internal) +aes_cntr_128_sse: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] ; arg5 +%endif + + movdqa xbyteswap, [rel byteswap_const] + test p_ivlen, 16 + jnz iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + pinsrq xcounter, [p_IV], 0 + pinsrd xcounter, [p_IV + 8], 2 + pinsrd xcounter, DWORD(tmp), 3 +bswap_iv: + pshufb xcounter, xbyteswap + + mov tmp, num_bytes + and tmp, 3*16 + jz chk ; x4 > or < 15 (not 3 lines) + + ; 1 <= tmp <= 3 + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 ; 1 block + add p_out, 1*16 + jmp chk + +eq2: + do_aes_load 2 ; 2 blocks + add p_out, 2*16 + jmp chk + +eq3: + do_aes_load 3 ; 3 blocks + add p_out, 3*16 + ; fall through to chk +chk: + and num_bytes, ~(3*16) + jz do_return2 + cmp num_bytes, 16 + jb last + + ; process multiples of 4 blocks + movdqa xkey0, [p_keys + 0*16] + movdqa xkey3, [p_keys + 3*16] + movdqa xkey6, [p_keys + 6*16] + movdqa xkey9, [p_keys + 9*16] + jmp main_loop2 + +align 32 +main_loop2: + ; num_bytes is a multiple of 4 blocks + partial bytes + do_aes_noload 4 + add p_out, 4*16 + sub num_bytes, 4*16 + cmp num_bytes, 4*16 + jae main_loop2 + + test num_bytes, 15 ; partial bytes to be processed? + jnz last + +do_return2: + ; don't return updated IV +; pshufb xcounter, xbyteswap +; movdqu [p_IV], xcounter + ret + +last: + ;; Code dealing with the partial block cases + ; reserve 16 byte aligned buffer on the stack + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax ; save SP + + ; copy input bytes into scratch buffer + memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax + ; Encryption of a single partial block (p_tmp) + pshufb xcounter, xbyteswap + movdqa xdata0, xcounter + pxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 9 + aesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + aesenclast xdata0, [p_keys + 16*i] + ; xor keystream with the message (scratch) + pxor xdata0, [p_tmp] + movdqa [p_tmp], xdata0 + ; copy result into the output buffer + memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax + ; remove the stack frame + mov rsp, [rsp + _rsp_save] ; original SP + jmp do_return2 + +iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + movdqu xcounter, [p_IV] + jmp bswap_iv + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm new file mode 100644 index 00000000..dcc24b7c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes192_cbc_dec_by4_sse.asm @@ -0,0 +1,585 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES cbc decrypt on 16n bytes doing AES by 4 + +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +; void aes_cbc_dec_192_sse(void *in, +; UINT128 *IV, +; UINT128 keys[13], // +1 over key length +; void *out, +; UINT64 len_bytes); +; +; arg 1: IN: pointer to input (cipher text) +; arg 2: IV: pointer to IV +; arg 3: KEYS: pointer to keys +; arg 4: OUT: pointer to output (plain text) +; arg 5: LEN: length in bytes (multiple of 16) +; +%include "os.asm" + +%define MOVDQ movdqu + +%ifdef LINUX +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%else +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%endif + +%define IDX rax +%define TMP IDX +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XKEY0 xmm4 +%define XKEY2 xmm5 +%define XKEY4 xmm6 +%define XKEY6 xmm7 +%define XKEY10 xmm8 +%define XIV xmm9 +%define XSAVED0 xmm10 +%define XSAVED1 xmm11 +%define XSAVED2 xmm12 +%define XSAVED3 xmm13 +%define XKEY_A xmm14 +%define XKEY_B xmm15 + +%define IV_TMP XSAVED3 + +section .text + +MKGLOBAL(aes_cbc_dec_192_sse,function,internal) +aes_cbc_dec_192_sse: +%ifndef LINUX + mov LEN, [rsp + 8*5] +%endif + + mov TMP, LEN + and TMP, 3*16 + jz initial_4 + cmp TMP, 2*16 + jb initial_1 + ja initial_3 + +initial_2: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XIV, XDATA1 + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + aesdec XDATA1, [KEYS + 1*16] + + mov IDX, 2*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + aesdec XDATA1, [KEYS + 3*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + aesdec XDATA1, [KEYS + 5*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + aesdec XDATA1, [KEYS + 7*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + aesdec XDATA1, [KEYS + 9*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + + aesdec XDATA0, [KEYS + 11*16] ; 11. DEC + aesdec XDATA1, [KEYS + 11*16] + + aesdeclast XDATA0, [KEYS + 12*16] ; 12. DEC + aesdeclast XDATA1, [KEYS + 12*16] + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + + cmp LEN, 2*16 + je done + jmp main_loop + + + align 16 +initial_1: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XIV, XDATA0 + + pxor XDATA0, XKEY0 ; 0. ARK + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + + mov IDX, 1*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + + aesdec XDATA0, XKEY6 ; 6. DEC + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + + aesdec XDATA0, XKEY_B ; 8. DEC + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + + aesdec XDATA0, XKEY10 ; 10. DEC + + aesdec XDATA0, [KEYS + 11*16] ; 11. DEC + + aesdeclast XDATA0, [KEYS + 12*16] ; 12. DEC + + pxor XDATA0, IV_TMP + + movdqu [OUT + 0*16], XDATA0 + + cmp LEN, 1*16 + je done + jmp main_loop + + +initial_3: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XIV, XDATA2 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + mov IDX, 3*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 13*16] + + aesdeclast XDATA0, XKEY_B ; 12. DEC + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + + + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + + cmp LEN, 3*16 + je done + jmp main_loop + + + align 16 +initial_4: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + movdqu XDATA3, [IN + 3*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XIV, XDATA3 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + + mov IDX, 4*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + aesdec XDATA3, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + + + aesdeclast XDATA0, XKEY_B ; 12. DEC + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + aesdeclast XDATA3, XKEY_B + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + movdqu [OUT + 3*16], XDATA3 + + cmp LEN, 4*16 + jz done + jmp main_loop + + align 16 +main_loop: + ; load cipher text + movdqu XDATA0, [IN + IDX + 0*16] + movdqu XDATA1, [IN + IDX + 1*16] + movdqu XDATA2, [IN + IDX + 2*16] + movdqu XDATA3, [IN + IDX + 3*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XSAVED3, XDATA3 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + add IDX, 4*16 + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + aesdec XDATA3, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + aesdeclast XDATA0, XKEY_B ; 12. DECLAST + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + aesdeclast XDATA3, XKEY_B + + pxor XDATA0, XIV + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 + movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 + movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 + movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 + + movdqa XIV, XSAVED3 + + CMP IDX, LEN + jne main_loop + +done: +; Don't write back IV +; movdqu [IV], XIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm new file mode 100644 index 00000000..786604df --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes192_cntr_by4_sse.asm @@ -0,0 +1,346 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "memcpy.asm" + +; routine to do AES192 CNTR enc/decrypt "by4" +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 + +%define CONCAT(a,b) a %+ b +%define MOVDQ movdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xbyteswap xmm9 +%define xkey0 xmm10 +%define xkey4 xmm11 +%define xkey8 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define p_ivlen r9 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define p_ivlen qword [rsp + 8*6] +%endif + +%define tmp r11 +%define p_tmp rsp + _buffer + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + movdqa xkey0, [p_keys + 0*16] +%endif + + movdqa xdata0, xcounter + pshufb xdata0, xbyteswap +%assign i 1 +%rep (%%by - 1) + movdqa CONCAT(xdata,i), xcounter + paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)] + pshufb CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + movdqa xkeyA, [p_keys + 1*16] + + pxor xdata0, xkey0 + paddd xcounter, [rel CONCAT(ddq_add_,%%by)] +%assign i 1 +%rep (%%by - 1) + pxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + movdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 3*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + movdqa xkey4, [p_keys + 4*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 3 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey4 ; key 4 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 6*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 6 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey8, [p_keys + 8*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 9*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey8 ; key 8 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 9 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 11*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey12, [p_keys + 12*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 11 +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + aesenclast CONCAT(xdata,i), xkey12 ; key 12 +%assign i (i+1) +%endrep + +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + MOVDQ xkeyB, [p_in + j*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA + pxor CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA +%endif + +%assign i 0 +%rep %%by + MOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +struc STACK +_buffer: resq 2 +_rsp_save: resq 1 +endstruc + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +section .text + +;; aes_cntr_192_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) +align 32 +MKGLOBAL(aes_cntr_192_sse,function,internal) +aes_cntr_192_sse: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + movdqa xbyteswap, [rel byteswap_const] + test p_ivlen, 16 + jnz iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + pinsrq xcounter, [p_IV], 0 + pinsrd xcounter, [p_IV + 8], 2 + pinsrd xcounter, DWORD(tmp), 3 +bswap_iv: + pshufb xcounter, xbyteswap + + mov tmp, num_bytes + and tmp, 3*16 + jz chk ; x4 > or < 15 (not 3 lines) + + ; 1 <= tmp <= 3 + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + jmp chk + +eq2: + do_aes_load 2 + add p_out, 2*16 + jmp chk + +eq3: + do_aes_load 3 + add p_out, 3*16 + ; fall through to chk +chk: + and num_bytes, ~(3*16) + jz do_return2 + cmp num_bytes, 16 + jb last + + ; process multiples of 4 blocks + movdqa xkey0, [p_keys + 0*16] + movdqa xkey4, [p_keys + 4*16] + movdqa xkey8, [p_keys + 8*16] + movdqa xkey12, [p_keys + 12*16] + jmp main_loop2 + +align 32 +main_loop2: + ; num_bytes is a multiple of 4 and >0 + do_aes_noload 4 + add p_out, 4*16 + sub num_bytes, 4*16 + cmp num_bytes, 4*16 + jae main_loop2 + + test num_bytes, 15 ; partial bytes to be processed? + jnz last + +do_return2: + ; don't return updated IV +; pshufb xcounter, xbyteswap +; movdqu [p_IV], xcounter + ret + +last: + ;; Code dealing with the partial block cases + ; reserve 16 byte aligned buffer on stack + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax ; save SP + + ; copy input bytes into scratch buffer + memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax + ; Encryption of a single partial block (p_tmp) + pshufb xcounter, xbyteswap + movdqa xdata0, xcounter + pxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 11 + aesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + aesenclast xdata0, [p_keys + 16*i] + ; xor keystream with the message (scratch) + pxor xdata0, [p_tmp] + movdqa [p_tmp], xdata0 + ; copy result into the output buffer + memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax + ; remove the stack frame + mov rsp, [rsp + _rsp_save] ; original SP + jmp do_return2 + +iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + movdqu xcounter, [p_IV] + jmp bswap_iv + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm new file mode 100644 index 00000000..c9b4bd3a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes256_cbc_dec_by4_sse.asm @@ -0,0 +1,630 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; routine to do AES cbc decrypt on 16n bytes doing AES by 4 + +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +; void aes_cbc_dec_256_sse(void *in, +; UINT128 *IV, +; UINT128 keys[15], +; void *out, +; UINT64 len_bytes); +; +; arg 1: rcx: pointer to input (cipher text) +; arg 2: rdx: pointer to IV +; arg 3: r8: pointer to keys +; arg 4: r9: pointer to output (plain text) +; arg 5: sp: length in bytes (multiple of 16) +; + +%include "os.asm" + +%define MOVDQ movdqu + +%ifdef LINUX +%define IN rdi +%define IV rsi +%define KEYS rdx +%define OUT rcx +%define LEN r8 +%else +%define IN rcx +%define IV rdx +%define KEYS r8 +%define OUT r9 +%define LEN r10 +%endif + +%define IDX rax +%define TMP IDX +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 +%define XKEY0 xmm4 +%define XKEY2 xmm5 +%define XKEY4 xmm6 +%define XKEY6 xmm7 +%define XKEY10 xmm8 +%define XIV xmm9 +%define XSAVED0 xmm10 +%define XSAVED1 xmm11 +%define XSAVED2 xmm12 +%define XSAVED3 xmm13 +%define XKEY_A xmm14 +%define XKEY_B xmm15 + +%define IV_TMP XSAVED3 + +section .text + +MKGLOBAL(aes_cbc_dec_256_sse,function,internal) +aes_cbc_dec_256_sse: +%ifndef LINUX + mov LEN, [rsp + 8*5] +%endif + + mov TMP, LEN + and TMP, 3*16 + jz initial_4 + cmp TMP, 2*16 + jb initial_1 + ja initial_3 + +initial_2: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XIV, XDATA1 + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + aesdec XDATA1, [KEYS + 1*16] + + mov IDX, 2*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + aesdec XDATA1, [KEYS + 3*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + aesdec XDATA1, [KEYS + 5*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + aesdec XDATA1, [KEYS + 7*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + aesdec XDATA1, [KEYS + 9*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + + aesdec XDATA0, [KEYS + 11*16] ; 11. DEC + aesdec XDATA1, [KEYS + 11*16] + + aesdec XDATA0, [KEYS + 12*16] ; 12. DEC + aesdec XDATA1, [KEYS + 12*16] + + aesdec XDATA0, [KEYS + 13*16] ; 13. DEC + aesdec XDATA1, [KEYS + 13*16] + + aesdeclast XDATA0, [KEYS + 14*16] ; 14. DEC + aesdeclast XDATA1, [KEYS + 14*16] + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + + cmp LEN, 2*16 + je done + jmp main_loop + + + align 16 +initial_1: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XIV, XDATA0 + + pxor XDATA0, XKEY0 ; 0. ARK + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, [KEYS + 1*16] ; 1. DEC + + mov IDX, 1*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, [KEYS + 3*16] ; 3. DEC + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, [KEYS + 5*16] ; 5. DEC + + aesdec XDATA0, XKEY6 ; 6. DEC + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, [KEYS + 7*16] ; 7. DEC + + aesdec XDATA0, XKEY_B ; 8. DEC + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, [KEYS + 9*16] ; 9. DEC + + aesdec XDATA0, XKEY10 ; 10. DEC + + aesdec XDATA0, [KEYS + 11*16] ; 11. DEC + + aesdec XDATA0, [KEYS + 12*16] ; 12. DEC + + aesdec XDATA0, [KEYS + 13*16] ; 13. DEC + + aesdeclast XDATA0, [KEYS + 14*16] ; 14. DEC + + pxor XDATA0, IV_TMP + + movdqu [OUT + 0*16], XDATA0 + + cmp LEN, 1*16 + je done + jmp main_loop + + +initial_3: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XIV, XDATA2 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + mov IDX, 3*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + movdqa XKEY_A, [KEYS + 13*16] + + aesdec XDATA0, XKEY_B ; 12. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + + movdqa XKEY_B, [KEYS + 14*16] + + aesdec XDATA0, XKEY_A ; 13. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + + aesdeclast XDATA0, XKEY_B ; 14. DEC + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + + cmp LEN, 3*16 + je done + jmp main_loop + + + align 16 +initial_4: + ; load cipher text + movdqu XDATA0, [IN + 0*16] + movdqu XDATA1, [IN + 1*16] + movdqu XDATA2, [IN + 2*16] + movdqu XDATA3, [IN + 3*16] + + movdqa XKEY0, [KEYS + 0*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XIV, XDATA3 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + movdqa XKEY2, [KEYS + 2*16] + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + + mov IDX, 4*16 + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + movdqa XKEY4, [KEYS + 4*16] + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + movdqu IV_TMP, [IV] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + movdqa XKEY6, [KEYS + 6*16] + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + movdqa XKEY10, [KEYS + 10*16] + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + aesdec XDATA3, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 13*16] + + aesdec XDATA0, XKEY_B ; 12. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + movdqa XKEY_B, [KEYS + 14*16] + + aesdec XDATA0, XKEY_A ; 13. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + aesdeclast XDATA0, XKEY_B ; 14. DEC + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + aesdeclast XDATA3, XKEY_B + + pxor XDATA0, IV_TMP + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + 0*16], XDATA0 + movdqu [OUT + 1*16], XDATA1 + movdqu [OUT + 2*16], XDATA2 + movdqu [OUT + 3*16], XDATA3 + + cmp LEN, 4*16 + jz done + jmp main_loop + + align 16 +main_loop: + ; load cipher text + movdqu XDATA0, [IN + IDX + 0*16] + movdqu XDATA1, [IN + IDX + 1*16] + movdqu XDATA2, [IN + IDX + 2*16] + movdqu XDATA3, [IN + IDX + 3*16] + + ; save cipher text + movdqa XSAVED0, XDATA0 + movdqa XSAVED1, XDATA1 + movdqa XSAVED2, XDATA2 + movdqa XSAVED3, XDATA3 + + movdqa XKEY_A, [KEYS + 1*16] + + pxor XDATA0, XKEY0 ; 0. ARK + pxor XDATA1, XKEY0 + pxor XDATA2, XKEY0 + pxor XDATA3, XKEY0 + + add IDX, 4*16 + + aesdec XDATA0, XKEY_A ; 1. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 3*16] + + aesdec XDATA0, XKEY2 ; 2. DEC + aesdec XDATA1, XKEY2 + aesdec XDATA2, XKEY2 + aesdec XDATA3, XKEY2 + + aesdec XDATA0, XKEY_A ; 3. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 5*16] + + aesdec XDATA0, XKEY4 ; 4. DEC + aesdec XDATA1, XKEY4 + aesdec XDATA2, XKEY4 + aesdec XDATA3, XKEY4 + + aesdec XDATA0, XKEY_A ; 5. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 7*16] + + aesdec XDATA0, XKEY6 ; 6. DEC + aesdec XDATA1, XKEY6 + aesdec XDATA2, XKEY6 + aesdec XDATA3, XKEY6 + + movdqa XKEY_B, [KEYS + 8*16] + + aesdec XDATA0, XKEY_A ; 7. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 9*16] + + aesdec XDATA0, XKEY_B ; 8. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + aesdec XDATA0, XKEY_A ; 9. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 11*16] + + aesdec XDATA0, XKEY10 ; 10. DEC + aesdec XDATA1, XKEY10 + aesdec XDATA2, XKEY10 + aesdec XDATA3, XKEY10 + + movdqa XKEY_B, [KEYS + 12*16] + + aesdec XDATA0, XKEY_A ; 11. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + movdqa XKEY_A, [KEYS + 13*16] + + aesdec XDATA0, XKEY_B ; 12. DEC + aesdec XDATA1, XKEY_B + aesdec XDATA2, XKEY_B + aesdec XDATA3, XKEY_B + + movdqa XKEY_B, [KEYS + 14*16] + + aesdec XDATA0, XKEY_A ; 13. DEC + aesdec XDATA1, XKEY_A + aesdec XDATA2, XKEY_A + aesdec XDATA3, XKEY_A + + aesdeclast XDATA0, XKEY_B ; 14. DEC + aesdeclast XDATA1, XKEY_B + aesdeclast XDATA2, XKEY_B + aesdeclast XDATA3, XKEY_B + + pxor XDATA0, XIV + pxor XDATA1, XSAVED0 + pxor XDATA2, XSAVED1 + pxor XDATA3, XSAVED2 + + movdqu [OUT + IDX + 0*16 - 4*16], XDATA0 + movdqu [OUT + IDX + 1*16 - 4*16], XDATA1 + movdqu [OUT + IDX + 2*16 - 4*16], XDATA2 + movdqu [OUT + IDX + 3*16 - 4*16], XDATA3 + + movdqa XIV, XSAVED3 + + CMP IDX, LEN + jne main_loop + +done: +; Don't write back IV +; movdqu [IV], XIV + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm new file mode 100644 index 00000000..114f20bd --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes256_cntr_by4_sse.asm @@ -0,0 +1,360 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "memcpy.asm" + +; routine to do AES256 CNTR enc/decrypt "by4" +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +extern byteswap_const, ddq_add_1, ddq_add_2, ddq_add_3, ddq_add_4 + +%define CONCAT(a,b) a %+ b +%define MOVDQ movdqu + +%define xdata0 xmm0 +%define xdata1 xmm1 +%define xdata2 xmm2 +%define xdata3 xmm3 +%define xdata4 xmm4 +%define xdata5 xmm5 +%define xdata6 xmm6 +%define xdata7 xmm7 +%define xcounter xmm8 +%define xbyteswap xmm9 +%define xkey0 xmm10 +%define xkey4 xmm11 +%define xkey8 xmm12 +%define xkey12 xmm13 +%define xkeyA xmm14 +%define xkeyB xmm15 + +%ifdef LINUX +%define p_in rdi +%define p_IV rsi +%define p_keys rdx +%define p_out rcx +%define num_bytes r8 +%define p_ivlen r9 +%else +%define p_in rcx +%define p_IV rdx +%define p_keys r8 +%define p_out r9 +%define num_bytes r10 +%define p_ivlen qword [rsp + 8*6] +%endif + +%define tmp r11 +%define p_tmp rsp + _buffer + +%macro do_aes_load 1 + do_aes %1, 1 +%endmacro + +%macro do_aes_noload 1 + do_aes %1, 0 +%endmacro + + +; do_aes num_in_par load_keys +; This increments p_in, but not p_out +%macro do_aes 2 +%define %%by %1 +%define %%load_keys %2 + +%if (%%load_keys) + movdqa xkey0, [p_keys + 0*16] +%endif + + movdqa xdata0, xcounter + pshufb xdata0, xbyteswap +%assign i 1 +%rep (%%by - 1) + movdqa CONCAT(xdata,i), xcounter + paddd CONCAT(xdata,i), [rel CONCAT(ddq_add_,i)] + pshufb CONCAT(xdata,i), xbyteswap +%assign i (i + 1) +%endrep + + movdqa xkeyA, [p_keys + 1*16] + + pxor xdata0, xkey0 + paddd xcounter, [rel CONCAT(ddq_add_,%%by)] +%assign i 1 +%rep (%%by - 1) + pxor CONCAT(xdata,i), xkey0 +%assign i (i + 1) +%endrep + + movdqa xkeyB, [p_keys + 2*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 1 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 3*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 2 +%assign i (i+1) +%endrep + + add p_in, 16*%%by + +%if (%%load_keys) + movdqa xkey4, [p_keys + 4*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 3 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 5*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey4 ; key 4 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 6*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 5 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 7*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 6 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey8, [p_keys + 8*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 7 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 9*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey8 ; key 8 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 10*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 9 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 11*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyB ; key 10 +%assign i (i+1) +%endrep + +%if (%%load_keys) + movdqa xkey12, [p_keys + 12*16] +%endif +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 11 +%assign i (i+1) +%endrep + + movdqa xkeyA, [p_keys + 13*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkey12 ; key 12 +%assign i (i+1) +%endrep + + movdqa xkeyB, [p_keys + 14*16] +%assign i 0 +%rep %%by + aesenc CONCAT(xdata,i), xkeyA ; key 13 +%assign i (i+1) +%endrep + +%assign i 0 +%rep %%by + aesenclast CONCAT(xdata,i), xkeyB ; key 14 +%assign i (i+1) +%endrep + +%assign i 0 +%rep (%%by / 2) +%assign j (i+1) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + MOVDQ xkeyB, [p_in + j*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA + pxor CONCAT(xdata,j), xkeyB +%assign i (i+2) +%endrep +%if (i < %%by) + MOVDQ xkeyA, [p_in + i*16 - 16*%%by] + pxor CONCAT(xdata,i), xkeyA +%endif + +%assign i 0 +%rep %%by + MOVDQ [p_out + i*16], CONCAT(xdata,i) +%assign i (i+1) +%endrep +%endmacro + +struc STACK +_buffer: resq 2 +_rsp_save: resq 1 +endstruc + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +section .text + +;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) +align 32 +MKGLOBAL(aes_cntr_256_sse,function,internal) +aes_cntr_256_sse: + +%ifndef LINUX + mov num_bytes, [rsp + 8*5] +%endif + + movdqa xbyteswap, [rel byteswap_const] + test p_ivlen, 16 + jnz iv_is_16_bytes + ; Read 12 bytes: Nonce + ESP IV. Then pad with block counter 0x00000001 + mov DWORD(tmp), 0x01000000 + pinsrq xcounter, [p_IV], 0 + pinsrd xcounter, [p_IV + 8], 2 + pinsrd xcounter, DWORD(tmp), 3 +bswap_iv: + pshufb xcounter, xbyteswap + + mov tmp, num_bytes + and tmp, 3*16 + jz chk ; x4 > or < 15 (not 3 lines) + + ; 1 <= tmp <= 3 + cmp tmp, 2*16 + jg eq3 + je eq2 +eq1: + do_aes_load 1 + add p_out, 1*16 + jmp chk + +eq2: + do_aes_load 2 + add p_out, 2*16 + jmp chk + +eq3: + do_aes_load 3 + add p_out, 3*16 + ; fall through to chk +chk: + and num_bytes, ~(3*16) + jz do_return2 + cmp num_bytes, 16 + jb last + + ; process multiples of 4 blocks + movdqa xkey0, [p_keys + 0*16] + movdqa xkey4, [p_keys + 4*16] + movdqa xkey8, [p_keys + 8*16] + movdqa xkey12, [p_keys + 12*16] + jmp main_loop2 + +align 32 +main_loop2: + ; num_bytes is a multiple of 4 and >0 + do_aes_noload 4 + add p_out, 4*16 + sub num_bytes, 4*16 + cmp num_bytes, 4*16 + jae main_loop2 + + test num_bytes, 15 ; partial bytes to be processed? + jnz last + +do_return2: + ; don't return updated IV +; pshufb xcounter, xbyteswap +; movdqu [p_IV], xcounter + ret + +last: + ;; Code dealing with the partial block cases + ; reserve 16 byte aligned buffer on stack + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax ; save SP + + ; copy input bytes into scratch buffer + memcpy_sse_16_1 p_tmp, p_in, num_bytes, tmp, rax + ; Encryption of a single partial block (p_tmp) + pshufb xcounter, xbyteswap + movdqa xdata0, xcounter + pxor xdata0, [p_keys + 16*0] +%assign i 1 +%rep 13 + aesenc xdata0, [p_keys + 16*i] +%assign i (i+1) +%endrep + ; created keystream + aesenclast xdata0, [p_keys + 16*i] + ; xor keystream with the message (scratch) + pxor xdata0, [p_tmp] + movdqa [p_tmp], xdata0 + ; copy result into the output buffer + memcpy_sse_16_1 p_out, p_tmp, num_bytes, tmp, rax + ; remove the stack frame + mov rsp, [rsp + _rsp_save] ; original SP + jmp do_return2 + +iv_is_16_bytes: + ; Read 16 byte IV: Nonce + ESP IV + block counter (BE) + movdqu xcounter, [p_IV] + jmp bswap_iv + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm new file mode 100644 index 00000000..13c324be --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_128_x4.asm @@ -0,0 +1,366 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; Routine to do a 128 bit CBC AES encryption / CBC-MAC digest computation +;;; processes 4 buffers at a time, single data structure as input +;;; Updates In and Out pointers at end + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +%define MOVDQ movdqu ;; assume buffers not aligned +%macro pxor2 2 + MOVDQ XTMP, %2 + pxor %1, XTMP +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS_x8 { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_128_x4(AES_ARGS_x8 *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS_x8 structure +;; arg 2: LEN : len (in units of bytes) + +struc STACK +_gpr_save: resq 8 +endstruc + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 rdi ;r8 +%define arg4 rsi ;r9 +%endif + +%define ARG arg1 +%define LEN arg2 + +%define IDX rax + +%define IN0 r8 +%define KEYS0 rbx + +%define IN1 r10 +%define KEYS1 arg3 + +%define IN2 r12 +%define KEYS2 arg4 + +%define IN3 r14 +%define KEYS3 rbp + +%ifndef CBC_MAC +;; No cipher text write back for CBC-MAC +%define OUT0 r9 +%define OUT1 r11 +%define OUT2 r13 +%define OUT3 r15 +%endif + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 + +%define XKEY0_3 xmm4 +%define XKEY0_6 [KEYS0 + 16*6] +%define XTMP xmm5 +%define XKEY0_9 xmm6 + +%define XKEY1_3 xmm7 +%define XKEY1_6 xmm8 +%define XKEY1_9 xmm9 + +%define XKEY2_3 xmm10 +%define XKEY2_6 xmm11 +%define XKEY2_9 xmm12 + +%define XKEY3_3 xmm13 +%define XKEY3_6 xmm14 +%define XKEY3_9 xmm15 + +section .text + +%ifdef CBC_MAC +MKGLOBAL(aes128_cbc_mac_x4,function,internal) +aes128_cbc_mac_x4: +%else +MKGLOBAL(aes_cbc_enc_128_x4,function,internal) +aes_cbc_enc_128_x4: +%endif + sub rsp, STACK_size + mov [rsp + _gpr_save + 8*0], rbp +%ifdef CBC_MAC + mov [rsp + _gpr_save + 8*1], rbx + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif +%endif + mov IDX, 16 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN1, [ARG + _aesarg_in + 8*1] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN3, [ARG + _aesarg_in + 8*3] + + MOVDQ XDATA0, [IN0] ; load first block of plain text + MOVDQ XDATA1, [IN1] ; load first block of plain text + MOVDQ XDATA2, [IN2] ; load first block of plain text + MOVDQ XDATA3, [IN3] ; load first block of plain text + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + + pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + +%ifndef CBC_MAC + mov OUT0, [ARG + _aesarg_out + 8*0] + mov OUT1, [ARG + _aesarg_out + 8*1] + mov OUT2, [ARG + _aesarg_out + 8*2] + mov OUT3, [ARG + _aesarg_out + 8*3] +%endif + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key + movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key + movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key + movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key + movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key + movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key + movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key + movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + +%ifndef CBC_MAC + MOVDQ [OUT0], XDATA0 ; write back ciphertext + MOVDQ [OUT1], XDATA1 ; write back ciphertext + MOVDQ [OUT2], XDATA2 ; write back ciphertext + MOVDQ [OUT3], XDATA3 ; write back ciphertext +%endif + cmp LEN, IDX + je done + +main_loop: + pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV + pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV + pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV + pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + +%ifndef CBC_MAC + ;; No cipher text write back for CBC-MAC + MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext + MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertext + MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertext + MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertext +%endif + + add IDX, 16 + cmp LEN, IDX + jne main_loop + +done: + ;; update IV / store digest for CBC-MAC + movdqa [ARG + _aesarg_IV + 16*0], XDATA0 + movdqa [ARG + _aesarg_IV + 16*1], XDATA1 + movdqa [ARG + _aesarg_IV + 16*2], XDATA2 + movdqa [ARG + _aesarg_IV + 16*3], XDATA3 + + ;; update IN and OUT + add IN0, LEN + mov [ARG + _aesarg_in + 8*0], IN0 + add IN1, LEN + mov [ARG + _aesarg_in + 8*1], IN1 + add IN2, LEN + mov [ARG + _aesarg_in + 8*2], IN2 + add IN3, LEN + mov [ARG + _aesarg_in + 8*3], IN3 + +%ifndef CBC_MAC + ;; No OUT pointer updates for CBC-MAC + add OUT0, LEN + mov [ARG + _aesarg_out + 8*0], OUT0 + add OUT1, LEN + mov [ARG + _aesarg_out + 8*1], OUT1 + add OUT2, LEN + mov [ARG + _aesarg_out + 8*2], OUT2 + add OUT3, LEN + mov [ARG + _aesarg_out + 8*3], OUT3 +%endif + +%ifdef CBC_MAC + mov rbx, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif +%endif + mov rbp, [rsp + _gpr_save + 8*0] + add rsp, STACK_size + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm new file mode 100644 index 00000000..e442d633 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_192_x4.asm @@ -0,0 +1,345 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; routine to do a 192 bit CBC AES encrypt +;;; process 4 buffers at a time, single data structure as input +;;; Updates In and Out pointers at end + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +%define MOVDQ movdqu ;; assume buffers not aligned +%macro pxor2 2 + MOVDQ XTMP, %2 + pxor %1, XTMP +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS_x8 { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_192_x4(AES_ARGS_x8 *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS_x8 structure +;; arg 2: LEN : len (in units of bytes) + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax + +%define IN0 r8 +%define KEYS0 rbx +%define OUT0 r9 + +%define IN1 r10 +%define KEYS1 REG3 +%define OUT1 r11 + +%define IN2 r12 +%define KEYS2 REG4 +%define OUT2 r13 + +%define IN3 r14 +%define KEYS3 rbp +%define OUT3 r15 + + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 + +%define XKEY0_3 xmm4 +%define XKEY0_6 [KEYS0 + 16*6] +%define XTMP xmm5 +%define XKEY0_9 xmm6 + +%define XKEY1_3 xmm7 +%define XKEY1_6 xmm8 +%define XKEY1_9 xmm9 + +%define XKEY2_3 xmm10 +%define XKEY2_6 xmm11 +%define XKEY2_9 xmm12 + +%define XKEY3_3 xmm13 +%define XKEY3_6 xmm14 +%define XKEY3_9 xmm15 + +section .text + +MKGLOBAL(aes_cbc_enc_192_x4,function,internal) +aes_cbc_enc_192_x4: + + push rbp + + mov IDX, 16 + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN1, [ARG + _aesarg_in + 8*1] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN3, [ARG + _aesarg_in + 8*3] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + MOVDQ XDATA0, [IN0] ; load first block of plain text + MOVDQ XDATA1, [IN1] ; load first block of plain text + MOVDQ XDATA2, [IN2] ; load first block of plain text + MOVDQ XDATA3, [IN3] ; load first block of plain text + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + + pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + + mov OUT0, [ARG + _aesarg_out + 8*0] + mov OUT1, [ARG + _aesarg_out + 8*1] + mov OUT2, [ARG + _aesarg_out + 8*2] + mov OUT3, [ARG + _aesarg_out + 8*3] + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key + movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key + movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key + movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key + movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key + movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key + movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key + movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + + aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + + aesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC + aesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC + aesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC + aesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC + + MOVDQ [OUT0], XDATA0 ; write back ciphertext + MOVDQ [OUT1], XDATA1 ; write back ciphertext + MOVDQ [OUT2], XDATA2 ; write back ciphertext + MOVDQ [OUT3], XDATA3 ; write back ciphertext + + cmp LEN, IDX + je done + +main_loop: + pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV + pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV + pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV + pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV + + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + + aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + + aesenclast XDATA0, [KEYS0 + 16*12] ; 12. ENC + aesenclast XDATA1, [KEYS1 + 16*12] ; 12. ENC + aesenclast XDATA2, [KEYS2 + 16*12] ; 12. ENC + aesenclast XDATA3, [KEYS3 + 16*12] ; 12. ENC + + + + MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext + MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertex + MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertex + MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertex + + + add IDX, 16 + cmp LEN, IDX + jne main_loop + +done: + ;; update IV + movdqa [ARG + _aesarg_IV + 16*0], XDATA0 + movdqa [ARG + _aesarg_IV + 16*1], XDATA1 + movdqa [ARG + _aesarg_IV + 16*2], XDATA2 + movdqa [ARG + _aesarg_IV + 16*3], XDATA3 + + ;; update IN and OUT + add IN0, LEN + mov [ARG + _aesarg_in + 8*0], IN0 + add IN1, LEN + mov [ARG + _aesarg_in + 8*1], IN1 + add IN2, LEN + mov [ARG + _aesarg_in + 8*2], IN2 + add IN3, LEN + mov [ARG + _aesarg_in + 8*3], IN3 + + add OUT0, LEN + mov [ARG + _aesarg_out + 8*0], OUT0 + add OUT1, LEN + mov [ARG + _aesarg_out + 8*1], OUT1 + add OUT2, LEN + mov [ARG + _aesarg_out + 8*2], OUT2 + add OUT3, LEN + mov [ARG + _aesarg_out + 8*3], OUT3 + + pop rbp + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm new file mode 100644 index 00000000..937e2dbe --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_cbc_enc_256_x4.asm @@ -0,0 +1,364 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; routine to do a 256 bit CBC AES encrypt +;;; process 4 buffers at a time, single data structure as input +;;; Updates In and Out pointers at end + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +%define MOVDQ movdqu ;; assume buffers not aligned +%macro pxor2 2 + MOVDQ XTMP, %2 + pxor %1, XTMP +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_ARGS_x8 { +;; void* in[8]; +;; void* out[8]; +;; UINT128* keys[8]; +;; UINT128 IV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cbc_enc_256_x4(AES_ARGS_x8 *args, UINT64 len); +;; arg 1: ARG : addr of AES_ARGS_x8 structure +;; arg 2: LEN : len (in units of bytes) + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax + +%define IN0 r8 +%define KEYS0 rbx +%define OUT0 r9 + +%define IN1 r10 +%define KEYS1 REG3 +%define OUT1 r11 + +%define IN2 r12 +%define KEYS2 REG4 +%define OUT2 r13 + +%define IN3 r14 +%define KEYS3 rbp +%define OUT3 r15 + + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 + +%define XKEY0_3 xmm4 +%define XKEY0_6 [KEYS0 + 16*6] +%define XTMP xmm5 +%define XKEY0_9 xmm6 + +%define XKEY1_3 xmm7 +%define XKEY1_6 xmm8 +%define XKEY1_9 xmm9 + +%define XKEY2_3 xmm10 +%define XKEY2_6 xmm11 +%define XKEY2_9 xmm12 + +%define XKEY3_3 xmm13 +%define XKEY3_6 xmm14 +%define XKEY3_9 xmm15 + +section .text + +MKGLOBAL(aes_cbc_enc_256_x4,function,internal) +aes_cbc_enc_256_x4: + + push rbp + + mov IDX, 16 + + mov IN0, [ARG + _aesarg_in + 8*0] + mov IN1, [ARG + _aesarg_in + 8*1] + mov IN2, [ARG + _aesarg_in + 8*2] + mov IN3, [ARG + _aesarg_in + 8*3] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + MOVDQ XDATA0, [IN0] ; load first block of plain text + MOVDQ XDATA1, [IN1] ; load first block of plain text + MOVDQ XDATA2, [IN2] ; load first block of plain text + MOVDQ XDATA3, [IN3] ; load first block of plain text + + mov KEYS0, [ARG + _aesarg_keys + 8*0] + mov KEYS1, [ARG + _aesarg_keys + 8*1] + mov KEYS2, [ARG + _aesarg_keys + 8*2] + mov KEYS3, [ARG + _aesarg_keys + 8*3] + + pxor XDATA0, [ARG + _aesarg_IV + 16*0] ; plaintext XOR IV + pxor XDATA1, [ARG + _aesarg_IV + 16*1] ; plaintext XOR IV + pxor XDATA2, [ARG + _aesarg_IV + 16*2] ; plaintext XOR IV + pxor XDATA3, [ARG + _aesarg_IV + 16*3] ; plaintext XOR IV + + mov OUT0, [ARG + _aesarg_out + 8*0] + mov OUT1, [ARG + _aesarg_out + 8*1] + mov OUT2, [ARG + _aesarg_out + 8*2] + mov OUT3, [ARG + _aesarg_out + 8*3] + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key + movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key + movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key + movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key + movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key + movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key + movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key + movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + + aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + + aesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC + aesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC + aesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC + aesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC + + aesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC + aesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC + aesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC + aesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC + + aesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC + aesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC + aesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC + aesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC + + MOVDQ [OUT0], XDATA0 ; write back ciphertext + MOVDQ [OUT1], XDATA1 ; write back ciphertext + MOVDQ [OUT2], XDATA2 ; write back ciphertext + MOVDQ [OUT3], XDATA3 ; write back ciphertext + + cmp LEN, IDX + je done + +main_loop: + pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR IV + pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR IV + pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR IV + pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR IV + + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenc XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenc XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenc XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenc XDATA3, [KEYS3 + 16*10] ; 10. ENC + + aesenc XDATA0, [KEYS0 + 16*11] ; 11. ENC + aesenc XDATA1, [KEYS1 + 16*11] ; 11. ENC + aesenc XDATA2, [KEYS2 + 16*11] ; 11. ENC + aesenc XDATA3, [KEYS3 + 16*11] ; 11. ENC + + aesenc XDATA0, [KEYS0 + 16*12] ; 12. ENC + aesenc XDATA1, [KEYS1 + 16*12] ; 12. ENC + aesenc XDATA2, [KEYS2 + 16*12] ; 12. ENC + aesenc XDATA3, [KEYS3 + 16*12] ; 12. ENC + + aesenc XDATA0, [KEYS0 + 16*13] ; 13. ENC + aesenc XDATA1, [KEYS1 + 16*13] ; 13. ENC + aesenc XDATA2, [KEYS2 + 16*13] ; 13. ENC + aesenc XDATA3, [KEYS3 + 16*13] ; 13. ENC + + aesenclast XDATA0, [KEYS0 + 16*14] ; 14. ENC + aesenclast XDATA1, [KEYS1 + 16*14] ; 14. ENC + aesenclast XDATA2, [KEYS2 + 16*14] ; 14. ENC + aesenclast XDATA3, [KEYS3 + 16*14] ; 14. ENC + + + MOVDQ [OUT0 + IDX], XDATA0 ; write back ciphertext + MOVDQ [OUT1 + IDX], XDATA1 ; write back ciphertex + MOVDQ [OUT2 + IDX], XDATA2 ; write back ciphertex + MOVDQ [OUT3 + IDX], XDATA3 ; write back ciphertex + + + add IDX, 16 + cmp LEN, IDX + jne main_loop + +done: + ;; update IV + movdqa [ARG + _aesarg_IV + 16*0], XDATA0 + movdqa [ARG + _aesarg_IV + 16*1], XDATA1 + movdqa [ARG + _aesarg_IV + 16*2], XDATA2 + movdqa [ARG + _aesarg_IV + 16*3], XDATA3 + + ;; update IN and OUT + add IN0, LEN + mov [ARG + _aesarg_in + 8*0], IN0 + add IN1, LEN + mov [ARG + _aesarg_in + 8*1], IN1 + add IN2, LEN + mov [ARG + _aesarg_in + 8*2], IN2 + add IN3, LEN + mov [ARG + _aesarg_in + 8*3], IN3 + + add OUT0, LEN + mov [ARG + _aesarg_out + 8*0], OUT0 + add OUT1, LEN + mov [ARG + _aesarg_out + 8*1], OUT1 + add OUT2, LEN + mov [ARG + _aesarg_out + 8*2], OUT2 + add OUT3, LEN + mov [ARG + _aesarg_out + 8*3], OUT3 + + pop rbp + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm b/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm new file mode 100644 index 00000000..0b160c23 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_cfb_128_sse.asm @@ -0,0 +1,162 @@ +;; +;; Copyright (c) 2017-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "memcpy.asm" + +;;; Routine to do 128 bit CFB AES encrypt/decrypt operations on one block only. +;;; It processes only one buffer at a time. +;;; It is designed to manage partial blocks of DOCSIS 3.1 SEC BPI + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RAX R9 R10 R11 +;; Windows preserves: RBX RCX RDX RBP RSI RDI R8 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RAX R9 R10 +;; Linux preserves: RBX RCX RDX RBP RSI RDI R8 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 +;; + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%define arg5 r8 +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%define arg5 [rsp + 5*8] +%endif + +%define OUT arg1 +%define IN arg2 +%define IV arg3 +%define KEYS arg4 +%ifdef LINUX +%define LEN arg5 +%else +%define LEN2 arg5 +%define LEN r11 +%endif + +%define TMP0 rax +%define TMP1 r10 +%define PTR0 rsp + _buffer + +%define XDATA xmm0 + +section .text + +struc STACK +_buffer: resq 2 +_rsp_save: resq 1 +endstruc + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_cfb_128_one(void *out, void *in, void *iv, void *keys) +;; arg 1: OUT : addr to put clear/cipher text out +;; arg 2: IN : addr to take cipher/clear text from +;; arg 3: IV : initialization vector +;; arg 4: KEYS: pointer to expanded keys structure (16 byte aligned) +;; arg 5: LEN: length of the text to encrypt/decrypt (valid range is 0 to 16) +;; +;; AES CFB128 one block encrypt/decrypt implementation. +;; The function doesn't update IV. The result of operation can be found in OUT. +;; +;; It is primarly designed to process partial block of +;; DOCSIS 3.1 AES Packet PDU Encryption (I.10) +;; +;; It process up to one block only (up to 16 bytes). +;; +;; It makes sure not to read more than LEN bytes from IN and +;; not to store more than LEN bytes to OUT. + +MKGLOBAL(aes_cfb_128_one_sse,function,) +align 32 +aes_cfb_128_one_sse: +%ifndef LINUX + mov LEN, LEN2 +%endif + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax + + test LEN, 16 + jz copy_in_lt16 + movdqu XDATA, [IN] + movdqa [PTR0], XDATA + jmp copy_in_end +copy_in_lt16: + memcpy_sse_16 PTR0, IN, LEN, TMP0, TMP1 +copy_in_end: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + movdqu XDATA, [IV] ; IV (or next to last block) + pxor XDATA, [KEYS + 16*0] ; 0. ARK + aesenc XDATA, [KEYS + 16*1] ; 1. ENC + aesenc XDATA, [KEYS + 16*2] ; 2. ENC + aesenc XDATA, [KEYS + 16*3] ; 3. ENC + aesenc XDATA, [KEYS + 16*4] ; 4. ENC + aesenc XDATA, [KEYS + 16*5] ; 5. ENC + aesenc XDATA, [KEYS + 16*6] ; 6. ENC + aesenc XDATA, [KEYS + 16*7] ; 7. ENC + aesenc XDATA, [KEYS + 16*8] ; 8. ENC + aesenc XDATA, [KEYS + 16*9] ; 9. ENC + aesenclast XDATA, [KEYS + 16*10] ; 10. ENC + + pxor XDATA, [PTR0] ; plaintext/ciphertext XOR block cipher encryption + + test LEN, 16 + jz copy_out_lt16 + movdqu [OUT], XDATA + jmp copy_out_end +copy_out_lt16: + movdqa [PTR0], XDATA + memcpy_sse_16 OUT, PTR0, LEN, TMP0, TMP1 +copy_out_end: + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm b/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm new file mode 100644 index 00000000..b806a817 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/aes_xcbc_mac_128_x4.asm @@ -0,0 +1,298 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;;; routine to do 128 bit AES XCBC +;;; process 4 buffers at a time, single data structure as input +;;; Updates In pointer at end + +;; clobbers all registers except for ARG1 and rbp + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +%define MOVDQ movdqu ;; assume buffers not aligned +%macro pxor2 2 + MOVDQ XTMP, %2 + pxor %1, XTMP +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; struct AES_XCBC_ARGS_x8 { +;; void* in[8]; +;; UINT128* keys[8]; +;; UINT128 ICV[8]; +;; } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void aes_xcbc_mac_128_x4(AES_XCBC_ARGS_x8 *args, UINT64 len); +;; arg 1: ARG : addr of AES_XCBC_ARGS_x8 structure +;; arg 2: LEN : len (in units of bytes) + +%ifdef LINUX +%define ARG rdi +%define LEN rsi +%define REG3 rcx +%define REG4 rdx +%else +%define ARG rcx +%define LEN rdx +%define REG3 rsi +%define REG4 rdi +%endif + +%define IDX rax + +%define IN0 r8 +%define KEYS0 rbx +%define OUT0 r9 + +%define IN1 r10 +%define KEYS1 REG3 +%define OUT1 r11 + +%define IN2 r12 +%define KEYS2 REG4 +%define OUT2 r13 + +%define IN3 r14 +%define KEYS3 rbp +%define OUT3 r15 + + +%define XDATA0 xmm0 +%define XDATA1 xmm1 +%define XDATA2 xmm2 +%define XDATA3 xmm3 + +%define XKEY0_3 xmm4 +%define XKEY0_6 [KEYS0 + 16*6] +%define XTMP xmm5 +%define XKEY0_9 xmm6 + +%define XKEY1_3 xmm7 +%define XKEY1_6 xmm8 +%define XKEY1_9 xmm9 + +%define XKEY2_3 xmm10 +%define XKEY2_6 xmm11 +%define XKEY2_9 xmm12 + +%define XKEY3_3 xmm13 +%define XKEY3_6 xmm14 +%define XKEY3_9 xmm15 + +section .text + +MKGLOBAL(aes_xcbc_mac_128_x4,function,internal) +aes_xcbc_mac_128_x4: + + push rbp + + mov IDX, 16 + + mov IN0, [ARG + _aesxcbcarg_in + 8*0] + mov IN1, [ARG + _aesxcbcarg_in + 8*1] + mov IN2, [ARG + _aesxcbcarg_in + 8*2] + mov IN3, [ARG + _aesxcbcarg_in + 8*3] + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + MOVDQ XDATA0, [IN0] ; load first block of plain text + MOVDQ XDATA1, [IN1] ; load first block of plain text + MOVDQ XDATA2, [IN2] ; load first block of plain text + MOVDQ XDATA3, [IN3] ; load first block of plain text + + mov KEYS0, [ARG + _aesxcbcarg_keys + 8*0] + mov KEYS1, [ARG + _aesxcbcarg_keys + 8*1] + mov KEYS2, [ARG + _aesxcbcarg_keys + 8*2] + mov KEYS3, [ARG + _aesxcbcarg_keys + 8*3] + + pxor XDATA0, [ARG + _aesxcbcarg_ICV + 16*0] ; plaintext XOR ICV + pxor XDATA1, [ARG + _aesxcbcarg_ICV + 16*1] ; plaintext XOR ICV + pxor XDATA2, [ARG + _aesxcbcarg_ICV + 16*2] ; plaintext XOR ICV + pxor XDATA3, [ARG + _aesxcbcarg_ICV + 16*3] ; plaintext XOR ICV + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + movdqa XKEY0_3, [KEYS0 + 16*3] ; load round 3 key + movdqa XKEY1_3, [KEYS1 + 16*3] ; load round 3 key + movdqa XKEY2_3, [KEYS2 + 16*3] ; load round 3 key + movdqa XKEY3_3, [KEYS3 + 16*3] ; load round 3 key + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + movdqa XKEY1_6, [KEYS1 + 16*6] ; load round 6 key + movdqa XKEY2_6, [KEYS2 + 16*6] ; load round 6 key + movdqa XKEY3_6, [KEYS3 + 16*6] ; load round 6 key + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + movdqa XKEY0_9, [KEYS0 + 16*9] ; load round 9 key + movdqa XKEY1_9, [KEYS1 + 16*9] ; load round 9 key + movdqa XKEY2_9, [KEYS2 + 16*9] ; load round 9 key + movdqa XKEY3_9, [KEYS3 + 16*9] ; load round 9 key + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + + cmp LEN, IDX + je done + +main_loop: + pxor2 XDATA0, [IN0 + IDX] ; plaintext XOR ICV + pxor2 XDATA1, [IN1 + IDX] ; plaintext XOR ICV + pxor2 XDATA2, [IN2 + IDX] ; plaintext XOR ICV + pxor2 XDATA3, [IN3 + IDX] ; plaintext XOR ICV + + pxor XDATA0, [KEYS0 + 16*0] ; 0. ARK + pxor XDATA1, [KEYS1 + 16*0] ; 0. ARK + pxor XDATA2, [KEYS2 + 16*0] ; 0. ARK + pxor XDATA3, [KEYS3 + 16*0] ; 0. ARK + + aesenc XDATA0, [KEYS0 + 16*1] ; 1. ENC + aesenc XDATA1, [KEYS1 + 16*1] ; 1. ENC + aesenc XDATA2, [KEYS2 + 16*1] ; 1. ENC + aesenc XDATA3, [KEYS3 + 16*1] ; 1. ENC + + aesenc XDATA0, [KEYS0 + 16*2] ; 2. ENC + aesenc XDATA1, [KEYS1 + 16*2] ; 2. ENC + aesenc XDATA2, [KEYS2 + 16*2] ; 2. ENC + aesenc XDATA3, [KEYS3 + 16*2] ; 2. ENC + + aesenc XDATA0, XKEY0_3 ; 3. ENC + aesenc XDATA1, XKEY1_3 ; 3. ENC + aesenc XDATA2, XKEY2_3 ; 3. ENC + aesenc XDATA3, XKEY3_3 ; 3. ENC + + aesenc XDATA0, [KEYS0 + 16*4] ; 4. ENC + aesenc XDATA1, [KEYS1 + 16*4] ; 4. ENC + aesenc XDATA2, [KEYS2 + 16*4] ; 4. ENC + aesenc XDATA3, [KEYS3 + 16*4] ; 4. ENC + + aesenc XDATA0, [KEYS0 + 16*5] ; 5. ENC + aesenc XDATA1, [KEYS1 + 16*5] ; 5. ENC + aesenc XDATA2, [KEYS2 + 16*5] ; 5. ENC + aesenc XDATA3, [KEYS3 + 16*5] ; 5. ENC + + aesenc XDATA0, XKEY0_6 ; 6. ENC + aesenc XDATA1, XKEY1_6 ; 6. ENC + aesenc XDATA2, XKEY2_6 ; 6. ENC + aesenc XDATA3, XKEY3_6 ; 6. ENC + + aesenc XDATA0, [KEYS0 + 16*7] ; 7. ENC + aesenc XDATA1, [KEYS1 + 16*7] ; 7. ENC + aesenc XDATA2, [KEYS2 + 16*7] ; 7. ENC + aesenc XDATA3, [KEYS3 + 16*7] ; 7. ENC + + aesenc XDATA0, [KEYS0 + 16*8] ; 8. ENC + aesenc XDATA1, [KEYS1 + 16*8] ; 8. ENC + aesenc XDATA2, [KEYS2 + 16*8] ; 8. ENC + aesenc XDATA3, [KEYS3 + 16*8] ; 8. ENC + + aesenc XDATA0, XKEY0_9 ; 9. ENC + aesenc XDATA1, XKEY1_9 ; 9. ENC + aesenc XDATA2, XKEY2_9 ; 9. ENC + aesenc XDATA3, XKEY3_9 ; 9. ENC + + aesenclast XDATA0, [KEYS0 + 16*10] ; 10. ENC + aesenclast XDATA1, [KEYS1 + 16*10] ; 10. ENC + aesenclast XDATA2, [KEYS2 + 16*10] ; 10. ENC + aesenclast XDATA3, [KEYS3 + 16*10] ; 10. ENC + + add IDX, 16 + cmp LEN, IDX + jne main_loop + +done: + ;; update ICV + movdqa [ARG + _aesxcbcarg_ICV + 16*0], XDATA0 + movdqa [ARG + _aesxcbcarg_ICV + 16*1], XDATA1 + movdqa [ARG + _aesxcbcarg_ICV + 16*2], XDATA2 + movdqa [ARG + _aesxcbcarg_ICV + 16*3], XDATA3 + + ;; update IN + add IN0, LEN + mov [ARG + _aesxcbcarg_in + 8*0], IN0 + add IN1, LEN + mov [ARG + _aesxcbcarg_in + 8*1], IN1 + add IN2, LEN + mov [ARG + _aesxcbcarg_in + 8*2], IN2 + add IN3, LEN + mov [ARG + _aesxcbcarg_in + 8*3], IN3 + + pop rbp + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm new file mode 100644 index 00000000..c622e726 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/gcm128_sse.asm @@ -0,0 +1,30 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define GCM128_MODE 1 +%include "gcm_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm new file mode 100644 index 00000000..c37c3be6 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/gcm192_sse.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2017-2018, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM192_MODE 1 +%include "gcm_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm new file mode 100644 index 00000000..646020b5 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/gcm256_sse.asm @@ -0,0 +1,31 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM256_MODE 1 +%include "gcm_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm b/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm new file mode 100644 index 00000000..6b283608 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/gcm_sse.asm @@ -0,0 +1,2166 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2011-2018 Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; Authors: +; Erdinc Ozturk +; Vinodh Gopal +; James Guilford +; +; +; References: +; This code was derived and highly optimized from the code described in paper: +; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010 +; +; For the shift-based reductions used in this code, we used the method described in paper: +; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010. +; +; +; +; +; Assumptions: +; +; +; +; iv: +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Salt (From the SA) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | Initialization Vector | +; | (This is the sequence number from IPSec header) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x1 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; +; +; AAD: +; AAD will be padded with 0 to the next 16byte multiple +; for example, assume AAD is a u32 vector +; +; if AAD is 8 bytes: +; AAD[3] = {A0, A1}; +; padded AAD in xmm register = {A1 A0 0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A1) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 32-bit Sequence Number (A0) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 32-bit Sequence Number +; +; if AAD is 12 bytes: +; AAD[3] = {A0, A1, A2}; +; padded AAD in xmm register = {A2 A1 A0 0} +; +; 0 1 2 3 +; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | SPI (A2) | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 64-bit Extended Sequence Number {A1,A0} | +; | | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; | 0x0 | +; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +; +; AAD Format with 64-bit Extended Sequence Number +; +; +; aadLen: +; Must be a multiple of 4 bytes and from the definition of the spec. +; The code additionally supports any aadLen length. +; +; TLen: +; from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +; +; poly = x^128 + x^127 + x^126 + x^121 + 1 +; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part. +; + +%include "os.asm" +%include "reg_sizes.asm" +%include "gcm_defines.asm" + +%ifndef GCM128_MODE +%ifndef GCM192_MODE +%ifndef GCM256_MODE +%error "No GCM mode selected for gcm_sse.asm!" +%endif +%endif +%endif + +%ifdef GCM128_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ sse +%define NROUNDS 9 +%endif + +%ifdef GCM192_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ sse +%define NROUNDS 11 +%endif + +%ifdef GCM256_MODE +%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ sse +%define NROUNDS 13 +%endif + +default rel +; need to push 4 registers into stack to maintain +%define STACK_OFFSET 8*4 + +%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) +%define TMP3 16*1 ; Temporary storage for AES State 3 +%define TMP4 16*2 ; Temporary storage for AES State 4 +%define TMP5 16*3 ; Temporary storage for AES State 5 +%define TMP6 16*4 ; Temporary storage for AES State 6 +%define TMP7 16*5 ; Temporary storage for AES State 7 +%define TMP8 16*6 ; Temporary storage for AES State 8 + +%define LOCAL_STORAGE 16*7 + +%ifidn __OUTPUT_FORMAT__, win64 + %define XMM_STORAGE 16*10 +%else + %define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Utility Macros +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +; Input: A and B (128-bits each, bit-reflected) +; Output: C = A*B*x mod poly, (i.e. >>1 ) +; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GHASH_MUL 7 +%define %%GH %1 ; 16 Bytes +%define %%HK %2 ; 16 Bytes +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 + ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Karatsuba Method + movdqa %%T1, %%GH + pshufd %%T2, %%GH, 01001110b + pshufd %%T3, %%HK, 01001110b + pxor %%T2, %%GH ; %%T2 = (a1+a0) + pxor %%T3, %%HK ; %%T3 = (b1+b0) + + pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0 + pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T2, %%GH + pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0 + + movdqa %%T3, %%T2 + pslldq %%T3, 8 ; shift-L %%T3 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%GH, %%T3 + pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK + + + ;first phase of the reduction + movdqa %%T2, %%GH + movdqa %%T3, %%GH + movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T4, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T4 + + movdqa %%T5, %%T2 + psrldq %%T5, 4 ; shift-R %%T5 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%GH, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + movdqa %%T2,%%GH ; make 3 copies of %%GH (in in %%T2, %%T3, %%T4) for doing three shift operations + movdqa %%T3,%%GH + movdqa %%T4,%%GH + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T4,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T4 + + pxor %%T2, %%T5 + pxor %%GH, %%T2 + pxor %%GH, %%T1 ; the result is in %%T1 + + +%endmacro + + +%macro PRECOMPUTE 8 +%define %%GDATA %1 +%define %%HK %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 +%define %%T4 %6 +%define %%T5 %7 +%define %%T6 %8 + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + movdqa %%T4, %%HK + pshufd %%T1, %%HK, 01001110b + pxor %%T1, %%HK + movdqu [%%GDATA + HashKey_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly + movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_2_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly + movdqu [%%GDATA + HashKey_3], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_3_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly + movdqu [%%GDATA + HashKey_4], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_4_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly + movdqu [%%GDATA + HashKey_5], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_5_k], %%T1 + + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly + movdqu [%%GDATA + HashKey_6], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_6_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly + movdqu [%%GDATA + HashKey_7], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_7_k], %%T1 + + GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly + movdqu [%%GDATA + HashKey_8], %%T4 + pshufd %%T1, %%T4, 01001110b + pxor %%T1, %%T4 + movdqu [%%GDATA + HashKey_8_k], %%T1 + + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes. +; Returns 0 if data has length 0. +; Input: The input data (INPUT), that data's length (LENGTH). +; Output: The packed xmm register (OUTPUT). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro READ_SMALL_DATA_INPUT 6 +%define %%OUTPUT %1 ; %%OUTPUT is an xmm register +%define %%INPUT %2 +%define %%LENGTH %3 +%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers +%define %%COUNTER %5 +%define %%TMP1 %6 + + pxor %%OUTPUT, %%OUTPUT + mov %%COUNTER, %%LENGTH + mov %%END_READ_LOCATION, %%INPUT + add %%END_READ_LOCATION, %%LENGTH + xor %%TMP1, %%TMP1 + + + cmp %%COUNTER, 8 + jl %%_byte_loop_2 + pinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exists + je %%_done + + sub %%COUNTER, 8 + +%%_byte_loop_1: ;Read in data 1 byte at a time while data is left + shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_1 + pinsrq %%OUTPUT, %%TMP1, 1 + jmp %%_done + +%%_byte_loop_2: ;Read in data 1 byte at a time while data is left + cmp %%COUNTER, 0 + je %%_done + shl %%TMP1, 8 ;This loop handles when no bytes were already read in + dec %%END_READ_LOCATION + mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION] + dec %%COUNTER + jg %%_byte_loop_2 + pinsrq %%OUTPUT, %%TMP1, 0 +%%_done: + +%endmacro ; READ_SMALL_DATA_INPUT + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 14 +%define %%A_IN %1 +%define %%A_LEN %2 +%define %%AAD_HASH %3 +%define %%HASH_KEY %4 +%define %%XTMP1 %5 ; xmm temp reg 5 +%define %%XTMP2 %6 +%define %%XTMP3 %7 +%define %%XTMP4 %8 +%define %%XTMP5 %9 ; xmm temp reg 5 +%define %%T1 %10 ; temp reg 1 +%define %%T2 %11 +%define %%T3 %12 +%define %%T4 %13 +%define %%T5 %14 ; temp reg 5 + + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + pxor %%AAD_HASH, %%AAD_HASH + + cmp %%T2, 16 + jl %%_get_small_AAD_block + +%%_get_AAD_loop16: + + movdqu %%XTMP1, [%%T1] + ;byte-reflect the AAD data + pshufb %%XTMP1, [SHUF_MASK] + pxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + + sub %%T2, 16 + je %%_CALC_AAD_done + + add %%T1, 16 + cmp %%T2, 16 + jge %%_get_AAD_loop16 + +%%_get_small_AAD_block: + READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5 + ;byte-reflect the AAD data + pshufb %%XTMP1, [SHUF_MASK] + pxor %%AAD_HASH, %%XTMP1 + GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 + +%%_CALC_AAD_done: + +%endmacro ; CALC_AAD_HASH + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. +; Requires the input data be at least 1 byte long. +; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET), +; and whether encoding or decoding (ENC_DEC). +; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro PARTIAL_BLOCK 8 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%DATA_OFFSET %6 +%define %%AAD_HASH %7 +%define %%ENC_DEC %8 + mov r13, [%%GDATA_CTX + PBlockLen] + cmp r13, 0 + je %%_partial_block_done ;Leave Macro if no partial blocks + + cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading + jl %%_fewer_than_16_bytes + XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register + jmp %%_data_read + +%%_fewer_than_16_bytes: + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15 + mov r13, [%%GDATA_CTX + PBlockLen] + +%%_data_read: ;Finished reading in data + + + movdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = ctx_data.partial_block_enc_key + movdqu xmm13, [%%GDATA_KEY + HashKey] + + lea r12, [SHIFT_MASK] + + add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16) + movdqu xmm2, [r12] ; get the appropriate shuffle mask + pshufb xmm9, xmm2 ;shift right r13 bytes + +%ifidn %%ENC_DEC, DEC + movdqa xmm3, xmm1 + pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_1 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_1: + + movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + pand xmm3, xmm1 + pshufb xmm3, [SHUF_MASK] + pshufb xmm3, xmm2 + pxor %%AAD_HASH, xmm3 + + + cmp r15,0 + jl %%_partial_incomplete_1 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_dec_done +%%_partial_incomplete_1: +%ifidn __OUTPUT_FORMAT__, win64 + mov rax, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], rax +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif +%%_dec_done: + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + +%else + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + + mov r15, %%PLAIN_CYPH_LEN + add r15, r13 + sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block + jge %%_no_extra_mask_2 ;Determine if if partial block is not being filled and shift mask accordingly + sub r12, r15 +%%_no_extra_mask_2: + + movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9 + + pshufb xmm9, [SHUF_MASK] + pshufb xmm9, xmm2 + pxor %%AAD_HASH, xmm9 + + cmp r15,0 + jl %%_partial_incomplete_2 + + GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + xor rax,rax + mov [%%GDATA_CTX + PBlockLen], rax + jmp %%_encode_done +%%_partial_incomplete_2: +%ifidn __OUTPUT_FORMAT__, win64 + mov rax, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + PBlockLen], rax +%else + add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN +%endif +%%_encode_done: + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH + + pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + pshufb xmm9, xmm2 +%endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output encrypted Bytes + cmp r15,0 + jl %%_partial_fill + mov r12, r13 + mov r13, 16 + sub r13, r12 ; Set r13 to be the number of bytes to write out + jmp %%_count_set +%%_partial_fill: + mov r13, %%PLAIN_CYPH_LEN +%%_count_set: + movq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + psrldq xmm9, 8 + movq rax, xmm9 + sub r13, 8 +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%%_partial_block_done: +%endmacro ; PARTIAL_BLOCK + + +; if a = number of total plaintext bytes +; b = floor(a/16) +; %%num_initial_blocks = b mod 8; +; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext +; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified +; Updated AAD_HASH is returned in %%T3 + +%macro INITIAL_BLOCKS 24 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%LENGTH %5 +%define %%DATA_OFFSET %6 +%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7 +%define %%T1 %8 +%define %%HASH_KEY %9 +%define %%T3 %10 +%define %%T4 %11 +%define %%T5 %12 +%define %%CTR %13 +%define %%XMM1 %14 +%define %%XMM2 %15 +%define %%XMM3 %16 +%define %%XMM4 %17 +%define %%XMM5 %18 +%define %%XMM6 %19 +%define %%XMM7 %20 +%define %%XMM8 %21 +%define %%T6 %22 +%define %%T_key %23 +%define %%ENC_DEC %24 + +%assign i (8-%%num_initial_blocks) + movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg + + ; start AES for %%num_initial_blocks blocks + movdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0 + + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + paddd %%CTR, [ONE] ; INCR Y0 + movdqa reg(i), %%CTR + pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap +%assign i (i+1) +%endrep + +movdqu %%T_key, [%%GDATA_KEY+16*0] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + pxor reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j 1 +%rep NROUNDS ; encrypt N blocks with 13 key rounds (11 for GCM192) +movdqu %%T_key, [%%GDATA_KEY+16*j] +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + aesenc reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign j (j+1) +%endrep + + +movdqu %%T_key, [%%GDATA_KEY+16*j] ; encrypt with last (14th) key round (12 for GCM192) +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + aesenclast reg(i),%%T_key +%assign i (i+1) +%endrep + +%assign i (9-%%num_initial_blocks) +%rep %%num_initial_blocks + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + pxor reg(i), %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks + add %%DATA_OFFSET, 16 + %ifidn %%ENC_DEC, DEC + movdqa reg(i), %%T1 + %endif + pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations +%assign i (i+1) +%endrep + + +%assign i (8-%%num_initial_blocks) +%assign j (9-%%num_initial_blocks) + +%rep %%num_initial_blocks + pxor reg(j), reg(i) + GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks +%assign i (i+1) +%assign j (j+1) +%endrep + ; %%XMM8 has the current Hash Value + movdqa %%T3, %%XMM8 + + cmp %%LENGTH, 128 + jl %%_initial_blocks_done ; no need for precomputed constants + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM1, %%CTR + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM2, %%CTR + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM3, %%CTR + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM4, %%CTR + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM5, %%CTR + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM6, %%CTR + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM7, %%CTR + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + + paddd %%CTR, [ONE] ; INCR Y0 + movdqa %%XMM8, %%CTR + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + movdqu %%T_key, [%%GDATA_KEY+16*0] + pxor %%XMM1, %%T_key + pxor %%XMM2, %%T_key + pxor %%XMM3, %%T_key + pxor %%XMM4, %%T_key + pxor %%XMM5, %%T_key + pxor %%XMM6, %%T_key + pxor %%XMM7, %%T_key + pxor %%XMM8, %%T_key + + +%assign i 1 +%rep NROUNDS ; do early (13) rounds (11 for GCM192) + movdqu %%T_key, [%%GDATA_KEY+16*i] + aesenc %%XMM1, %%T_key + aesenc %%XMM2, %%T_key + aesenc %%XMM3, %%T_key + aesenc %%XMM4, %%T_key + aesenc %%XMM5, %%T_key + aesenc %%XMM6, %%T_key + aesenc %%XMM7, %%T_key + aesenc %%XMM8, %%T_key +%assign i (i+1) +%endrep + + + movdqu %%T_key, [%%GDATA_KEY+16*i] ; do final key round + aesenclast %%XMM1, %%T_key + aesenclast %%XMM2, %%T_key + aesenclast %%XMM3, %%T_key + aesenclast %%XMM4, %%T_key + aesenclast %%XMM5, %%T_key + aesenclast %%XMM6, %%T_key + aesenclast %%XMM7, %%T_key + aesenclast %%XMM8, %%T_key + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0] + pxor %%XMM1, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM1, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1] + pxor %%XMM2, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM2, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2] + pxor %%XMM3, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM3, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3] + pxor %%XMM4, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM4, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4] + pxor %%XMM5, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM5, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5] + pxor %%XMM6, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM6, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6] + pxor %%XMM7, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM7, %%T1 + %endif + + XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7] + pxor %%XMM8, %%T1 + XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8 + %ifidn %%ENC_DEC, DEC + movdqa %%XMM8, %%T1 + %endif + + add %%DATA_OFFSET, 128 + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_initial_blocks_done: + + +%endmacro + + + +; encrypt 8 blocks at a time +; ghash the 8 previously encrypted ciphertext blocks +; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified +; %%DATA_OFFSET is the data offset value +%macro GHASH_8_ENCRYPT_8_PARALLEL 22 +%define %%GDATA %1 +%define %%CYPH_PLAIN_OUT %2 +%define %%PLAIN_CYPH_IN %3 +%define %%DATA_OFFSET %4 +%define %%T1 %5 +%define %%T2 %6 +%define %%T3 %7 +%define %%T4 %8 +%define %%T5 %9 +%define %%T6 %10 +%define %%CTR %11 +%define %%XMM1 %12 +%define %%XMM2 %13 +%define %%XMM3 %14 +%define %%XMM4 %15 +%define %%XMM5 %16 +%define %%XMM6 %17 +%define %%XMM7 %18 +%define %%XMM8 %19 +%define %%T7 %20 +%define %%loop_idx %21 +%define %%ENC_DEC %22 + + movdqa %%T7, %%XMM1 + movdqu [rsp + TMP2], %%XMM2 + movdqu [rsp + TMP3], %%XMM3 + movdqu [rsp + TMP4], %%XMM4 + movdqu [rsp + TMP5], %%XMM5 + movdqu [rsp + TMP6], %%XMM6 + movdqu [rsp + TMP7], %%XMM7 + movdqu [rsp + TMP8], %%XMM8 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + + movdqa %%T4, %%T7 + pshufd %%T6, %%T7, 01001110b + pxor %%T6, %%T7 + %ifidn %%loop_idx, in_order + paddd %%CTR, [ONE] ; INCR CNT + %else + paddd %%CTR, [ONEf] ; INCR CNT + %endif + movdqu %%T5, [%%GDATA + HashKey_8] + pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_8_k] + pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + movdqa %%XMM1, %%CTR + + %ifidn %%loop_idx, in_order + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM2, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM3, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM4, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM5, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM6, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM7, %%CTR + + paddd %%CTR, [ONE] ; INCR CNT + movdqa %%XMM8, %%CTR + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + %else + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM2, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM3, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM4, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM5, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM6, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM7, %%CTR + + paddd %%CTR, [ONEf] ; INCR CNT + movdqa %%XMM8, %%CTR + %endif + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + movdqu %%T1, [%%GDATA + 16*0] + pxor %%XMM1, %%T1 + pxor %%XMM2, %%T1 + pxor %%XMM3, %%T1 + pxor %%XMM4, %%T1 + pxor %%XMM5, %%T1 + pxor %%XMM6, %%T1 + pxor %%XMM7, %%T1 + pxor %%XMM8, %%T1 + + ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + movdqu %%T1, [rsp + TMP2] + movdqa %%T3, %%T1 + + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_7] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_7_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*1] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + + movdqu %%T1, [%%GDATA + 16*2] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; Karatsuba Method + movdqu %%T1, [rsp + TMP3] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_6] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_6_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*3] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP4] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_5] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_5_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*4] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*5] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP5] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_4] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_4_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + + movdqu %%T1, [%%GDATA + 16*6] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + movdqu %%T1, [rsp + TMP6] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_3] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_3_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*7] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [rsp + TMP7] + movdqa %%T3, %%T1 + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey_2] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_2_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part + pxor %%T7, %%T3 + pxor %%T6, %%T2 + + movdqu %%T1, [%%GDATA + 16*8] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + + ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Karatsuba Method + movdqu %%T1, [rsp + TMP8] + movdqa %%T3, %%T1 + + pshufd %%T2, %%T3, 01001110b + pxor %%T2, %%T3 + movdqu %%T5, [%%GDATA + HashKey] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0 + movdqu %%T5, [%%GDATA + HashKey_k] + pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + pxor %%T7, %%T3 + pxor %%T4, %%T1 + + movdqu %%T1, [%%GDATA + 16*9] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + +%ifdef GCM128_MODE + movdqu %%T5, [%%GDATA + 16*10] +%endif +%ifdef GCM192_MODE + movdqu %%T1, [%%GDATA + 16*10] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*11] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T5, [%%GDATA + 16*12] ; finish last key round +%endif +%ifdef GCM256_MODE + movdqu %%T1, [%%GDATA + 16*10] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*11] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*12] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T1, [%%GDATA + 16*13] + aesenc %%XMM1, %%T1 + aesenc %%XMM2, %%T1 + aesenc %%XMM3, %%T1 + aesenc %%XMM4, %%T1 + aesenc %%XMM5, %%T1 + aesenc %%XMM6, %%T1 + aesenc %%XMM7, %%T1 + aesenc %%XMM8, %%T1 + + movdqu %%T5, [%%GDATA + 16*14] ; finish last key round +%endif + +%assign i 0 +%assign j 1 +%rep 8 + XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i] + +%ifidn %%ENC_DEC, DEC + movdqa %%T3, %%T1 +%endif + + pxor %%T1, %%T5 + aesenclast reg(j), %%T1 ; XMM1:XMM8 + XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer + +%ifidn %%ENC_DEC, DEC + movdqa reg(j), %%T3 +%endif +%assign i (i+1) +%assign j (j+1) +%endrep + + + + + pxor %%T2, %%T6 + pxor %%T2, %%T4 + pxor %%T2, %%T7 + + + movdqa %%T3, %%T2 + pslldq %%T3, 8 ; shift-L %%T3 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%T7, %%T3 + pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7 + + + + ;first phase of the reduction + movdqa %%T2, %%T7 + movdqa %%T3, %%T7 + movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T1, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T1 + + movdqa %%T5, %%T2 + psrldq %%T5, 4 ; shift-R %%T5 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap + pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap + + ;second phase of the reduction + movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T1) for doing three shift operations + movdqa %%T3,%%T7 + movdqa %%T1,%%T7 + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T1,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T1 + + pxor %%T2, %%T5 + pxor %%T7, %%T2 + pxor %%T7, %%T4 ; the result is in %%T4 + + + pxor %%XMM1, %%T7 + +%endmacro + + +; GHASH the last 4 ciphertext blocks. +%macro GHASH_LAST_8 16 +%define %%GDATA %1 +%define %%T1 %2 +%define %%T2 %3 +%define %%T3 %4 +%define %%T4 %5 +%define %%T5 %6 +%define %%T6 %7 +%define %%T7 %8 +%define %%XMM1 %9 +%define %%XMM2 %10 +%define %%XMM3 %11 +%define %%XMM4 %12 +%define %%XMM5 %13 +%define %%XMM6 %14 +%define %%XMM7 %15 +%define %%XMM8 %16 + + ; Karatsuba Method + movdqa %%T6, %%XMM1 + pshufd %%T2, %%XMM1, 01001110b + pxor %%T2, %%XMM1 + movdqu %%T5, [%%GDATA + HashKey_8] + pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1 + + pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_8_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + movdqa %%T7, %%XMM1 + movdqa %%XMM1, %%T2 ; result in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM2 + pshufd %%T2, %%XMM2, 01001110b + pxor %%T2, %%XMM2 + movdqu %%T5, [%%GDATA + HashKey_7] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_7_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM2 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM3 + pshufd %%T2, %%XMM3, 01001110b + pxor %%T2, %%XMM3 + movdqu %%T5, [%%GDATA + HashKey_6] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_6_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM3 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM4 + pshufd %%T2, %%XMM4, 01001110b + pxor %%T2, %%XMM4 + movdqu %%T5, [%%GDATA + HashKey_5] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM4, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_5_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM4 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM5 + pshufd %%T2, %%XMM5, 01001110b + pxor %%T2, %%XMM5 + movdqu %%T5, [%%GDATA + HashKey_4] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM5, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_4_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM5 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM6 + pshufd %%T2, %%XMM6, 01001110b + pxor %%T2, %%XMM6 + movdqu %%T5, [%%GDATA + HashKey_3] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM6, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_3_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM6 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + ; Karatsuba Method + movdqa %%T1, %%XMM7 + pshufd %%T2, %%XMM7, 01001110b + pxor %%T2, %%XMM7 + movdqu %%T5, [%%GDATA + HashKey_2] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM7, %%T5, 0x00 ; %%XMM3 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_2_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM7 + pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1 + + + ; Karatsuba Method + movdqa %%T1, %%XMM8 + pshufd %%T2, %%XMM8, 01001110b + pxor %%T2, %%XMM8 + movdqu %%T5, [%%GDATA + HashKey] + pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1 + + pclmulqdq %%XMM8, %%T5, 0x00 ; %%XMM4 = a0*b0 + movdqu %%T4, [%%GDATA + HashKey_k] + pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0) + + pxor %%T6, %%T1 + pxor %%T7, %%XMM8 + pxor %%T2, %%XMM1 + pxor %%T2, %%T6 + pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm + + + movdqa %%T4, %%T2 + pslldq %%T4, 8 ; shift-L %%T4 2 DWs + psrldq %%T2, 8 ; shift-R %%T2 2 DWs + pxor %%T7, %%T4 + pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications + + + ;first phase of the reduction + movdqa %%T2, %%T7 + movdqa %%T3, %%T7 + movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently + + pslld %%T2, 31 ; packed right shifting << 31 + pslld %%T3, 30 ; packed right shifting shift << 30 + pslld %%T4, 25 ; packed right shifting shift << 25 + pxor %%T2, %%T3 ; xor the shifted versions + pxor %%T2, %%T4 + + movdqa %%T1, %%T2 + psrldq %%T1, 4 ; shift-R %%T1 1 DW + + pslldq %%T2, 12 ; shift-L %%T2 3 DWs + pxor %%T7, %%T2 ; first phase of the reduction complete + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + ;second phase of the reduction + movdqa %%T2,%%T7 ; make 3 copies of %%T7 (in in %%T2, %%T3, %%T4) for doing three shift operations + movdqa %%T3,%%T7 + movdqa %%T4,%%T7 + + psrld %%T2,1 ; packed left shifting >> 1 + psrld %%T3,2 ; packed left shifting >> 2 + psrld %%T4,7 ; packed left shifting >> 7 + pxor %%T2,%%T3 ; xor the shifted versions + pxor %%T2,%%T4 + + pxor %%T2, %%T1 + pxor %%T7, %%T2 + pxor %%T6, %%T7 ; the result is in %%T6 + +%endmacro + +; Encryption of a single block +%macro ENCRYPT_SINGLE_BLOCK 3 +%define %%GDATA %1 +%define %%ST %2 +%define %%T1 %3 + movdqu %%T1, [%%GDATA+16*0] + pxor %%ST, %%T1 +%assign i 1 +%rep NROUNDS + movdqu %%T1, [%%GDATA+16*i] + aesenc %%ST, %%T1 +%assign i (i+1) +%endrep + movdqu %%T1, [%%GDATA+16*i] + aesenclast %%ST, %%T1 +%endmacro + + +;; Start of Stack Setup + +%macro FUNC_SAVE 0 + ;; Required for Update/GCM_ENC + ;the number of pushes must equal STACK_OFFSET + push r12 + push r13 + push r14 + push r15 + mov r14, rsp + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 + movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7 + movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8 + movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9 + movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10 + movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11 + movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12 + movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13 + movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14 + movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15 +%endif +%endmacro + + +%macro FUNC_RESTORE 0 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16] + movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16] + movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16] + movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16] + movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16] + movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16] + movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16] + movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16] + movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16] + movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16] +%endif + +;; Required for Update/GCM_ENC + mov rsp, r14 + pop r15 + pop r14 + pop r13 + pop r12 +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. +; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, +; Additional Authentication data (A_IN), Additional Data length (A_LEN). +; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA. +; Clobbers rax, r10-r13 and xmm0-xmm6 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_INIT 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%IV %3 +%define %%A_IN %4 +%define %%A_LEN %5 +%define %%AAD_HASH xmm0 +%define %%SUBHASH xmm1 + + + movdqu %%SUBHASH, [%%GDATA_KEY + HashKey] + + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax + pxor xmm2, xmm3 + mov r10, %%A_LEN + + movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash + mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length + xor r10, r10 + mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0 + mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0 + movdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0 + mov r10, %%IV + movdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 + pinsrq xmm2, [r10], 0 + pinsrd xmm2, [r10+8], 2 + movdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv + + pshufb xmm2, [SHUF_MASK] + + movdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data +; struct has been initialized by GCM_INIT. +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN), +; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC) +; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and xmm0-xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC 6 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%CYPH_PLAIN_OUT %3 +%define %%PLAIN_CYPH_IN %4 +%define %%PLAIN_CYPH_LEN %5 +%define %%ENC_DEC %6 +%define %%DATA_OFFSET r11 + +; Macro flow: +; calculate the number of 16byte blocks in the message +; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted' +; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left' +; if there is a block of less tahn 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes' + + cmp %%PLAIN_CYPH_LEN, 0 + je %%_multiple_of_16_bytes + + xor %%DATA_OFFSET, %%DATA_OFFSET +%ifidn __OUTPUT_FORMAT__, win64 + mov r12, %%PLAIN_CYPH_LEN + add [%%GDATA_CTX + InLen], r12 ;Update length of data processed +%else + add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed +%endif + movdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey + movdqu xmm8, [%%GDATA_CTX + AadHash] + + + PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC + + mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext + sub r13, %%DATA_OFFSET + mov r10, r13 ;save the amount of data left to process in r10 + and r13, -16 ; r13 = r13 - (r13 mod 16) + + mov r12, r13 + shr r12, 4 + and r12, 7 + jz %%_initial_num_blocks_is_0 + + cmp r12, 7 + je %%_initial_num_blocks_is_7 + cmp r12, 6 + je %%_initial_num_blocks_is_6 + cmp r12, 5 + je %%_initial_num_blocks_is_5 + cmp r12, 4 + je %%_initial_num_blocks_is_4 + cmp r12, 3 + je %%_initial_num_blocks_is_3 + cmp r12, 2 + je %%_initial_num_blocks_is_2 + + jmp %%_initial_num_blocks_is_1 + +%%_initial_num_blocks_is_7: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*7 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_6: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*6 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_5: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*5 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_4: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*4 + jmp %%_initial_blocks_encrypted + + +%%_initial_num_blocks_is_3: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*3 + jmp %%_initial_blocks_encrypted +%%_initial_num_blocks_is_2: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16*2 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_1: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + sub r13, 16 + jmp %%_initial_blocks_encrypted + +%%_initial_num_blocks_is_0: + INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC + + +%%_initial_blocks_encrypted: + cmp r13, 0 + je %%_zero_cipher_left + + sub r13, 128 + je %%_eight_cipher_left + + + + + movd r15d, xmm9 + and r15d, 255 + pshufb xmm9, [SHUF_MASK] + + +%%_encrypt_by_8_new: + cmp r15d, 255-8 + jg %%_encrypt_by_8 + + + + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + pshufb xmm9, [SHUF_MASK] + jmp %%_eight_cipher_left + +%%_encrypt_by_8: + pshufb xmm9, [SHUF_MASK] + add r15b, 8 + GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC + pshufb xmm9, [SHUF_MASK] + add %%DATA_OFFSET, 128 + sub r13, 128 + jne %%_encrypt_by_8_new + + pshufb xmm9, [SHUF_MASK] + + + + +%%_eight_cipher_left: + GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 + + +%%_zero_cipher_left: + movdqu [%%GDATA_CTX + AadHash], xmm14 + movdqu [%%GDATA_CTX + CurCount], xmm9 + + mov r13, r10 + and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16) + + je %%_multiple_of_16_bytes + + mov [%%GDATA_CTX + PBlockLen], r13 ; my_ctx.data.partial_blck_length = r13 + ; handle the last <16 Byte block seperately + + paddd xmm9, [ONE] ; INCR CNT to get Yn + movdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx.data.current_counter = xmm9 + pshufb xmm9, [SHUF_MASK] + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Yn) + movdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9 + + cmp %%PLAIN_CYPH_LEN, 16 + jge %%_large_enough_update + + lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET] + READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax + lea r12, [SHIFT_MASK + 16] + sub r12, r13 + jmp %%_data_read + +%%_large_enough_update: + sub %%DATA_OFFSET, 16 + add %%DATA_OFFSET, r13 + + movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block + + sub %%DATA_OFFSET, r13 + add %%DATA_OFFSET, 16 + + lea r12, [SHIFT_MASK + 16] + sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16) + movdqu xmm2, [r12] ; get the appropriate shuffle mask + pshufb xmm1, xmm2 ; shift right 16-r13 bytes +%%_data_read: + %ifidn %%ENC_DEC, DEC + movdqa xmm2, xmm1 + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + pand xmm2, xmm1 + pshufb xmm2, [SHUF_MASK] + pxor xmm14, xmm2 + movdqu [%%GDATA_CTX + AadHash], xmm14 + + %else + pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn) + movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9 + pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9 + pshufb xmm9, [SHUF_MASK] + pxor xmm14, xmm9 + movdqu [%%GDATA_CTX + AadHash], xmm14 + + pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext + %endif + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ; output r13 Bytes + movq rax, xmm9 + cmp r13, 8 + jle %%_less_than_8_bytes_left + + mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax + add %%DATA_OFFSET, 8 + psrldq xmm9, 8 + movq rax, xmm9 + sub r13, 8 + +%%_less_than_8_bytes_left: + mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al + add %%DATA_OFFSET, 1 + shr rax, 8 + sub r13, 1 + jne %%_less_than_8_bytes_left + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%%_multiple_of_16_bytes: + +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_COMPLETE Finishes Encyrption/Decryption of last partial block after GCM_UPDATE finishes. +; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and +; whether encoding or decoding (ENC_DEC). +; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN) +; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_COMPLETE 5 +%define %%GDATA_KEY %1 +%define %%GDATA_CTX %2 +%define %%AUTH_TAG %3 +%define %%AUTH_TAG_LEN %4 +%define %%ENC_DEC %5 +%define %%PLAIN_CYPH_LEN rax + + mov r12, [%%GDATA_CTX + PBlockLen] ; r12 = aadLen (number of bytes) + movdqu xmm14, [%%GDATA_CTX + AadHash] + movdqu xmm13, [%%GDATA_KEY + HashKey] + + cmp r12, 0 + + je %%_partial_done + + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block + movdqu [%%GDATA_CTX + AadHash], xmm14 + +%%_partial_done: + + mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes) + mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen] + + shl r12, 3 ; convert into number of bits + movd xmm15, r12d ; len(A) in xmm15 + + shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128) + movq xmm1, %%PLAIN_CYPH_LEN + pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000 + pxor xmm15, xmm1 ; xmm15 = len(A)||len(C) + + pxor xmm14, xmm15 + GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation + pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap + + movdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Y0) + + pxor xmm9, xmm14 + + + +%%_return_T: + mov r10, %%AUTH_TAG ; r10 = authTag + mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + cmp r11, 16 + je %%_T_16 + + cmp r11, 12 + je %%_T_12 + +%%_T_8: + movq rax, xmm9 + mov [r10], rax + jmp %%_return_T_done +%%_T_12: + movq rax, xmm9 + mov [r10], rax + psrldq xmm9, 8 + movd eax, xmm9 + mov [r10 + 8], eax + jmp %%_return_T_done + +%%_T_16: + movdqu [r10], xmm9 + +%%_return_T_done: +%endmacro ;GCM_COMPLETE + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse +; (struct gcm_key_data *key_data); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(precomp,_),function,) +FN_NAME(precomp,_): + + push r12 + push r13 + push r14 + push r15 + + mov r14, rsp + + + + sub rsp, VARIABLE_OFFSET + and rsp, ~63 ; align rsp to 64 bytes + +%ifidn __OUTPUT_FORMAT__, win64 + ; only xmm6 needs to be maintained + movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6 +%endif + + pxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey + + pshufb xmm6, [SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + movdqa xmm2, xmm6 + psllq xmm6, 1 + psrlq xmm2, 63 + movdqa xmm1, xmm2 + pslldq xmm2, 8 + psrldq xmm1, 8 + por xmm6, xmm2 + ;reduction + pshufd xmm2, xmm1, 00100100b + pcmpeqd xmm2, [TWOONE] + pand xmm2, [POLY] + pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16] +%endif + mov rsp, r14 + + pop r15 + pop r14 + pop r13 + pop r12 +ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse ( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(init,_),function,) +FN_NAME(init,_): + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + push r14 + push r15 + mov r14, rsp + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 1*16 + movdqu [rsp + 0*16], xmm6 +%endif + + GCM_INIT arg1, arg2, arg3, arg4, arg5 + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm6 , [rsp + 0*16] + mov rsp, r14 + pop r15 + pop r14 +%endif + pop r13 + pop r12 + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_update_),function,) +FN_NAME(enc,_update_): + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_256_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_update_),function,) +FN_NAME(dec,_update_): + + FUNC_SAVE + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + FUNC_RESTORE + + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_finalize_),function,) +FN_NAME(enc,_finalize_): + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + movdqu [rsp + 0*16],xmm6 + movdqu [rsp + 1*16],xmm9 + movdqu [rsp + 2*16],xmm11 + movdqu [rsp + 3*16],xmm14 + movdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, ENC + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + 4*16] + movdqu xmm14 , [rsp+ 3*16] + movdqu xmm11 , [rsp + 2*16] + movdqu xmm9 , [rsp + 1*16] + movdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_finalize_),function,) +FN_NAME(dec,_finalize_): + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 5*16 + movdqu [rsp + 0*16],xmm6 + movdqu [rsp + 1*16],xmm9 + movdqu [rsp + 2*16],xmm11 + movdqu [rsp + 3*16],xmm14 + movdqu [rsp + 4*16],xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, DEC + +%ifidn __OUTPUT_FORMAT__, win64 + movdqu xmm15 , [rsp + 4*16] + movdqu xmm14 , [rsp+ 3*16] + movdqu xmm11 , [rsp + 2*16] + movdqu xmm9 , [rsp + 1*16] + movdqu xmm6 , [rsp + 0*16] + add rsp, 5*16 +%endif + + pop r12 + ret + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(enc,_),function,) +FN_NAME(enc,_): + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC + + GCM_COMPLETE arg1, arg2, arg9, arg10, ENC + + FUNC_RESTORE + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 plaintext_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +MKGLOBAL(FN_NAME(dec,_),function,) +FN_NAME(dec,_): + + FUNC_SAVE + + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC + + GCM_COMPLETE arg1, arg2, arg9, arg10, DEC + + FUNC_RESTORE + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm new file mode 100644 index 00000000..fdc5bbb1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_flush_sse.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define AES_CBC_ENC_X4 aes_cbc_enc_192_x4 +%define FLUSH_JOB_AES_ENC flush_job_aes192_enc_sse +%include "mb_mgr_aes_flush_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm new file mode 100644 index 00000000..9b8ed393 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes192_submit_sse.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define AES_CBC_ENC_X4 aes_cbc_enc_192_x4 +%define SUBMIT_JOB_AES_ENC submit_job_aes192_enc_sse +%include "mb_mgr_aes_submit_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm new file mode 100644 index 00000000..44e2ed12 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_flush_sse.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define AES_CBC_ENC_X4 aes_cbc_enc_256_x4 +%define FLUSH_JOB_AES_ENC flush_job_aes256_enc_sse +%include "mb_mgr_aes_flush_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm new file mode 100644 index 00000000..615d13ee --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes256_submit_sse.asm @@ -0,0 +1,30 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define AES_CBC_ENC_X4 aes_cbc_enc_256_x4 +%define SUBMIT_JOB_AES_ENC submit_job_aes256_enc_sse +%include "mb_mgr_aes_submit_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm new file mode 100644 index 00000000..db216e9f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_flush_sse.asm @@ -0,0 +1,210 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%ifndef AES_CBC_ENC_X4 +%define AES_CBC_ENC_X4 aes_cbc_enc_128_x4 +%define FLUSH_JOB_AES_ENC flush_job_aes128_enc_sse +%endif + +; void AES_CBC_ENC_X4(AES_ARGS_x8 *args, UINT64 len_in_bytes); +extern AES_CBC_ENC_X4 + +section .data +default rel + +align 16 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%define APPEND(a,b) a %+ b + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +%define unused_lanes rbx +%define tmp1 rbx + +%define good_lane rdx +%define iv rdx + +%define tmp2 rax + +; idx needs to be in rbp +%define tmp rbp +%define idx rbp + +%define tmp3 r8 +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* FLUSH_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(FLUSH_JOB_AES_ENC,function,internal) +FLUSH_JOB_AES_ENC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + ; check for empty + mov unused_lanes, [state + _aes_unused_lanes] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor good_lane, good_lane + cmp qword [state + _aes_job_in_lane + 1*8], 0 + cmovne good_lane, [rel one] + cmp qword [state + _aes_job_in_lane + 2*8], 0 + cmovne good_lane, [rel two] + cmp qword [state + _aes_job_in_lane + 3*8], 0 + cmovne good_lane, [rel three] + + ; copy good_lane to empty lanes + mov tmp1, [state + _aes_args_in + good_lane*8] + mov tmp2, [state + _aes_args_out + good_lane*8] + mov tmp3, [state + _aes_args_keys + good_lane*8] + shl good_lane, 4 ; multiply by 16 + movdqa xmm2, [state + _aes_args_IV + good_lane] + movdqa xmm0, [state + _aes_lens] + +%assign I 0 +%rep 4 + cmp qword [state + _aes_job_in_lane + I*8], 0 + jne APPEND(skip_,I) + mov [state + _aes_args_in + I*8], tmp1 + mov [state + _aes_args_out + I*8], tmp2 + mov [state + _aes_args_keys + I*8], tmp3 + movdqa [state + _aes_args_IV + I*16], xmm2 + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + ; Find min length + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _aes_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_CBC_ENC_X4 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + mov job_rax, [state + _aes_job_in_lane + idx*8] +; Don't write back IV +; mov iv, [job_rax + _iv] + mov unused_lanes, [state + _aes_unused_lanes] + mov qword [state + _aes_job_in_lane + idx*8], 0 + or dword [job_rax + _status], STS_COMPLETED_AES + shl unused_lanes, 8 + or unused_lanes, idx +; shl idx, 4 ; multiply by 16 + mov [state + _aes_unused_lanes], unused_lanes +; movdqa xmm0, [state + _aes_args_IV + idx] +; movdqu [iv], xmm0 + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm new file mode 100644 index 00000000..ddbd03fd --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_submit_sse.asm @@ -0,0 +1,179 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%ifndef AES_CBC_ENC_X4 +%define AES_CBC_ENC_X4 aes_cbc_enc_128_x4 +%define SUBMIT_JOB_AES_ENC submit_job_aes128_enc_sse +%endif + +; void AES_CBC_ENC_X4(AES_ARGS_x8 *args, UINT64 len_in_bytes); +extern AES_CBC_ENC_X4 + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +; idx needs to be in rbp +%define len rbp +%define idx rbp +%define tmp rbp + +%define lane r8 + +%define iv r9 + +%define unused_lanes rbx +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +section .text + +; JOB* SUBMIT_JOB_AES_ENC(MB_MGR_AES_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(SUBMIT_JOB_AES_ENC,function,internal) +SUBMIT_JOB_AES_ENC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _aes_unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + mov len, [job + _msg_len_to_cipher_in_bytes] + and len, -16 ; DOCSIS may pass size unaligned to block size + mov iv, [job + _iv] + mov [state + _aes_unused_lanes], unused_lanes + + mov [state + _aes_job_in_lane + lane*8], job + mov [state + _aes_lens + 2*lane], WORD(len) + + mov tmp, [job + _src] + add tmp, [job + _cipher_start_src_offset_in_bytes] + movdqu xmm0, [iv] + mov [state + _aes_args_in + lane*8], tmp + mov tmp, [job + _aes_enc_key_expanded] + mov [state + _aes_args_keys + lane*8], tmp + mov tmp, [job + _dst] + mov [state + _aes_args_out + lane*8], tmp + shl lane, 4 ; multiply by 16 + movdqa [state + _aes_args_IV + lane], xmm0 + + cmp unused_lanes, 0xff + jne return_null + + ; Find min length + movdqa xmm0, [state + _aes_lens] + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _aes_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_CBC_ENC_X4 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + mov job_rax, [state + _aes_job_in_lane + idx*8] +; Don't write back IV +; mov iv, [job_rax + _iv] + mov unused_lanes, [state + _aes_unused_lanes] + mov qword [state + _aes_job_in_lane + idx*8], 0 + or dword [job_rax + _status], STS_COMPLETED_AES + shl unused_lanes, 8 + or unused_lanes, idx +; shl idx, 4 ; multiply by 16 + mov [state + _aes_unused_lanes], unused_lanes +; movdqa xmm0, [state + _aes_args_IV + idx] +; movdqu [iv], xmm0 + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm new file mode 100644 index 00000000..18562745 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_flush_sse.asm @@ -0,0 +1,225 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" + +%ifndef AES_XCBC_X4 +%define AES_XCBC_X4 aes_xcbc_mac_128_x4 +%define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_sse +%endif + +; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); +extern AES_XCBC_X4 + +section .data +default rel + +align 16 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%define APPEND(a,b) a %+ b + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +%define unused_lanes rbx +%define tmp1 rbx + +%define icv rdx + +%define tmp2 rax + +; idx needs to be in rbp +%define tmp r10 +%define idx rbp + +%define tmp3 r8 +%define lane_data r9 +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* FLUSH_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(FLUSH_JOB_AES_XCBC,function,internal) +FLUSH_JOB_AES_XCBC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + ; check for empty + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _aes_xcbc_ldata + 1 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel one] + cmp qword [state + _aes_xcbc_ldata + 2 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel two] + cmp qword [state + _aes_xcbc_ldata + 3 * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + cmovne idx, [rel three] + +copy_lane_data: + ; copy idx to empty lanes + mov tmp1, [state + _aes_xcbc_args_in + idx*8] + mov tmp3, [state + _aes_xcbc_args_keys + idx*8] + shl idx, 4 ; multiply by 16 + movdqa xmm2, [state + _aes_xcbc_args_ICV + idx] + movdqa xmm0, [state + _aes_xcbc_lens] + +%assign I 0 +%rep 4 + cmp qword [state + _aes_xcbc_ldata + I * _XCBC_LANE_DATA_size + _xcbc_job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _aes_xcbc_args_in + I*8], tmp1 + mov [state + _aes_xcbc_args_keys + I*8], tmp3 + movdqa [state + _aes_xcbc_args_ICV + I*16], xmm2 + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + movdqa [state + _aes_xcbc_lens], xmm0 + + ; Find min length + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _aes_xcbc_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_XCBC_X4 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _XCBC_LANE_DATA_size + lea lane_data, [state + _aes_xcbc_ldata + lane_data] + cmp dword [lane_data + _xcbc_final_done], 0 + jne end_loop + + mov dword [lane_data + _xcbc_final_done], 1 + mov word [state + _aes_xcbc_lens + 2*idx], 16 + lea tmp, [lane_data + _xcbc_final_block] + mov [state + _aes_xcbc_args_in + 8*idx], tmp + jmp copy_lane_data + +end_loop: + mov job_rax, [lane_data + _xcbc_job_in_lane] + mov icv, [job_rax + _auth_tag_output] + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + mov qword [lane_data + _xcbc_job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + shl idx, 4 ; multiply by 16 + mov [state + _aes_xcbc_unused_lanes], unused_lanes + + ; copy 12 bytes + movdqa xmm0, [state + _aes_xcbc_args_ICV + idx] + movq [icv], xmm0 + pextrd [icv + 8], xmm0, 2 + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm new file mode 100644 index 00000000..fbf3d5de --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_aes_xcbc_submit_sse.asm @@ -0,0 +1,250 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" + +%include "reg_sizes.asm" +%include "memcpy.asm" +%ifndef AES_XCBC_X4 +%define AES_XCBC_X4 aes_xcbc_mac_128_x4 +%define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse +%endif + +; void AES_XCBC_X4(AES_XCBC_ARGS_x8 *args, UINT64 len_in_bytes); +extern AES_XCBC_X4 + +section .data +default rel + +align 16 +x80: ;ddq 0x00000000000000000000000000000080 + dq 0x0000000000000080, 0x0000000000000000 + +section .text + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +%define job_rax rax + +%if 1 +; idx needs to be in rbp +%define idx rbp +%define last_len rbp + +%define lane r8 + +%define icv r9 +%define p2 r9 + +%define tmp r10 +%define len r11 +%define lane_data r12 +%define p r13 +%define tmp2 r14 + +%define unused_lanes rbx +%endif + +; STACK_SPACE needs to be an odd multiple of 8 +; This routine and its callee clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* SUBMIT_JOB_AES_XCBC(MB_MGR_AES_XCBC_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(SUBMIT_JOB_AES_XCBC,function,internal) +SUBMIT_JOB_AES_XCBC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _XCBC_LANE_DATA_size + lea lane_data, [state + _aes_xcbc_ldata + lane_data] + mov [state + _aes_xcbc_unused_lanes], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov [lane_data + _xcbc_job_in_lane], job + mov dword [lane_data + _xcbc_final_done], 0 + mov tmp, [job + _k1_expanded] + mov [state + _aes_xcbc_args_keys + lane*8], tmp + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + + mov last_len, len + + cmp len, 16 + jle small_buffer + + mov [state + _aes_xcbc_args_in + lane*8], p + add p, len ; set point to end of data + + and last_len, 15 ; Check lsbs of msg len + jnz slow_copy ; if not 16B mult, do slow copy + +fast_copy: + movdqu xmm0, [p - 16] ; load last block M[n] + mov tmp, [job + _k2] ; load K2 address + movdqu xmm1, [tmp] ; load K2 + pxor xmm0, xmm1 ; M[n] XOR K2 + movdqa [lane_data + _xcbc_final_block], xmm0 + sub len, 16 ; take last block off length +end_fast_copy: + + mov [state + _aes_xcbc_lens + 2*lane], WORD(len) + + pxor xmm0, xmm0 + shl lane, 4 ; multiply by 16 + movdqa [state + _aes_xcbc_args_ICV + lane], xmm0 + + cmp unused_lanes, 0xff + jne return_null + +start_loop: + ; Find min length + movdqa xmm0, [state + _aes_xcbc_lens] + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _aes_xcbc_lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call AES_XCBC_X4 + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _XCBC_LANE_DATA_size + lea lane_data, [state + _aes_xcbc_ldata + lane_data] + cmp dword [lane_data + _xcbc_final_done], 0 + jne end_loop + + mov dword [lane_data + _xcbc_final_done], 1 + mov word [state + _aes_xcbc_lens + 2*idx], 16 + lea tmp, [lane_data + _xcbc_final_block] + mov [state + _aes_xcbc_args_in + 8*idx], tmp + jmp start_loop + +end_loop: + ; process completed job "idx" + mov job_rax, [lane_data + _xcbc_job_in_lane] + mov icv, [job_rax + _auth_tag_output] + mov unused_lanes, [state + _aes_xcbc_unused_lanes] + mov qword [lane_data + _xcbc_job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + shl idx, 4 ; multiply by 16 + mov [state + _aes_xcbc_unused_lanes], unused_lanes + + ; copy 12 bytes + movdqa xmm0, [state + _aes_xcbc_args_ICV + idx] + movq [icv], xmm0 + pextrd [icv + 8], xmm0, 2 + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +small_buffer: + ; For buffers <= 16 Bytes + ; The input data is set to final block + lea tmp, [lane_data + _xcbc_final_block] ; final block + mov [state + _aes_xcbc_args_in + lane*8], tmp + add p, len ; set point to end of data + cmp len, 16 + je fast_copy + +slow_copy: + and len, ~15 ; take final block off len + sub p, last_len ; adjust data pointer + lea p2, [lane_data + _xcbc_final_block + 16] ; upper part of final + sub p2, last_len ; adjust data pointer backwards + memcpy_sse_16_1 p2, p, last_len, tmp, tmp2 + movdqa xmm0, [rel x80] ; fill reg with padding + movdqu [lane_data + _xcbc_final_block + 16], xmm0 ; add padding + movdqu xmm0, [p2] ; load final block to process + mov tmp, [job + _k3] ; load K3 address + movdqu xmm1, [tmp] ; load K3 + pxor xmm0, xmm1 ; M[n] XOR K3 + movdqu [lane_data + _xcbc_final_block], xmm0 ; write final block + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm new file mode 100644 index 00000000..0aaad7e5 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_ni_sse.asm @@ -0,0 +1,260 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RAX RCX RDX R8 +;; Windows preserves: RBX RBP RSI RDI R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RAX RSI RDI R8 +;; Linux preserves: RBX RCX RDX RBP R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 - xmm15 +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +;%define DO_DBGPRINT +%include "dbgprint.asm" + +extern sha1_ni + +section .data +default rel + +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +one: + dq 1 + +section .text + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 +%define p2 r8 + +; This routine clobbers rbx, rbp +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* flush_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state) +; arg 1 : state +MKGLOBAL(flush_job_hmac_ni_sse,function,internal) +flush_job_hmac_ni_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha1-ni-sse flush" + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job, assume it is 0 then check 1 + xor idx, idx + cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel one] + DBGPRINTL64 "idx:", idx + +copy_lane_data: + ; copy valid lane (idx) to empty lanes + mov tmp, [state + _args_data_ptr + PTR_SZ*idx] + movzx len2, word [state + _lens + idx*2] + + DBGPRINTL64 "ptr", tmp + + ; there are only two lanes so if one is empty it is easy to determine which one + xor idx, 1 + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + xor idx, 1 + + ; No need to find min length - only two lanes available + cmp len2, 0 + je len_is_0 + + ; Set length on both lanes to 0 + mov dword [state + _lens], 0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_ni + ; state is intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens + 2*idx], 1 + DBGPRINTL64 "outer-block-index", idx + lea tmp, [lane_data + _outer_block] + DBGPRINTL64 "outer block ptr:", tmp + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + + ;; idx determines which column + ;; read off from consecutive rows +%if SHA1NI_DIGEST_ROW_SIZE != 20 +%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" +%endif + lea p2, [idx + idx*4] + movdqu xmm0, [state + _args_digest + p2*4] + pshufb xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + p2*4 + 4*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp) + movdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + DBGPRINTL_XMM "sha1 outer hash input words[0-3]", xmm0 + DBGPRINTL64 "sha1 outer hash input word 4", tmp + mov job, [lane_data + _job_in_lane] + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] + movdqu [state + _args_digest + p2*4], xmm0 + mov [state + _args_digest + p2*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + DBGPRINTL64 "extra blocks-start offset", start_offset + mov [state + _lens + 2*idx], WORD(extra_blocks) + DBGPRINTL64 "extra blocks-len", extra_blocks + lea tmp, [lane_data + _extra_block + start_offset] + DBGPRINTL64 "extra block ptr", tmp + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes +%if SHA1NI_DIGEST_ROW_SIZE != 20 +%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" +%endif + lea idx, [idx + idx*4] + mov DWORD(tmp2), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE] + mov DWORD(tmp4), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE] + mov DWORD(tmp3), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp2) + bswap DWORD(tmp4) + bswap DWORD(tmp3) + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov [p + 2*4], DWORD(tmp3) + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm new file mode 100644 index 00000000..66b894d8 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_flush_sse.asm @@ -0,0 +1,254 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +;%define DO_DBGPRINT +%include "dbgprint.asm" + +extern sha1_mult_sse + +section .data +default rel + +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +x80: ;ddq 0x00000000000000000000000000000080 + dq 0x0000000000000080, 0x0000000000000000 +x00: ;ddq 0x00000000000000000000000000000000 + dq 0x0000000000000000, 0x0000000000000000 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%endif + +; This routine clobbers rbx, rbp +struc STACK +_gpr_save: resq 2 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* flush_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state) +; arg 1 : state +MKGLOBAL(flush_job_hmac_sse,function,internal) +flush_job_hmac_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha1-sse flush" + mov unused_lanes, [state + _unused_lanes] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel one] + cmp qword [state + _ldata + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel two] + cmp qword [state + _ldata + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel three] +copy_lane_data: + ; copy valid lane (idx) to empty lanes + movdqa xmm0, [state + _lens] + mov tmp, [state + _args_data_ptr + PTR_SZ*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args_data_ptr + PTR_SZ*I], tmp + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + movdqa [state + _lens], xmm0 + + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mult_sse + ; state is intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + + ;; idx determines which column + ;; read off from consecutive rows + movd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 + pshufb xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + movdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*4], DWORD(tmp) + DBGPRINTL_XMM "sha1 outer hash input words[0-3]", xmm0 + DBGPRINTL64 "sha1 outer hash input word 4", tmp + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp2) + bswap DWORD(tmp4) + bswap DWORD(tmp3) + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov [p + 2*4], DWORD(tmp3) + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm new file mode 100644 index 00000000..73878e46 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_flush_sse.asm @@ -0,0 +1,274 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +extern md5_x4x2_sse + +section .data +default rel +align 16 +dupw: ;ddq 0x01000100010001000100010001000100 + dq 0x0100010001000100, 0x0100010001000100 +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 + ;ddq 0x000000000000FFFF0000000000000000 + dq 0x0000000000000000, 0x000000000000FFFF + ;ddq 0x00000000FFFF00000000000000000000 + dq 0x0000000000000000, 0x00000000FFFF0000 + ;ddq 0x0000FFFF000000000000000000000000 + dq 0x0000000000000000, 0x0000FFFF00000000 + ;ddq 0xFFFF0000000000000000000000000000 + dq 0x0000000000000000, 0xFFFF000000000000 +one: dq 1 +two: dq 2 +three: dq 3 +four: dq 4 +five: dq 5 +six: dq 6 +seven: dq 7 + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbp +%define idx rbp + +; unused_lanes must be in rax-rdx +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%endif + +; This routine and/or the called routine clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* flush_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(flush_job_hmac_md5_sse,function,internal) +flush_job_hmac_md5_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_md5] + bt unused_lanes, 32+3 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata_md5 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel one] + cmp qword [state + _ldata_md5 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel two] + cmp qword [state + _ldata_md5 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel three] + cmp qword [state + _ldata_md5 + 4 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel four] + cmp qword [state + _ldata_md5 + 5 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel five] + cmp qword [state + _ldata_md5 + 6 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel six] + cmp qword [state + _ldata_md5 + 7 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane],0 + cmovne idx, [rel seven] + +copy_lane_data: + ; copy good lane (idx) to empty lanes + movdqa xmm0, [state + _lens_md5] + mov tmp, [state + _args_data_ptr_md5 + PTR_SZ*idx] + +%assign I 0 +%rep 8 + cmp qword [state + _ldata_md5 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args_data_ptr_md5 + PTR_SZ*I], tmp + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + movdqa [state + _lens_md5], xmm0 + + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshufb xmm1, [rel dupw] ; duplicate words across all lanes + psubw xmm0, xmm1 + movdqa [state + _lens_md5], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_x4x2_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_md5 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_md5 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + + movd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 +; pshufb xmm0, [byteswap wrt rip] + movdqa [lane_data + _outer_block], xmm0 + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_md5] + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes_md5], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] +; bswap DWORD(tmp2) +; bswap DWORD(tmp4) +; bswap DWORD(tmp3) + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov [p + 2*4], DWORD(tmp3) + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm new file mode 100644 index 00000000..cfe60e78 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_md5_submit_sse.asm @@ -0,0 +1,310 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "memcpy.asm" +%include "reg_sizes.asm" + +extern md5_x4x2_sse + +section .data +default rel +align 16 +;byteswap: ddq 0x0c0d0e0f08090a0b0405060700010203 +dupw: ;ddq 0x01000100010001000100010001000100 + dq 0x0100010001000100, 0x0100010001000100 + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbp +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine and/or the called routine clobbers all GPRs +struc STACK +_gpr_save: resq 8 +_rsp_save: resq 1 +endstruc + +; JOB* submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(submit_job_hmac_md5_sse,function,internal) +submit_job_hmac_md5_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _gpr_save + 8*3], r13 + mov [rsp + _gpr_save + 8*4], r14 + mov [rsp + _gpr_save + 8*5], r15 +%ifndef LINUX + mov [rsp + _gpr_save + 8*6], rsi + mov [rsp + _gpr_save + 8*7], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_md5] + mov lane, unused_lanes + and lane, 0xF + shr unused_lanes, 4 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_md5 + lane_data] + mov [state + _unused_lanes_md5], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + mov [state + _lens_md5 + 2*lane], WORD(tmp) + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_md5 + PTR_SZ*lane], p + + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + movdqu xmm0, [p - 64 + 0*16] + movdqu xmm1, [p - 64 + 1*16] + movdqu xmm2, [p - 64 + 2*16] + movdqu xmm3, [p - 64 + 3*16] + movdqa [lane_data + _extra_block + 0*16], xmm0 + movdqa [lane_data + _extra_block + 1*16], xmm1 + movdqa [lane_data + _extra_block + 2*16], xmm2 + movdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] +; bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + movdqu xmm0, [tmp] + movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 0*MD5_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*lane + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 + + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + mov [state + _lens_md5 + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_md5 + PTR_SZ*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xf + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + movdqa xmm0, [state + _lens_md5] + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshufb xmm1, [rel dupw] ; duplicate words across all lanes + psubw xmm0, xmm1 + movdqa [state + _lens_md5], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call md5_x4x2_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_md5 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_md5 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + + movd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], 3 +; pshufb xmm0, [rel byteswap] + movdqa [lane_data + _outer_block], xmm0 + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 3*MD5_DIGEST_ROW_SIZE], xmm0, 3 + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_md5 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_md5 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 + +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + ;; p2 clobbers unused_lanes, undo before exiting + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_md5] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes_md5] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 4 + or unused_lanes, idx + mov [state + _unused_lanes_md5], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 0*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 1*MD5_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest_md5 + MD5_DIGEST_WORD_SIZE*idx + 2*MD5_DIGEST_ROW_SIZE] + mov [p + 0*4], DWORD(tmp) + mov [p + 1*4], DWORD(tmp2) + mov [p + 2*4], DWORD(tmp3) + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov r13, [rsp + _gpr_save + 8*3] + mov r14, [rsp + _gpr_save + 8*4] + mov r15, [rsp + _gpr_save + 8*5] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*6] + mov rdi, [rsp + _gpr_save + 8*7] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm new file mode 100644 index 00000000..a31a009c --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_ni_sse.asm @@ -0,0 +1,28 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; +%define SHA224 +%include "mb_mgr_hmac_sha_256_flush_ni_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm new file mode 100644 index 00000000..ae945944 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_flush_sse.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define FUNC flush_job_hmac_sha_224_sse +%define SHA224 + +%include "mb_mgr_hmac_sha_256_flush_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm new file mode 100644 index 00000000..e98ac895 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_ni_sse.asm @@ -0,0 +1,28 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; +%define SHA224 +%include "mb_mgr_hmac_sha_256_submit_ni_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm new file mode 100644 index 00000000..ef1a6780 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_224_submit_sse.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define FUNC submit_job_hmac_sha_224_sse +%define SHA224 + +%include "mb_mgr_hmac_sha_256_submit_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm new file mode 100644 index 00000000..413a5944 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_ni_sse.asm @@ -0,0 +1,268 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Linux/Windows clobbers: xmm0 - xmm15 +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +;%define DO_DBGPRINT +%include "dbgprint.asm" + +extern sha256_ni + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r13-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%define tmp5 r9 + +%define tmp6 r10 + +%define bswap_xmm4 xmm4 + +struc STACK +_gpr_save: resq 4 ;rbx, rbp, rsi (win), rdi (win) +_rsp_save: resq 1 +endstruc + +section .data +default rel + +align 16 +byteswap: + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + +one: dq 1 + +section .text + +%ifdef SHA224 +;; JOB* flush_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state) +;; arg1 : state +MKGLOBAL(flush_job_hmac_sha_224_ni_sse,function,internal) +flush_job_hmac_sha_224_ni_sse: +%else +;; JOB* flush_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state) +;; arg1 : state +MKGLOBAL(flush_job_hmac_sha_256_ni_sse,function,internal) +flush_job_hmac_sha_256_ni_sse: +%endif + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha256-ni-sse flush" + + mov unused_lanes, [state + _unused_lanes_sha256] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job, assume it is 0 then check 1 + xor idx, idx + cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel one] + DBGPRINTL64 "idx:", idx + +copy_lane_data: + ; copy idx to empty lanes + mov tmp, [state + _args_data_ptr_sha256 + PTR_SZ*idx] + xor len2, len2 + mov WORD(len2), word [state + _lens_sha256 + idx*2] + + ; there are only two lanes so if one is empty it is easy to determine which one + xor idx, 1 + mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp + xor idx, 1 + + ; No need to find min length - only two lanes available + cmp len2, 0 + je len_is_0 + + ; set length on both lanes to 0 + mov dword [state + _lens_sha256], 0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_ni + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + movdqa bswap_xmm4, [rel byteswap] + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_sha256 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp + +%if SHA256NI_DIGEST_ROW_SIZE != 32 +%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" +%endif + lea tmp4, [idx*8] ; x8 here + scale factor x4 below give x32 + movdqu xmm0, [state + _args_digest_sha256 + tmp4*4] + movdqu xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4] + pshufb xmm0, bswap_xmm4 + pshufb xmm1, bswap_xmm4 + movdqa [lane_data + _outer_block], xmm0 + movdqa [lane_data + _outer_block + 4*4], xmm1 +%ifdef SHA224 + ;; overwrite top 4 bytes with 0x80 + mov dword [lane_data + _outer_block + 7*4], 0x80 +%endif + DBGPRINTL "sha256 outer hash input words:" + DBGPRINT_XMM xmm0 + DBGPRINT_XMM xmm1 + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] + DBGPRINTL64 "auth_key_xor_opad", tmp + movdqu [state + _args_digest_sha256 + tmp4*4], xmm0 + movdqu [state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1 + DBGPRINTL "new digest args" + DBGPRINT_XMM xmm0 + DBGPRINT_XMM xmm1 + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_sha256] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha256], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 16 bytes for SHA256, 14 bytes for SHA224 +%if SHA256NI_DIGEST_ROW_SIZE != 32 +%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" +%endif + shl idx, 5 + movdqu xmm0, [state + _args_digest_sha256 + idx] + pshufb xmm0, bswap_xmm4 +%ifdef SHA224 + ;; SHA224 + movq [p + 0*4], xmm0 + pextrd [p + 2*4], xmm0, 2 + pextrw [p + 3*4], xmm0, 6 +%else + ;; SHA256 + movdqu [p], xmm0 +%endif + + DBGPRINTL "auth_tag_output:" + DBGPRINT_XMM xmm0 + +return: + DBGPRINTL "exit sha256-ni-sse flush" + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm new file mode 100644 index 00000000..bcaa7f89 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_flush_sse.asm @@ -0,0 +1,274 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +extern sha_256_mult_sse + +section .data +default rel + +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 + ;ddq 0x00000000000000000000FFFF00000000 + dq 0x0000FFFF00000000, 0x0000000000000000 + ;ddq 0x0000000000000000FFFF000000000000 + dq 0xFFFF000000000000, 0x0000000000000000 +one: dq 1 +two: dq 2 +three: dq 3 + +section .text + +%ifndef FUNC +%define FUNC flush_job_hmac_sha_256_sse +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r13-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%define tmp5 r9 + +%define tmp6 r10 + +%endif + +; This routine clobbers rbx, rbp; called routine also clobbers r12 +struc STACK +_gpr_save: resq 3 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha256] + bt unused_lanes, 32+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata_sha256 + 1 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel one] + cmp qword [state + _ldata_sha256 + 2 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel two] + cmp qword [state + _ldata_sha256 + 3 * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + cmovne idx, [rel three] + +copy_lane_data: + ; copy idx to empty lanes + movdqa xmm0, [state + _lens_sha256] + mov tmp, [state + _args_data_ptr_sha256 + 8*idx] + +%assign I 0 +%rep 4 + cmp qword [state + _ldata_sha256 + I * _HMAC_SHA1_LANE_DATA_size + _job_in_lane], 0 + jne APPEND(skip_,I) + mov [state + _args_data_ptr_sha256 + 8*I], tmp + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + movdqa [state + _lens_sha256], xmm0 + + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _lens_sha256], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha_256_mult_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_sha256 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + + movd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 + pshufb xmm0, [rel byteswap] + movd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 +%ifndef SHA224 + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 +%endif + pshufb xmm1, [rel byteswap] + movdqa [lane_data + _outer_block], xmm0 + movdqa [lane_data + _outer_block + 4*4], xmm1 +%ifdef SHA224 + mov dword [lane_data + _outer_block + 7*4], 0x80 +%endif + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] + movd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 + movd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 + pextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 + pextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 + pextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_sha256] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha256], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 14 bytes for SHA224 and 16 bytes for SHA256 + mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp6), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp5), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] + + bswap DWORD(tmp2) + bswap DWORD(tmp4) + bswap DWORD(tmp6) + bswap DWORD(tmp5) + + mov [p + 0*4], DWORD(tmp2) + mov [p + 1*4], DWORD(tmp4) + mov [p + 2*4], DWORD(tmp6) + +%ifdef SHA224 + mov [p + 3*4], WORD(tmp5) +%else + mov [p + 3*4], DWORD(tmp5) +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm new file mode 100644 index 00000000..60dc04b0 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm @@ -0,0 +1,347 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Linux/Windows clobbers: xmm0 - xmm15 +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +%include "memcpy.asm" + +;%define DO_DBGPRINT +%include "dbgprint.asm" + +extern sha256_ni + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r13-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%define bswap_xmm4 xmm4 + +struc STACK +_gpr_save: resq 4 ; rbx, rbp, rsi (win), rdi (win) +_rsp_save: resq 1 +endstruc + +section .data +default rel + +align 16 +byteswap: + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + +section .text + +%ifdef SHA224 +; JOB* submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(submit_job_hmac_sha_224_ni_sse,function,internal) +submit_job_hmac_sha_224_ni_sse: + +%else + +; JOB* submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) +; arg 1 : state +; arg 2 : job +MKGLOBAL(submit_job_hmac_sha_256_ni_sse,function,internal) +submit_job_hmac_sha_256_ni_sse: +%endif + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha256-ni-sse submit" + + mov unused_lanes, [state + _unused_lanes_sha256] + movzx lane, BYTE(unused_lanes) + DBGPRINTL64 "lane: ", lane + shr unused_lanes, 8 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size ; SHA1 & SHA256 lane data is the same + lea lane_data, [state + _ldata_sha256 + lane_data] + mov [state + _unused_lanes_sha256], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + DBGPRINTL64 "length: ", len + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + mov [state + _lens_sha256 + 2*lane], WORD(tmp) + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_sha256 + 8*lane], p + + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + movdqu xmm0, [p - 64 + 0*16] + movdqu xmm1, [p - 64 + 1*16] + movdqu xmm2, [p - 64 + 2*16] + movdqu xmm3, [p - 64 + 3*16] + movdqa [lane_data + _extra_block + 0*16], xmm0 + movdqa [lane_data + _extra_block + 1*16], xmm1 + movdqa [lane_data + _extra_block + 2*16], xmm2 + movdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] +%if SHA256NI_DIGEST_ROW_SIZE != 32 +%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" +%endif + lea tmp, [lane*8] ; x8 here plus x4 scale factor give x32 + movdqu [state + _args_digest_sha256 + tmp*4], xmm0 + movdqu [state + _args_digest_sha256 + tmp*4 + 4*4], xmm1 + DBGPRINTL "args digest:" + DBGPRINT_XMM xmm0 + DBGPRINT_XMM xmm1 + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length - only two lanes available + xor len2, len2 + mov tmp, 0x10000 + mov WORD(len2), word [state + _lens_sha256 + 0*2] ; [0:15] - lane 0 length, [16:31] - lane index (0) + mov WORD(tmp), word [state + _lens_sha256 + 1*2] ; [0:15] - lane 1 length, [16:31] - lane index (1) + cmp WORD(len2), WORD(tmp) + cmovg DWORD(len2), DWORD(tmp) ; move if lane 0 length is greater than lane 1 length + + mov idx, len2 ; retrieve index & length from [16:31] and [0:15] bit fields + shr DWORD(idx), 16 + and DWORD(len2), 0xffff + je len_is_0 + + sub word [state + _lens_sha256 + 0*2], WORD(len2) + sub word [state + _lens_sha256 + 1*2], WORD(len2) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha256_ni + ; state is intact +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + movdqa bswap_xmm4, [rel byteswap] + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_sha256 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp + +%if SHA256NI_DIGEST_ROW_SIZE != 32 +%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" +%endif + lea tmp4, [idx*8] ; x8 here + scale factor x4 below give x32 + movdqu xmm0, [state + _args_digest_sha256 + tmp4*4] + movdqu xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4] + pshufb xmm0, bswap_xmm4 + pshufb xmm1, bswap_xmm4 + movdqa [lane_data + _outer_block], xmm0 + movdqa [lane_data + _outer_block + 4*4], xmm1 +%ifdef SHA224 + ;; overwrite top 4 bytes with 0x80 + mov dword [lane_data + _outer_block + 7*4], 0x80 +%endif + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] + movdqu [state + _args_digest_sha256 + tmp4*4], xmm0 + movdqu [state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1 + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 + +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + ;; p2 clobbers unused_lanes, undo before exit + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_sha256] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes_sha256] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha256], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 16 bytes for SHA256, 14 for SHA224 +%if SHA256NI_DIGEST_ROW_SIZE != 32 +%error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!" +%endif + shl idx, 5 + movdqu xmm0, [state + _args_digest_sha256 + idx] + pshufb xmm0, bswap_xmm4 +%ifdef SHA224 + ;; SHA224 + movq [p + 0*4], xmm0 + pextrd [p + 2*4], xmm0, 2 + pextrw [p + 3*4], xmm0, 6 +%else + ;; SHA256 + movdqu [p], xmm0 +%endif + +return: + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm new file mode 100644 index 00000000..a63990c1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_sse.asm @@ -0,0 +1,341 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +%include "memcpy.asm" + +extern sha_256_mult_sse + +section .data +default rel +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +%ifndef FUNC +%define FUNC submit_job_hmac_sha_256_sse +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r13-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine clobbers rbx, rbp, rsi, rdi; called routine also clobbers r12 +struc STACK +_gpr_save: resq 5 +_rsp_save: resq 1 +endstruc + +; JOB* FUNC(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 +%ifndef LINUX + mov [rsp + _gpr_save + 8*3], rsi + mov [rsp + _gpr_save + 8*4], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha256] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov [state + _unused_lanes_sha256], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + mov [state + _lens_sha256 + 2*lane], WORD(tmp) + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_sha256 + 8*lane], p + + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + movdqu xmm0, [p - 64 + 0*16] + movdqu xmm1, [p - 64 + 1*16] + movdqu xmm2, [p - 64 + 2*16] + movdqu xmm3, [p - 64 + 3*16] + movdqa [lane_data + _extra_block + 0*16], xmm0 + movdqa [lane_data + _extra_block + 1*16], xmm1 + movdqa [lane_data + _extra_block + 2*16], xmm2 + movdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] + movd [state + _args_digest_sha256 + 4*lane + 0*SHA256_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_sha256 + 4*lane + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_sha256 + 4*lane + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_sha256 + 4*lane + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 + movd [state + _args_digest_sha256 + 4*lane + 4*SHA256_DIGEST_ROW_SIZE], xmm1 + pextrd [state + _args_digest_sha256 + 4*lane + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 + pextrd [state + _args_digest_sha256 + 4*lane + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 + pextrd [state + _args_digest_sha256 + 4*lane + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + mov [state + _lens_sha256 + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + movdqa xmm0, [state + _lens_sha256] + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _lens_sha256], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha_256_mult_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata_sha256 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens_sha256 + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + + movd xmm0, [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], 3 + pshufb xmm0, [rel byteswap] + movd xmm1, [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE] + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], 1 + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], 2 +%ifndef SHA224 + pinsrd xmm1, [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], 3 +%endif + pshufb xmm1, [rel byteswap] + movdqa [lane_data + _outer_block], xmm0 + movdqa [lane_data + _outer_block + 4*4], xmm1 +%ifdef SHA224 + mov dword [lane_data + _outer_block + 7*4], 0x80 +%endif + + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + movdqu xmm1, [tmp + 4*4] + movd [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE], xmm0, 3 + movd [state + _args_digest_sha256 + 4*idx + 4*SHA256_DIGEST_ROW_SIZE], xmm1 + pextrd [state + _args_digest_sha256 + 4*idx + 5*SHA256_DIGEST_ROW_SIZE], xmm1, 1 + pextrd [state + _args_digest_sha256 + 4*idx + 6*SHA256_DIGEST_ROW_SIZE], xmm1, 2 + pextrd [state + _args_digest_sha256 + 4*idx + 7*SHA256_DIGEST_ROW_SIZE], xmm1, 3 + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens_sha256 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr_sha256 + 8*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 + +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + ;; p2 clobbers unused_lanes, undo before exit + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_sha256] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes_sha256] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha256], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 14 bytes for SHA224 and 16 bytes for SHA256 + mov DWORD(tmp), [state + _args_digest_sha256 + 4*idx + 0*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest_sha256 + 4*idx + 1*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest_sha256 + 4*idx + 2*SHA256_DIGEST_ROW_SIZE] + mov DWORD(tmp4), [state + _args_digest_sha256 + 4*idx + 3*SHA256_DIGEST_ROW_SIZE] + + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) + bswap DWORD(tmp4) + + mov [p + 0*4], DWORD(tmp) + mov [p + 1*4], DWORD(tmp2) + mov [p + 2*4], DWORD(tmp3) + +%ifdef SHA224 + mov [p + 3*4], WORD(tmp4) +%else + mov [p + 3*4], DWORD(tmp4) +%endif + + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*3] + mov rdi, [rsp + _gpr_save + 8*4] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm new file mode 100644 index 00000000..846b09aa --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_flush_sse.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define FUNC flush_job_hmac_sha_384_sse +%define SHA_X_DIGEST_SIZE 384 + +%include "mb_mgr_hmac_sha_512_flush_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm new file mode 100644 index 00000000..2ae645ab --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_384_submit_sse.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%define FUNC submit_job_hmac_sha_384_sse +%define SHA_X_DIGEST_SIZE 384 + +%include "mb_mgr_hmac_sha_512_submit_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm new file mode 100644 index 00000000..b96e370b --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_flush_sse.asm @@ -0,0 +1,249 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" + +extern sha512_x2_sse + +section .data +default rel +align 16 +byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f +len_masks: + ;ddq 0x0000000000000000000000000000FFFF + dq 0x000000000000FFFF, 0x0000000000000000 + ;ddq 0x000000000000000000000000FFFF0000 + dq 0x00000000FFFF0000, 0x0000000000000000 +one: dq 1 + +section .text + +%ifndef FUNC +%define FUNC flush_job_hmac_sha_512_sse +%define SHA_X_DIGEST_SIZE 512 +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks arg2 +%define p arg2 + +%define tmp4 r8 + +%define tmp5 r9 + +%define tmp6 r10 + +%endif + +; This routine clobbers rbx, rbp +struc STACK +_gpr_save: resq 2 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha512] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 + cmovne idx, [rel one] +copy_lane_data: + ; copy good lane (idx) to empty lanes + movdqa xmm0, [state + _lens_sha512] + mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] + +%assign I 0 +%rep 2 + cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 + jne APPEND(skip_,I) + mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp + por xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + + movdqa [state + _lens_sha512], xmm0 + + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0xA0 + psubw xmm0, xmm1 + movdqa [state + _lens_sha512], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_x2_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done_sha512], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done_sha512], 1 + mov DWORD(size_offset), [lane_data + _size_offset_sha512] + mov qword [lane_data + _extra_block_sha512 + size_offset], 0 + mov word [state + _lens_sha512 + 2*idx], 1 + lea tmp, [lane_data + _outer_block_sha512] + mov job, [lane_data + _job_in_lane_sha512] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + +%assign I 0 +%rep (SHA_X_DIGEST_SIZE / (8*16)) + movq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I)*SHA512_DIGEST_ROW_SIZE] + pinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1) *SHA512_DIGEST_ROW_SIZE], 1 + pshufb xmm0, [rel byteswap] + movdqa [lane_data + _outer_block_sha512 + I*16], xmm0 +%assign I (I+1) +%endrep + + mov tmp, [job + _auth_key_xor_opad] +%assign I 0 +%rep 4 + movdqu xmm0, [tmp + I * 16] + movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0 + pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 +%assign I (I+1) +%endrep + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset_sha512] + mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks_sha512], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane_sha512] + mov qword [lane_data + _job_in_lane_sha512], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + mov unused_lanes, [state + _unused_lanes_sha512] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha512], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes + + mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp6), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] +%if (SHA_X_DIGEST_SIZE != 384) + mov QWORD(tmp5), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] +%endif + bswap QWORD(tmp2) + bswap QWORD(tmp4) + bswap QWORD(tmp6) +%if (SHA_X_DIGEST_SIZE != 384) + bswap QWORD(tmp5) +%endif + mov [p + 0*8], QWORD(tmp2) + mov [p + 1*8], QWORD(tmp4) + mov [p + 2*8], QWORD(tmp6) +%if (SHA_X_DIGEST_SIZE != 384) + mov [p + 3*8], QWORD(tmp5) +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm new file mode 100644 index 00000000..579121dd --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_512_submit_sse.asm @@ -0,0 +1,325 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +%include "memcpy.asm" + +extern sha512_x2_sse + +section .data +default rel +align 16 +byteswap: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +section .text + +%ifndef FUNC +%define FUNC submit_job_hmac_sha_512_sse +%define SHA_X_DIGEST_SIZE 512 +%endif + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine clobbers rbx, rbp, rsi, rdi +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(FUNC,function,internal) +FUNC: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha512] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512+ lane_data] + mov [state + _unused_lanes_sha512], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 7 ; divide by 128, len in terms of sha512 blocks + + mov [lane_data + _job_in_lane_sha512], job + mov dword [lane_data + _outer_done_sha512], 0 + mov [state + _lens_sha512 + 2*lane], WORD(tmp) ; 2 is word size in bytes + + mov last_len, len + and last_len, 127 + lea extra_blocks, [last_len + 17 + 127] + shr extra_blocks, 7 + mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p + + cmp len, 128 + jb copy_lt128 + +fast_copy: + add p, len +%assign I 0 +%rep 2 + movdqu xmm0, [p - 128 + I*4*16 + 0*16] + movdqu xmm1, [p - 128 + I*4*16 + 1*16] + movdqu xmm2, [p - 128 + I*4*16 + 2*16] + movdqu xmm3, [p - 128 + I*4*16 + 3*16] + movdqa [lane_data + _extra_block_sha512 + I*4*16 + 0*16], xmm0 + movdqa [lane_data + _extra_block_sha512 + I*4*16 + 1*16], xmm1 + movdqa [lane_data + _extra_block_sha512 + I*4*16 + 2*16], xmm2 + movdqa [lane_data + _extra_block_sha512 + I*4*16 + 3*16], xmm3 +%assign I (I+1) +%endrep +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 7 + sub size_offset, last_len + add size_offset, 128-8 + mov [lane_data + _size_offset_sha512], DWORD(size_offset) + mov start_offset, 128 + sub start_offset, last_len + mov [lane_data + _start_offset_sha512], DWORD(start_offset) + + lea tmp, [8*128 + 8*len] + bswap tmp + mov [lane_data + _extra_block_sha512 + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + %assign I 0 + %rep 4 + movdqu xmm0, [tmp + I * 2 * SHA512_DIGEST_WORD_SIZE] + movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I)*SHA512_DIGEST_ROW_SIZE], xmm0 + pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*lane + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 + %assign I (I+1) + %endrep + test len, ~127 + jnz ge128_bytes + +lt128_bytes: + mov [state + _lens_sha512 + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8 + mov dword [lane_data + _extra_blocks_sha512], 0 + +ge128_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + movdqa xmm0, [state + _lens_sha512] + phminposuw xmm1, xmm0 + pextrw DWORD(len2), xmm1, 0 ; min value + pextrw DWORD(idx), xmm1, 1 ; min index (0...1) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0XA0 + psubw xmm0, xmm1 + movdqa [state + _lens_sha512], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha512_x2_sse + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done_sha512], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done_sha512], 1 + mov DWORD(size_offset), [lane_data + _size_offset_sha512] + mov qword [lane_data + _extra_block_sha512 + size_offset], 0 + mov word [state + _lens_sha512 + 2*idx], 1 + lea tmp, [lane_data + _outer_block_sha512] + mov job, [lane_data + _job_in_lane_sha512] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + +%assign I 0 +%rep (SHA_X_DIGEST_SIZE / (8 * 16)) + movq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I)*SHA512_DIGEST_ROW_SIZE] + pinsrq xmm0, [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], 1 + pshufb xmm0, [rel byteswap] + movdqa [lane_data + _outer_block_sha512 + I*16], xmm0 +%assign I (I+1) +%endrep + + mov tmp, [job + _auth_key_xor_opad] +%assign I 0 +%rep 4 + movdqu xmm0, [tmp + I*16] + movq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*I*SHA512_DIGEST_ROW_SIZE], xmm0 + pextrq [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + (2*I + 1)*SHA512_DIGEST_ROW_SIZE], xmm0, 1 +%assign I (I+1) +%endrep + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset_sha512] + mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks_sha512], 0 + jmp start_loop + + align 16 +copy_lt128: + ;; less than one message block of data + ;; beginning of source block + ;; destination extra block but backwards by len from where 0x80 pre-populated + lea p2, [lane_data + _extra_block + 128] + sub p2, len + memcpy_sse_128_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes_sha512] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane_sha512] + mov unused_lanes, [state + _unused_lanes_sha512] + mov qword [lane_data + _job_in_lane_sha512], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha512], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; below is the code for both SHA512 & SHA384. SHA512=32 bytes and SHA384=24 bytes + + mov QWORD(tmp), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 0*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp2), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 1*SHA512_DIGEST_ROW_SIZE] + mov QWORD(tmp3), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 2*SHA512_DIGEST_ROW_SIZE] +%if (SHA_X_DIGEST_SIZE != 384) + mov QWORD(tmp4), [state + _args_digest_sha512 + SHA512_DIGEST_WORD_SIZE*idx + 3*SHA512_DIGEST_ROW_SIZE] ; this line of code will run only for SHA512 +%endif + bswap QWORD(tmp) + bswap QWORD(tmp2) + bswap QWORD(tmp3) +%if (SHA_X_DIGEST_SIZE != 384) + bswap QWORD(tmp4) +%endif + mov [p + 0*8], QWORD(tmp) + mov [p + 1*8], QWORD(tmp2) + mov [p + 2*8], QWORD(tmp3) +%if (SHA_X_DIGEST_SIZE != 384) + mov [p + 3*8], QWORD(tmp4) +%endif + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm new file mode 100644 index 00000000..a066c00a --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_ni_sse.asm @@ -0,0 +1,335 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; In System V AMD64 ABI +;; calle saves: RBX, RBP, R12-R15 +;; Windows x64 ABI +;; calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RAX RCX RDX R8 R9 R10 R11 +;; Windows preserves: RBX RBP RSI RDI R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RAX RCX RDX RSI RDI R8 R9 R10 R11 +;; Linux preserves: RBX RBP R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 - xmm15 +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +%include "memcpy.asm" + +;%define DO_DBGPRINT +%include "dbgprint.asm" + +extern sha1_ni + +section .data +default rel + +align 16 +byteswap: + dq 0x0405060700010203 + dq 0x0c0d0e0f08090a0b + +section .text + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx needs to be in rbx, rbp, r12-r15 +%define last_len rbp +%define idx rbp +%define p4 rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx +%define p3 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +; JOB* submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(submit_job_hmac_ni_sse,function,internal) +submit_job_hmac_ni_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha1-ni-sse submit" + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + DBGPRINTL64 "lane: ", lane + shr unused_lanes, 8 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + DBGPRINTL64 "length: ", len + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + mov [state + _lens + 2*lane], WORD(tmp) + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + DBGPRINTL64 "src pointer + offset:", p + mov [state + _args_data_ptr + PTR_SZ*lane], p + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + movdqu xmm0, [p - 64 + 0*16] + movdqu xmm1, [p - 64 + 1*16] + movdqu xmm2, [p - 64 + 2*16] + movdqu xmm3, [p - 64 + 3*16] + movdqa [lane_data + _extra_block + 0*16], xmm0 + movdqa [lane_data + _extra_block + 1*16], xmm1 + movdqa [lane_data + _extra_block + 2*16], xmm2 + movdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] +%if SHA1NI_DIGEST_ROW_SIZE != 20 +%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" +%endif + lea p4, [lane + lane*4] + movdqu [state + _args_digest + p4*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0 + mov [state + _args_digest + p4*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + mov [state + _lens + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length - only two lanes available + xor len2, len2 + mov p3, 0x10000 + mov WORD(len2), word [state + _lens + 0*2] ; [0:15] - lane 0 length, [16:31] - lane index (0) + mov WORD(p3), word [state + _lens + 1*2] ; [0:15] - lane 1 length, [16:31] - lane index (1) + cmp WORD(len2), WORD(p3) + cmovg DWORD(len2), DWORD(p3) ; move if lane 0 length is greater than lane 1 length + + mov idx, len2 ; retrieve index & length from [16:31] and [0:15] bit fields + shr DWORD(idx), 16 + and DWORD(len2), 0xffff + je len_is_0 + + sub word [state + _lens + 0*2], WORD(len2) + sub word [state + _lens + 1*2], WORD(len2) + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_ni + ; state is intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + +%if SHA1NI_DIGEST_ROW_SIZE != 20 +%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" +%endif + lea p3, [idx + idx*4] + movdqu xmm0, [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE] + pshufb xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp) + movdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*SHA1_DIGEST_WORD_SIZE] + movdqu [state + _args_digest + p3*4 + 0*SHA1_DIGEST_WORD_SIZE], xmm0 + mov [state + _args_digest + p3*4 + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes +%if SHA1NI_DIGEST_ROW_SIZE != 20 +%error "Below code has been optimized for SHA1NI_DIGEST_ROW_SIZE = 20!" +%endif + lea idx, [idx + 4*idx] + mov DWORD(tmp), [state + _args_digest + idx*4 + 0*SHA1_DIGEST_WORD_SIZE] + mov DWORD(tmp2), [state + _args_digest + idx*4 + 1*SHA1_DIGEST_WORD_SIZE] + mov DWORD(tmp3), [state + _args_digest + idx*4 + 2*SHA1_DIGEST_WORD_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) + mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) + mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) + +return: + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm new file mode 100644 index 00000000..deab9085 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_submit_sse.asm @@ -0,0 +1,312 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" +%include "job_aes_hmac.asm" +%include "mb_mgr_datastruct.asm" +%include "reg_sizes.asm" +%include "memcpy.asm" + +;%define DO_DBGPRINT +%include "dbgprint.asm" + +extern sha1_mult_sse + +section .data +default rel + +align 16 +byteswap: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + + +; idx needs to be in rbx, rbp, r12-r15 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; This routine clobbers rdi, rsi, rbx, rbp +struc STACK +_gpr_save: resq 4 +_rsp_save: resq 1 +endstruc + +; JOB* submit_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state, JOB_AES_HMAC *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(submit_job_hmac_sse,function, internal) +submit_job_hmac_sse: + + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp +%ifndef LINUX + mov [rsp + _gpr_save + 8*2], rsi + mov [rsp + _gpr_save + 8*3], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + DBGPRINTL "enter sha1-sse submit" + mov unused_lanes, [state + _unused_lanes] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov [state + _unused_lanes], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 6 ; divide by 64, len in terms of blocks + + mov [lane_data + _job_in_lane], job + mov dword [lane_data + _outer_done], 0 + mov [state + _lens + 2*lane], WORD(tmp) + + mov last_len, len + and last_len, 63 + lea extra_blocks, [last_len + 9 + 63] + shr extra_blocks, 6 + mov [lane_data + _extra_blocks], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr + PTR_SZ*lane], p + cmp len, 64 + jb copy_lt64 + +fast_copy: + add p, len + movdqu xmm0, [p - 64 + 0*16] + movdqu xmm1, [p - 64 + 1*16] + movdqu xmm2, [p - 64 + 2*16] + movdqu xmm3, [p - 64 + 3*16] + movdqa [lane_data + _extra_block + 0*16], xmm0 + movdqa [lane_data + _extra_block + 1*16], xmm1 + movdqa [lane_data + _extra_block + 2*16], xmm2 + movdqa [lane_data + _extra_block + 3*16], xmm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 6 + sub size_offset, last_len + add size_offset, 64-8 + mov [lane_data + _size_offset], DWORD(size_offset) + mov start_offset, 64 + sub start_offset, last_len + mov [lane_data + _start_offset], DWORD(start_offset) + + lea tmp, [8*64 + 8*len] + bswap tmp + mov [lane_data + _extra_block + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*lane + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + + test len, ~63 + jnz ge64_bytes + +lt64_bytes: + mov [state + _lens + 2*lane], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*lane], tmp + mov dword [lane_data + _extra_blocks], 0 + +ge64_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + movdqa xmm0, [state + _lens] + phminposuw xmm1, xmm0 + pextrw len2, xmm1, 0 ; min value + pextrw idx, xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + pshuflw xmm1, xmm1, 0 + psubw xmm0, xmm1 + movdqa [state + _lens], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call sha1_mult_sse + ; state is intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _HMAC_SHA1_LANE_DATA_size + lea lane_data, [state + _ldata + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done], 1 + mov DWORD(size_offset), [lane_data + _size_offset] + mov qword [lane_data + _extra_block + size_offset], 0 + mov word [state + _lens + 2*idx], 1 + lea tmp, [lane_data + _outer_block] + mov job, [lane_data + _job_in_lane] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + + movd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], 1 + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], 2 + pinsrd xmm0, [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], 3 + pshufb xmm0, [rel byteswap] + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + movdqa [lane_data + _outer_block], xmm0 + mov [lane_data + _outer_block + 4*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + + mov tmp, [job + _auth_key_xor_opad] + movdqu xmm0, [tmp] + mov DWORD(tmp), [tmp + 4*4] + movd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE], xmm0 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE], xmm0, 1 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE], xmm0, 2 + pextrd [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 3*SHA1_DIGEST_ROW_SIZE], xmm0, 3 + mov [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 4*SHA1_DIGEST_ROW_SIZE], DWORD(tmp) + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset] + mov [state + _lens + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block + start_offset] + mov [state + _args_data_ptr + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks], 0 + jmp start_loop + + align 16 +copy_lt64: + ;; less than one message block of data + ;; beginning of source block + ;; destination extrablock but backwards by len from where 0x80 pre-populated + lea p2, [lane_data + _extra_block + 64] + sub p2, len + memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3 + mov unused_lanes, [state + _unused_lanes] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane] + mov unused_lanes, [state + _unused_lanes] + mov qword [lane_data + _job_in_lane], 0 + or dword [job_rax + _status], STS_COMPLETED_HMAC + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ; copy 12 bytes + mov DWORD(tmp), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 0*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp2), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 1*SHA1_DIGEST_ROW_SIZE] + mov DWORD(tmp3), [state + _args_digest + SHA1_DIGEST_WORD_SIZE*idx + 2*SHA1_DIGEST_ROW_SIZE] + bswap DWORD(tmp) + bswap DWORD(tmp2) + bswap DWORD(tmp3) + mov [p + 0*SHA1_DIGEST_WORD_SIZE], DWORD(tmp) + mov [p + 1*SHA1_DIGEST_WORD_SIZE], DWORD(tmp2) + mov [p + 2*SHA1_DIGEST_WORD_SIZE], DWORD(tmp3) + +return: + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*2] + mov rdi, [rsp + _gpr_save + 8*3] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c b/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c new file mode 100644 index 00000000..e78c8c6e --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/mb_mgr_sse.c @@ -0,0 +1,628 @@ +/******************************************************************************* + Copyright (c) 2012-2018, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#ifdef __WIN32 +#include <intrin.h> +#endif + +#include "intel-ipsec-mb.h" +#include "save_xmms.h" +#include "asm.h" +#include "des.h" + +JOB_AES_HMAC *submit_job_aes128_enc_sse(MB_MGR_AES_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes128_enc_sse(MB_MGR_AES_OOO *state); + +JOB_AES_HMAC *submit_job_aes192_enc_sse(MB_MGR_AES_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes192_enc_sse(MB_MGR_AES_OOO *state); + +JOB_AES_HMAC *submit_job_aes256_enc_sse(MB_MGR_AES_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes256_enc_sse(MB_MGR_AES_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sse(MB_MGR_HMAC_SHA_1_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_ni_sse(MB_MGR_HMAC_SHA_1_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_224_sse(MB_MGR_HMAC_SHA_256_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_224_sse(MB_MGR_HMAC_SHA_256_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_256_sse(MB_MGR_HMAC_SHA_256_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_256_sse(MB_MGR_HMAC_SHA_256_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_384_sse(MB_MGR_HMAC_SHA_512_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_384_sse(MB_MGR_HMAC_SHA_512_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_sha_512_sse(MB_MGR_HMAC_SHA_512_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_sha_512_sse(MB_MGR_HMAC_SHA_512_OOO *state); + +JOB_AES_HMAC *submit_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_hmac_md5_sse(MB_MGR_HMAC_MD5_OOO *state); + + +JOB_AES_HMAC *submit_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state, + JOB_AES_HMAC *job); +JOB_AES_HMAC *flush_job_aes_xcbc_sse(MB_MGR_AES_XCBC_OOO *state); + +#define SAVE_XMMS save_xmms +#define RESTORE_XMMS restore_xmms +#define SUBMIT_JOB_AES128_ENC submit_job_aes128_enc_sse +#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse +#define FLUSH_JOB_AES128_ENC flush_job_aes128_enc_sse +#define SUBMIT_JOB_AES192_ENC submit_job_aes192_enc_sse +#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_sse +#define FLUSH_JOB_AES192_ENC flush_job_aes192_enc_sse +#define SUBMIT_JOB_AES256_ENC submit_job_aes256_enc_sse +#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_sse +#define FLUSH_JOB_AES256_ENC flush_job_aes256_enc_sse +#define SUBMIT_JOB_HMAC submit_job_hmac_sse +#define FLUSH_JOB_HMAC flush_job_hmac_sse +#define SUBMIT_JOB_HMAC_NI submit_job_hmac_ni_sse +#define FLUSH_JOB_HMAC_NI flush_job_hmac_ni_sse +#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_sse +#define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_sse +#define SUBMIT_JOB_HMAC_SHA_224_NI submit_job_hmac_sha_224_ni_sse +#define FLUSH_JOB_HMAC_SHA_224_NI flush_job_hmac_sha_224_ni_sse +#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_sse +#define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_sse +#define SUBMIT_JOB_HMAC_SHA_256_NI submit_job_hmac_sha_256_ni_sse +#define FLUSH_JOB_HMAC_SHA_256_NI flush_job_hmac_sha_256_ni_sse +#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_sse +#define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_sse +#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_sse +#define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_sse +#define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_sse +#define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_sse +#define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_sse +#define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_sse + +#define SUBMIT_JOB_AES128_CNTR submit_job_aes128_cntr_sse +#define SUBMIT_JOB_AES192_CNTR submit_job_aes192_cntr_sse +#define SUBMIT_JOB_AES256_CNTR submit_job_aes256_cntr_sse + +#define AES_CBC_DEC_128 aes_cbc_dec_128_sse +#define AES_CBC_DEC_192 aes_cbc_dec_192_sse +#define AES_CBC_DEC_256 aes_cbc_dec_256_sse + +#define AES_CNTR_128 aes_cntr_128_sse +#define AES_CNTR_192 aes_cntr_192_sse +#define AES_CNTR_256 aes_cntr_256_sse + +#ifndef NO_GCM +#define AES_GCM_DEC_128 aes_gcm_dec_128_sse +#define AES_GCM_ENC_128 aes_gcm_enc_128_sse +#define AES_GCM_DEC_192 aes_gcm_dec_192_sse +#define AES_GCM_ENC_192 aes_gcm_enc_192_sse +#define AES_GCM_DEC_256 aes_gcm_dec_256_sse +#define AES_GCM_ENC_256 aes_gcm_enc_256_sse +#endif /* NO_GCM */ + +/* ====================================================================== */ + +#define SUBMIT_JOB submit_job_sse +#define FLUSH_JOB flush_job_sse +#define SUBMIT_JOB_NOCHECK submit_job_nocheck_sse + +#define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse +#define SUBMIT_JOB_AES192_DEC submit_job_aes192_dec_sse +#define SUBMIT_JOB_AES256_DEC submit_job_aes256_dec_sse +#define QUEUE_SIZE queue_size_sse + +/* ====================================================================== */ + +#define SUBMIT_JOB_AES_ENC SUBMIT_JOB_AES_ENC_SSE +#define FLUSH_JOB_AES_ENC FLUSH_JOB_AES_ENC_SSE +#define SUBMIT_JOB_AES_DEC SUBMIT_JOB_AES_DEC_SSE +#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_SSE +#define FLUSH_JOB_HASH FLUSH_JOB_HASH_SSE + +/* ====================================================================== */ + +#define AES_CFB_128_ONE aes_cfb_128_one_sse + +void aes128_cbc_mac_x4(AES_ARGS_x8 *args, uint64_t len); + +#define AES128_CBC_MAC aes128_cbc_mac_x4 + +#define FLUSH_JOB_AES_CCM_AUTH flush_job_aes_ccm_auth_arch +#define SUBMIT_JOB_AES_CCM_AUTH submit_job_aes_ccm_auth_arch +#define AES_CCM_MAX_JOBS 4 + +#define FLUSH_JOB_AES_CMAC_AUTH flush_job_aes_cmac_auth_arch +#define SUBMIT_JOB_AES_CMAC_AUTH submit_job_aes_cmac_auth_arch +#define AES_CMAC_MAX_JOBS 4 + +/* ====================================================================== */ + +/* + * Used to decide if SHA1/SHA256 SIMD or SHA1NI OOO scheduler should be + * called. + */ +#define HASH_USE_SHAEXT 1 + +/* ====================================================================== */ + +struct cpuid_regs { + uint32_t eax; + uint32_t ebx; + uint32_t ecx; + uint32_t edx; +}; + +/* + * A C wrapper for CPUID opcode + * + * Parameters: + * [in] leaf - CPUID leaf number (EAX) + * [in] subleaf - CPUID sub-leaf number (ECX) + * [out] out - registers structure to store results of CPUID into + */ +static void +__mbcpuid(const unsigned leaf, const unsigned subleaf, + struct cpuid_regs *out) +{ +#ifdef _WIN32 + /* Windows */ + int regs[4]; + + __cpuidex(regs, leaf, subleaf); + out->eax = regs[0]; + out->ebx = regs[1]; + out->ecx = regs[2]; + out->edx = regs[3]; +#else + /* Linux */ +#ifdef __x86_64__ + asm volatile("mov %4, %%eax\n\t" + "mov %5, %%ecx\n\t" + "cpuid\n\t" + "mov %%eax, %0\n\t" + "mov %%ebx, %1\n\t" + "mov %%ecx, %2\n\t" + "mov %%edx, %3\n\t" + : "=g" (out->eax), "=g" (out->ebx), "=g" (out->ecx), + "=g" (out->edx) + : "g" (leaf), "g" (subleaf) + : "%eax", "%ebx", "%ecx", "%edx"); +#else + asm volatile("push %%ebx\n\t" + "mov %4, %%eax\n\t" + "mov %5, %%ecx\n\t" + "cpuid\n\t" + "mov %%eax, %0\n\t" + "mov %%ebx, %1\n\t" + "mov %%ecx, %2\n\t" + "mov %%edx, %3\n\t" + "pop %%ebx\n\t" + : "=g" (out->eax), "=g" (out->ebx), "=g" (out->ecx), + "=g" (out->edx) + : "g" (leaf), "g" (subleaf) + : "%eax", "%ecx", "%edx"); +#endif +#endif /* Linux */ +} + +/* + * Uses CPUID instruction to detected presence of SHA extensions. + * + * Return value: + * 0 - SHA extensions not present + * 1 - SHA extensions present + */ +static int +sha_extensions_supported(void) +{ + struct cpuid_regs r; + + /* Check highest leaf number. If less then 7 then SHA not supported. */ + __mbcpuid(0x0, 0x0, &r); + if (r.eax < 0x7) + return 0; + + /* Check presence of SHA extensions in the extended feature flags */ + __mbcpuid(0x7, 0x0, &r); + if (r.ebx & (1 << 29)) + return 1; + + return 0; +} + +void +init_mb_mgr_sse(MB_MGR *state) +{ + unsigned int j; + uint8_t *p; + + state->features &= (~IMB_FEATURE_SHANI); + if (!(state->flags & IMB_FLAG_SHANI_OFF)) + if (sha_extensions_supported()) + state->features |= IMB_FEATURE_SHANI; + + /* Init AES out-of-order fields */ + state->aes128_ooo.lens[0] = 0; + state->aes128_ooo.lens[1] = 0; + state->aes128_ooo.lens[2] = 0; + state->aes128_ooo.lens[3] = 0; + state->aes128_ooo.lens[4] = 0xFFFF; + state->aes128_ooo.lens[5] = 0xFFFF; + state->aes128_ooo.lens[6] = 0xFFFF; + state->aes128_ooo.lens[7] = 0xFFFF; + state->aes128_ooo.unused_lanes = 0xFF03020100; + state->aes128_ooo.job_in_lane[0] = NULL; + state->aes128_ooo.job_in_lane[1] = NULL; + state->aes128_ooo.job_in_lane[2] = NULL; + state->aes128_ooo.job_in_lane[3] = NULL; + + state->aes192_ooo.lens[0] = 0; + state->aes192_ooo.lens[1] = 0; + state->aes192_ooo.lens[2] = 0; + state->aes192_ooo.lens[3] = 0; + state->aes192_ooo.lens[4] = 0xFFFF; + state->aes192_ooo.lens[5] = 0xFFFF; + state->aes192_ooo.lens[6] = 0xFFFF; + state->aes192_ooo.lens[7] = 0xFFFF; + state->aes192_ooo.unused_lanes = 0xFF03020100; + state->aes192_ooo.job_in_lane[0] = NULL; + state->aes192_ooo.job_in_lane[1] = NULL; + state->aes192_ooo.job_in_lane[2] = NULL; + state->aes192_ooo.job_in_lane[3] = NULL; + + state->aes256_ooo.lens[0] = 0; + state->aes256_ooo.lens[1] = 0; + state->aes256_ooo.lens[2] = 0; + state->aes256_ooo.lens[3] = 0; + state->aes256_ooo.lens[4] = 0xFFFF; + state->aes256_ooo.lens[5] = 0xFFFF; + state->aes256_ooo.lens[6] = 0xFFFF; + state->aes256_ooo.lens[7] = 0xFFFF; + state->aes256_ooo.unused_lanes = 0xFF03020100; + state->aes256_ooo.job_in_lane[0] = NULL; + state->aes256_ooo.job_in_lane[1] = NULL; + state->aes256_ooo.job_in_lane[2] = NULL; + state->aes256_ooo.job_in_lane[3] = NULL; + + /* DOCSIS SEC BPI uses same settings as AES128 CBC */ + state->docsis_sec_ooo.lens[0] = 0; + state->docsis_sec_ooo.lens[1] = 0; + state->docsis_sec_ooo.lens[2] = 0; + state->docsis_sec_ooo.lens[3] = 0; + state->docsis_sec_ooo.lens[4] = 0xFFFF; + state->docsis_sec_ooo.lens[5] = 0xFFFF; + state->docsis_sec_ooo.lens[6] = 0xFFFF; + state->docsis_sec_ooo.lens[7] = 0xFFFF; + state->docsis_sec_ooo.unused_lanes = 0xFF03020100; + state->docsis_sec_ooo.job_in_lane[0] = NULL; + state->docsis_sec_ooo.job_in_lane[1] = NULL; + state->docsis_sec_ooo.job_in_lane[2] = NULL; + state->docsis_sec_ooo.job_in_lane[3] = NULL; + + /* Init HMAC/SHA1 out-of-order fields */ + state->hmac_sha_1_ooo.lens[0] = 0; + state->hmac_sha_1_ooo.lens[1] = 0; + state->hmac_sha_1_ooo.lens[2] = 0; + state->hmac_sha_1_ooo.lens[3] = 0; + state->hmac_sha_1_ooo.lens[4] = 0xFFFF; + state->hmac_sha_1_ooo.lens[5] = 0xFFFF; + state->hmac_sha_1_ooo.lens[6] = 0xFFFF; + state->hmac_sha_1_ooo.lens[7] = 0xFFFF; + state->hmac_sha_1_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < SSE_NUM_SHA1_LANES; j++) { + state->hmac_sha_1_ooo.ldata[j].job_in_lane = NULL; + state->hmac_sha_1_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_sha_1_ooo.ldata[j].extra_block + 65, + 0x00, + 64+7); + p = state->hmac_sha_1_ooo.ldata[j].outer_block; + memset(p + 5*4 + 1, + 0x00, + 64 - 5*4 - 1 - 2); + p[5*4] = 0x80; + p[64-2] = 0x02; + p[64-1] = 0xA0; + } + +#ifdef HASH_USE_SHAEXT + if (state->features & IMB_FEATURE_SHANI) { + /* Init HMAC/SHA1 NI out-of-order fields */ + state->hmac_sha_1_ooo.lens[0] = 0; + state->hmac_sha_1_ooo.lens[1] = 0; + state->hmac_sha_1_ooo.lens[2] = 0xFFFF; + state->hmac_sha_1_ooo.lens[3] = 0xFFFF; + state->hmac_sha_1_ooo.lens[4] = 0xFFFF; + state->hmac_sha_1_ooo.lens[5] = 0xFFFF; + state->hmac_sha_1_ooo.lens[6] = 0xFFFF; + state->hmac_sha_1_ooo.lens[7] = 0xFFFF; + state->hmac_sha_1_ooo.unused_lanes = 0xFF0100; + } +#endif /* HASH_USE_SHAEXT */ + + /* Init HMAC/SHA224 out-of-order fields */ + state->hmac_sha_224_ooo.lens[0] = 0; + state->hmac_sha_224_ooo.lens[1] = 0; + state->hmac_sha_224_ooo.lens[2] = 0; + state->hmac_sha_224_ooo.lens[3] = 0; + state->hmac_sha_224_ooo.lens[4] = 0xFFFF; + state->hmac_sha_224_ooo.lens[5] = 0xFFFF; + state->hmac_sha_224_ooo.lens[6] = 0xFFFF; + state->hmac_sha_224_ooo.lens[7] = 0xFFFF; + state->hmac_sha_224_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < SSE_NUM_SHA256_LANES; j++) { + state->hmac_sha_224_ooo.ldata[j].job_in_lane = NULL; + state->hmac_sha_224_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_sha_224_ooo.ldata[j].extra_block + 65, + 0x00, + 64+7); + p = state->hmac_sha_224_ooo.ldata[j].outer_block; + memset(p + 8*4 + 1, + 0x00, + 64 - 8*4 - 1 - 2); + p[7*4] = 0x80; /* digest 7 words long */ + p[64-2] = 0x02; /* length in little endian = 0x02E0 */ + p[64-1] = 0xE0; + } +#ifdef HASH_USE_SHAEXT + if (state->features & IMB_FEATURE_SHANI) { + /* Init HMAC/SHA224 NI out-of-order fields */ + state->hmac_sha_224_ooo.lens[0] = 0; + state->hmac_sha_224_ooo.lens[1] = 0; + state->hmac_sha_224_ooo.lens[2] = 0xFFFF; + state->hmac_sha_224_ooo.lens[3] = 0xFFFF; + state->hmac_sha_224_ooo.lens[4] = 0xFFFF; + state->hmac_sha_224_ooo.lens[5] = 0xFFFF; + state->hmac_sha_224_ooo.lens[6] = 0xFFFF; + state->hmac_sha_224_ooo.lens[7] = 0xFFFF; + state->hmac_sha_224_ooo.unused_lanes = 0xFF0100; + } +#endif /* HASH_USE_SHAEXT */ + + /* Init HMAC/SHA_256 out-of-order fields */ + state->hmac_sha_256_ooo.lens[0] = 0; + state->hmac_sha_256_ooo.lens[1] = 0; + state->hmac_sha_256_ooo.lens[2] = 0; + state->hmac_sha_256_ooo.lens[3] = 0; + state->hmac_sha_256_ooo.lens[4] = 0xFFFF; + state->hmac_sha_256_ooo.lens[5] = 0xFFFF; + state->hmac_sha_256_ooo.lens[6] = 0xFFFF; + state->hmac_sha_256_ooo.lens[7] = 0xFFFF; + state->hmac_sha_256_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < SSE_NUM_SHA256_LANES; j++) { + state->hmac_sha_256_ooo.ldata[j].job_in_lane = NULL; + state->hmac_sha_256_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_sha_256_ooo.ldata[j].extra_block + 65, + 0x00, + 64+7); + p = state->hmac_sha_256_ooo.ldata[j].outer_block; + memset(p + 8*4 + 1, + 0x00, + 64 - 8*4 - 1 - 2); /* digest is 8*4 bytes long */ + p[8*4] = 0x80; + p[64-2] = 0x03; /* length of (opad (64*8) bits + 256 bits) + * in hex is 0x300 */ + p[64-1] = 0x00; + } +#ifdef HASH_USE_SHAEXT + if (state->features & IMB_FEATURE_SHANI) { + /* Init HMAC/SHA256 NI out-of-order fields */ + state->hmac_sha_256_ooo.lens[0] = 0; + state->hmac_sha_256_ooo.lens[1] = 0; + state->hmac_sha_256_ooo.lens[2] = 0xFFFF; + state->hmac_sha_256_ooo.lens[3] = 0xFFFF; + state->hmac_sha_256_ooo.lens[4] = 0xFFFF; + state->hmac_sha_256_ooo.lens[5] = 0xFFFF; + state->hmac_sha_256_ooo.lens[6] = 0xFFFF; + state->hmac_sha_256_ooo.lens[7] = 0xFFFF; + state->hmac_sha_256_ooo.unused_lanes = 0xFF0100; + } +#endif /* HASH_USE_SHAEXT */ + + /* Init HMAC/SHA384 out-of-order fields */ + state->hmac_sha_384_ooo.lens[0] = 0; + state->hmac_sha_384_ooo.lens[1] = 0; + state->hmac_sha_384_ooo.lens[2] = 0xFFFF; + state->hmac_sha_384_ooo.lens[3] = 0xFFFF; + state->hmac_sha_384_ooo.lens[4] = 0xFFFF; + state->hmac_sha_384_ooo.lens[5] = 0xFFFF; + state->hmac_sha_384_ooo.lens[6] = 0xFFFF; + state->hmac_sha_384_ooo.lens[7] = 0xFFFF; + state->hmac_sha_384_ooo.unused_lanes = 0xFF0100; + for (j = 0; j < SSE_NUM_SHA512_LANES; j++) { + MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_384_ooo; + + ctx->ldata[j].job_in_lane = NULL; + ctx->ldata[j].extra_block[SHA_384_BLOCK_SIZE] = 0x80; + memset(ctx->ldata[j].extra_block + (SHA_384_BLOCK_SIZE + 1), + 0x00, SHA_384_BLOCK_SIZE + 7); + + p = ctx->ldata[j].outer_block; + memset(p + SHA384_DIGEST_SIZE_IN_BYTES + 1, 0x00, + /* special end point because this length is constant */ + SHA_384_BLOCK_SIZE - + SHA384_DIGEST_SIZE_IN_BYTES - 1 - 2); + p[SHA384_DIGEST_SIZE_IN_BYTES] = 0x80; /* mark the end */ + /* + * hmac outer block length always of fixed size, it is OKey + * length, a whole message block length, 1024 bits, with padding + * plus the length of the inner digest, which is 384 bits + * 1408 bits == 0x0580. The input message block needs to be + * converted to big endian within the sha implementation + * before use. + */ + p[SHA_384_BLOCK_SIZE - 2] = 0x05; + p[SHA_384_BLOCK_SIZE - 1] = 0x80; + } + + /* Init HMAC/SHA512 out-of-order fields */ + state->hmac_sha_512_ooo.lens[0] = 0; + state->hmac_sha_512_ooo.lens[1] = 0; + state->hmac_sha_512_ooo.lens[2] = 0xFFFF; + state->hmac_sha_512_ooo.lens[3] = 0xFFFF; + state->hmac_sha_512_ooo.lens[4] = 0xFFFF; + state->hmac_sha_512_ooo.lens[5] = 0xFFFF; + state->hmac_sha_512_ooo.lens[6] = 0xFFFF; + state->hmac_sha_512_ooo.lens[7] = 0xFFFF; + state->hmac_sha_512_ooo.unused_lanes = 0xFF0100; + for (j = 0; j < SSE_NUM_SHA512_LANES; j++) { + MB_MGR_HMAC_SHA_512_OOO *ctx = &state->hmac_sha_512_ooo; + + ctx->ldata[j].job_in_lane = NULL; + ctx->ldata[j].extra_block[SHA_512_BLOCK_SIZE] = 0x80; + memset(ctx->ldata[j].extra_block + (SHA_512_BLOCK_SIZE + 1), + 0x00, SHA_512_BLOCK_SIZE + 7); + + p = ctx->ldata[j].outer_block; + memset(p + SHA512_DIGEST_SIZE_IN_BYTES + 1, 0x00, + /* special end point because this length is constant */ + SHA_512_BLOCK_SIZE - + SHA512_DIGEST_SIZE_IN_BYTES - 1 - 2); + p[SHA512_DIGEST_SIZE_IN_BYTES] = 0x80; /* mark the end */ + /* + * hmac outer block length always of fixed size, it is OKey + * length, a whole message block length, 1024 bits, with padding + * plus the length of the inner digest, which is 512 bits + * 1536 bits == 0x600. The input message block needs to be + * converted to big endian within the sha implementation + * before use. + */ + p[SHA_512_BLOCK_SIZE - 2] = 0x06; + p[SHA_512_BLOCK_SIZE - 1] = 0x00; + } + + /* Init HMAC/MD5 out-of-order fields */ + state->hmac_md5_ooo.lens[0] = 0; + state->hmac_md5_ooo.lens[1] = 0; + state->hmac_md5_ooo.lens[2] = 0; + state->hmac_md5_ooo.lens[3] = 0; + state->hmac_md5_ooo.lens[4] = 0; + state->hmac_md5_ooo.lens[5] = 0; + state->hmac_md5_ooo.lens[6] = 0; + state->hmac_md5_ooo.lens[7] = 0; + state->hmac_md5_ooo.lens[8] = 0xFFFF; + state->hmac_md5_ooo.lens[9] = 0xFFFF; + state->hmac_md5_ooo.lens[10] = 0xFFFF; + state->hmac_md5_ooo.lens[11] = 0xFFFF; + state->hmac_md5_ooo.lens[12] = 0xFFFF; + state->hmac_md5_ooo.lens[13] = 0xFFFF; + state->hmac_md5_ooo.lens[14] = 0xFFFF; + state->hmac_md5_ooo.lens[15] = 0xFFFF; + state->hmac_md5_ooo.unused_lanes = 0xF76543210; + for (j = 0; j < SSE_NUM_MD5_LANES; j++) { + state->hmac_md5_ooo.ldata[j].job_in_lane = NULL; + state->hmac_md5_ooo.ldata[j].extra_block[64] = 0x80; + memset(state->hmac_md5_ooo.ldata[j].extra_block + 65, + 0x00, + 64 + 7); + p = state->hmac_md5_ooo.ldata[j].outer_block; + memset(p + (5 * 4) + 1, + 0x00, + 64 - (5 * 4) - 1 - 2); + p[4*4] = 0x80; + p[64-7] = 0x02; + p[64-8] = 0x80; + } + + /* Init AES/XCBC OOO fields */ + state->aes_xcbc_ooo.lens[0] = 0; + state->aes_xcbc_ooo.lens[1] = 0; + state->aes_xcbc_ooo.lens[2] = 0; + state->aes_xcbc_ooo.lens[3] = 0; + state->aes_xcbc_ooo.lens[4] = 0xFFFF; + state->aes_xcbc_ooo.lens[5] = 0xFFFF; + state->aes_xcbc_ooo.lens[6] = 0xFFFF; + state->aes_xcbc_ooo.lens[7] = 0xFFFF; + state->aes_xcbc_ooo.unused_lanes = 0xFF03020100; + for (j = 0; j < 4; j++) { + state->aes_xcbc_ooo.ldata[j].job_in_lane = NULL; + state->aes_xcbc_ooo.ldata[j].final_block[16] = 0x80; + memset(state->aes_xcbc_ooo.ldata[j].final_block + 17, 0x00, 15); + } + + /* Init AES-CCM auth out-of-order fields */ + for (j = 0; j < 4; j++) { + state->aes_ccm_ooo.init_done[j] = 0; + state->aes_ccm_ooo.lens[j] = 0; + state->aes_ccm_ooo.job_in_lane[j] = NULL; + } + state->aes_ccm_ooo.unused_lanes = 0xF3210; + + /* Init AES-CMAC auth out-of-order fields */ + for (j = 0; j < 4; j++) { + state->aes_cmac_ooo.init_done[j] = 0; + state->aes_cmac_ooo.lens[j] = 0; + state->aes_cmac_ooo.job_in_lane[j] = NULL; + } + state->aes_cmac_ooo.unused_lanes = 0xF3210; + + /* Init "in order" components */ + state->next_job = 0; + state->earliest_job = -1; + + /* set SSE handlers */ + state->get_next_job = get_next_job_sse; + state->submit_job = submit_job_sse; + state->submit_job_nocheck = submit_job_nocheck_sse; + state->get_completed_job = get_completed_job_sse; + state->flush_job = flush_job_sse; + state->queue_size = queue_size_sse; + state->keyexp_128 = aes_keyexp_128_sse; + state->keyexp_192 = aes_keyexp_192_sse; + state->keyexp_256 = aes_keyexp_256_sse; + state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_sse; + state->xcbc_keyexp = aes_xcbc_expand_key_sse; + state->des_key_sched = des_key_schedule; + state->sha1_one_block = sha1_one_block_sse; + state->sha224_one_block = sha224_one_block_sse; + state->sha256_one_block = sha256_one_block_sse; + state->sha384_one_block = sha384_one_block_sse; + state->sha512_one_block = sha512_one_block_sse; + state->md5_one_block = md5_one_block_sse; +} + +#include "mb_mgr_code.h" diff --git a/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm new file mode 100644 index 00000000..ef9edafc --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/md5_x4x2_sse.asm @@ -0,0 +1,777 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; code to compute octal MD5 using SSE + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15 +;; Windows preserves: rcx rbp +;; +;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15 +;; Linux preserves: rdi rbp +;; +;; clobbers xmm0-15 + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +section .data align=64 +default rel + +align 64 +MKGLOBAL(MD5_TABLE,data,internal) +MD5_TABLE: + dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478 + dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756 + dd 0x242070db, 0x242070db, 0x242070db, 0x242070db + dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee + dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf + dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a + dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613 + dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501 + dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8 + dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af + dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1 + dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be + dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122 + dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193 + dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e + dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821 + dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562 + dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340 + dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51 + dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa + dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d + dd 0x02441453, 0x02441453, 0x02441453, 0x02441453 + dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681 + dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8 + dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6 + dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6 + dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87 + dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed + dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905 + dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8 + dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9 + dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a + dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942 + dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681 + dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122 + dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c + dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44 + dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9 + dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60 + dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70 + dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6 + dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa + dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085 + dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05 + dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039 + dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5 + dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8 + dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665 + dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244 + dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97 + dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7 + dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039 + dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3 + dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92 + dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d + dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1 + dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f + dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0 + dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314 + dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1 + dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82 + dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235 + dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb + dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391 + +ONES: + dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff + +section .text + +%ifdef LINUX +;; Linux Registers +%define arg1 rdi +%define arg2 rsi +%define mem1 rcx +%define mem2 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define mem1 rdi +%define mem2 rsi +%endif + +;; rbp is not clobbered + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 +%define inp4 r12 +%define inp5 r13 +%define inp6 r14 +%define inp7 r15 + +%define TBL rax +%define IDX rbx + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 ; tmp +%define F xmm5 ; tmp + +%define A2 xmm6 +%define B2 xmm7 +%define C2 xmm8 +%define D2 xmm9 + + +%define FUN E +%define TMP F +%define FUN2 xmm10 +%define TMP2 xmm11 + +%define T0 xmm10 +%define T1 xmm11 +%define T2 xmm12 +%define T3 xmm13 +%define T4 xmm14 +%define T5 xmm15 + +; Stack Layout +; +; 470 DD2 +; 460 CC2 +; 450 BB2 +; 440 AA2 +; 430 DD +; 420 CC +; 410 BB +; 400 AA +; +; 3F0 data2[15] for lanes 7...4 \ +; ... \ +; 300 data2[0] for lanes 7...4 \ +; 2F0 data2[15] for lanes 3...0 > mem block 2 +; ... / +; 210 data2[1] for lanes 3...0 / +; 200 data2[0] for lanes 3...0 / +; +; 1F0 data1[15] for lanes 7...4 \ +; ... \ +; 100 data1[0] for lanes 7...4 \ +; F0 data1[15] for lanes 3...0 > mem block 1 +; ... / +; 10 data1[1] for lanes 3...0 / +; 0 data1[0] for lanes 3...0 / + +; stack size must be an odd multiple of 8 bytes in size +struc STACK +_DATA: reso 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs +_DIGEST: reso 8 ; stores AA-DD, AA2-DD2 + resb 8 ; for alignment +endstruc +%define STACK_SIZE STACK_size + +%define AA rsp + _DIGEST + 16*0 +%define BB rsp + _DIGEST + 16*1 +%define CC rsp + _DIGEST + 16*2 +%define DD rsp + _DIGEST + 16*3 +%define AA2 rsp + _DIGEST + 16*4 +%define BB2 rsp + _DIGEST + 16*5 +%define CC2 rsp + _DIGEST + 16*6 +%define DD2 rsp + _DIGEST + 16*7 + +;; +;; MD5 left rotations (number of bits) +;; +rot11 equ 7 +rot12 equ 12 +rot13 equ 17 +rot14 equ 22 +rot21 equ 5 +rot22 equ 9 +rot23 equ 14 +rot24 equ 20 +rot31 equ 4 +rot32 equ 11 +rot33 equ 16 +rot34 equ 23 +rot41 equ 6 +rot42 equ 10 +rot43 equ 15 +rot44 equ 21 + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movdqa %%t0, %%r0 + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movdqa %%t1, %%r2 + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movdqa %%r1, %%t0 + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movdqa %%r3, %%r0 + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + +;; +;; Magic functions defined in RFC 1321 +;; +; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z)))) +%macro MAGIC_F 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor %%F,%%Y + pand %%F,%%X + pxor %%F,%%Z +%endmacro + +; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y)) +%macro MAGIC_G 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + MAGIC_F %%F,%%Z,%%X,%%Y +%endmacro + +; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z)) +%macro MAGIC_H 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor %%F,%%Y + pxor %%F,%%X +%endmacro + +; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z))) +%macro MAGIC_I 4 +%define %%F %1 +%define %%X %2 +%define %%Y %3 +%define %%Z %4 + movdqa %%F,%%Z + pxor %%F,[rel ONES] ; pnot %%F + por %%F,%%X + pxor %%F,%%Y +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psrld %%tmp, (32-%%imm) + pslld %%reg, %%imm + por %%reg, %%tmp +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot +%macro MD5_STEP1 14 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%data %12 +%define %%MD5const %13 +%define %%nrot %14 + + paddd %%A, %%MD5const + paddd %%A2, %%MD5const + paddd %%A, [%%data] + paddd %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + paddd %%A, %%FUN + %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2 + paddd %%A2, %%FUN + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP + paddd %%A, %%B + paddd %%A2, %%B2 +%endmacro + +;; +;; single MD5 step +;; +;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot) +;; +; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data, +; MD5const, nrot +%macro MD5_STEP 16 +%define %%MAGIC_FUN %1 +%define %%A %2 +%define %%B %3 +%define %%C %4 +%define %%D %5 +%define %%A2 %6 +%define %%B2 %7 +%define %%C2 %8 +%define %%D2 %9 +%define %%FUN %10 +%define %%TMP %11 +%define %%FUN2 %12 +%define %%TMP2 %13 +%define %%data %14 +%define %%MD5const %15 +%define %%nrot %16 + + paddd %%A, %%MD5const + paddd %%A2, %%MD5const + paddd %%A, [%%data] + paddd %%A2, [%%data + 16*16] + %%MAGIC_FUN %%FUN, %%B,%%C,%%D + %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2 + paddd %%A, %%FUN + paddd %%A2, %%FUN2 + PROLD %%A,%%nrot, %%TMP + PROLD %%A2,%%nrot, %%TMP2 + paddd %%A, %%B + paddd %%A2, %%B2 +%endmacro + +; void md5_x4x2_sse(MD5_ARGS *args, UINT64 num_blks) +; arg 1 : pointer to MD5_ARGS structure +; arg 2 : number of blocks (>=1) +; +align 32 +MKGLOBAL(md5_x4x2_sse,function,internal) +md5_x4x2_sse: + + sub rsp, STACK_SIZE + + ;; each row of transposed digests is split into 2 parts, the right half stored in A, and left half in A2 + ;; Initialize digests + movdqa A,[arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE] + movdqa B,[arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE] + movdqa C,[arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE] + movdqa D,[arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE] + + ;; Initialize digests + movdqa A2,[arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE] + movdqa B2,[arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE] + movdqa C2,[arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE] + movdqa D2,[arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE] + + lea TBL, [rel MD5_TABLE] + + ;; load input pointers + mov inp0,[arg1+_data_ptr_md5 +0*PTR_SZ] + mov inp1,[arg1+_data_ptr_md5 +1*PTR_SZ] + mov inp2,[arg1+_data_ptr_md5 +2*PTR_SZ] + mov inp3,[arg1+_data_ptr_md5 +3*PTR_SZ] + mov inp4,[arg1+_data_ptr_md5 +4*PTR_SZ] + mov inp5,[arg1+_data_ptr_md5 +5*PTR_SZ] + mov inp6,[arg1+_data_ptr_md5 +6*PTR_SZ] + mov inp7,[arg1+_data_ptr_md5 +7*PTR_SZ] + xor IDX, IDX + + ; Make ping-pong pointers to the two memory blocks + mov mem1, rsp + lea mem2, [rsp + 16*16*2] + + +;; Load first block of data and save back to stack +%assign I 0 +%rep 4 + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem1+(I*4+0)*16],T0 + movdqa [mem1+(I*4+1)*16],T1 + movdqa [mem1+(I*4+2)*16],T2 + movdqa [mem1+(I*4+3)*16],T3 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem1+(I*4+0)*16 + 16*16],T0 + movdqa [mem1+(I*4+1)*16 + 16*16],T1 + movdqa [mem1+(I*4+2)*16 + 16*16],T2 + movdqa [mem1+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) +%endrep + +lloop: + ; save old digests + movdqa [AA], A + movdqa [BB], B + movdqa [CC], C + movdqa [DD], D + ; save old digests + movdqa [AA2], A2 + movdqa [BB2], B2 + movdqa [CC2], C2 + movdqa [DD2], D2 + + add IDX, 4*16 + sub arg2, 1 + je lastblock + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14 + +%assign I 0 + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14 + + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44 + + movdqu T2,[inp0+IDX+I*16] + movdqu T1,[inp1+IDX+I*16] + movdqu T4,[inp2+IDX+I*16] + movdqu T3,[inp3+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16],T0 + movdqa [mem2+(I*4+1)*16],T1 + movdqa [mem2+(I*4+2)*16],T2 + movdqa [mem2+(I*4+3)*16],T3 + + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44 + + movdqu T2,[inp4+IDX+I*16] + movdqu T1,[inp5+IDX+I*16] + movdqu T4,[inp6+IDX+I*16] + movdqu T3,[inp7+IDX+I*16] + TRANSPOSE T2, T1, T4, T3, T0, T5 + movdqa [mem2+(I*4+0)*16 + 16*16],T0 + movdqa [mem2+(I*4+1)*16 + 16*16],T1 + movdqa [mem2+(I*4+2)*16 + 16*16],T2 + movdqa [mem2+(I*4+3)*16 + 16*16],T3 +%assign I (I+1) + + + paddd A,[AA] + paddd B,[BB] + paddd C,[CC] + paddd D,[DD] + + paddd A2,[AA2] + paddd B2,[BB2] + paddd C2,[CC2] + paddd D2,[DD2] + + ; swap mem1 and mem2 + xchg mem1, mem2 + + jmp lloop + +lastblock: + + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14 + MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11 + MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12 + MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13 + MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14 + + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24 + MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21 + MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22 + MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23 + MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24 + + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34 + MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31 + MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32 + MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33 + MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34 + + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44 + MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41 + MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42 + MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43 + MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44 + + paddd A,[AA] + paddd B,[BB] + paddd C,[CC] + paddd D,[DD] + + paddd A2,[AA2] + paddd B2,[BB2] + paddd C2,[CC2] + paddd D2,[DD2] + + ; write out digests + movdqu [arg1 + 0*16 + 0*MD5_DIGEST_ROW_SIZE], A + movdqu [arg1 + 0*16 + 1*MD5_DIGEST_ROW_SIZE], B + movdqu [arg1 + 0*16 + 2*MD5_DIGEST_ROW_SIZE], C + movdqu [arg1 + 0*16 + 3*MD5_DIGEST_ROW_SIZE], D + movdqu [arg1 + 1*16 + 0*MD5_DIGEST_ROW_SIZE], A2 + movdqu [arg1 + 1*16 + 1*MD5_DIGEST_ROW_SIZE], B2 + movdqu [arg1 + 1*16 + 2*MD5_DIGEST_ROW_SIZE], C2 + movdqu [arg1 + 1*16 + 3*MD5_DIGEST_ROW_SIZE], D2 + + ;; update input pointers + add inp0, IDX + add inp1, IDX + add inp2, IDX + add inp3, IDX + add inp4, IDX + add inp5, IDX + add inp6, IDX + add inp7, IDX + mov [arg1 +_data_ptr_md5 + 0*PTR_SZ], inp0 + mov [arg1 +_data_ptr_md5 + 1*PTR_SZ], inp1 + mov [arg1 +_data_ptr_md5 + 2*PTR_SZ], inp2 + mov [arg1 +_data_ptr_md5 + 3*PTR_SZ], inp3 + mov [arg1 +_data_ptr_md5 + 4*PTR_SZ], inp4 + mov [arg1 +_data_ptr_md5 + 5*PTR_SZ], inp5 + mov [arg1 +_data_ptr_md5 + 6*PTR_SZ], inp6 + mov [arg1 +_data_ptr_md5 + 7*PTR_SZ], inp7 + + ;;;;;;;;;;;;;;;; + ;; Postamble + add rsp, STACK_SIZE + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm new file mode 100644 index 00000000..ad36a999 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha1_mult_sse.asm @@ -0,0 +1,425 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "os.asm" + +;%define DO_DBGPRINT +%include "dbgprint.asm" + +%include "mb_mgr_datastruct.asm" + +section .data +default rel +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +section .text + +;; code to compute quad SHA1 using SSE +;; derived from ...\sha1_multiple\sha1_quad4.asm +;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact +;; rbx, rsi, rdi, rbp, r12-r15 left intact +;; This version is not safe to call from C/C++ + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0} + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0} + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0} + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2} + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro +;; +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regC + pxor %%regF,%%regD + pand %%regF,%%regB + pxor %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regD + pxor %%regF,%%regC + pxor %%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + movdqa %%regF,%%regB + movdqa %%regT,%%regB + por %%regF,%%regC + pand %%regT,%%regC + pand %%regF,%%regD + por %%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +; PROLD reg, imm, tmp +%macro PROLD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + pslld %%reg, %%imm + psrld %%tmp, (32-%%imm) + por %%reg, %%tmp +%endmacro + +%macro SHA1_STEP_00_15 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + paddd %%regE,%%immCNT + paddd %%regE,[rsp + (%%memW * 16)] + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +%macro SHA1_STEP_16_79 10 +%define %%regA %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regE %5 +%define %%regT %6 +%define %%regF %7 +%define %%memW %8 +%define %%immCNT %9 +%define %%MAGIC %10 + paddd %%regE,%%immCNT + movdqa W14, [rsp + ((%%memW - 14) & 15) * 16] + pxor W16, W14 + pxor W16, [rsp + ((%%memW - 8) & 15) * 16] + pxor W16, [rsp + ((%%memW - 3) & 15) * 16] + movdqa %%regF, W16 + pslld W16, 1 + psrld %%regF, (32-1) + por %%regF, W16 + ROTATE_W + + movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF + paddd %%regE,%%regF + movdqa %%regT,%%regA + PROLD %%regT,5, %%regF + paddd %%regE,%%regT + %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D) + PROLD %%regB,30, %%regT + paddd %%regE,%%regF +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; FRAMESZ must be an odd multiple of 8 +%define FRAMESZ 16*16 + 8 + +%define MOVPS movdqu + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define IDX rax + +%define A xmm0 +%define B xmm1 +%define C xmm2 +%define D xmm3 +%define E xmm4 +%define F xmm5 ; tmp +%define G xmm6 ; tmp + +%define TMP G +%define FUN F +%define K xmm7 + +%define AA xmm8 +%define BB xmm9 +%define CC xmm10 +%define DD xmm11 +%define EE xmm12 + +%define T0 xmm6 +%define T1 xmm7 +%define T2 xmm8 +%define T3 xmm9 +%define T4 xmm10 +%define T5 xmm11 + +%define W14 xmm13 +%define W15 xmm14 +%define W16 xmm15 + +%macro ROTATE_ARGS 0 +%xdefine TMP_ E +%xdefine E D +%xdefine D C +%xdefine C B +%xdefine B A +%xdefine A TMP_ +%endm + +%macro ROTATE_W 0 +%xdefine TMP_ W16 +%xdefine W16 W15 +%xdefine W15 W14 +%xdefine W14 TMP_ +%endm + +align 32 + +; XMM registers are clobbered. Saving/restoring must be done at a higher level + +; void sha1_mult_sse(SHA1_ARGS *args, UINT32 size_in_blocks); +; arg 1 : rcx : pointer to args +; arg 2 : rdx : size (in blocks) ;; assumed to be >= 1 +MKGLOBAL(sha1_mult_sse,function,internal) +sha1_mult_sse: + + sub rsp, FRAMESZ + + ;; Initialize digests + movdqa A, [arg1 + 0*SHA1_DIGEST_ROW_SIZE] + movdqa B, [arg1 + 1*SHA1_DIGEST_ROW_SIZE] + movdqa C, [arg1 + 2*SHA1_DIGEST_ROW_SIZE] + movdqa D, [arg1 + 3*SHA1_DIGEST_ROW_SIZE] + movdqa E, [arg1 + 4*SHA1_DIGEST_ROW_SIZE] + DBGPRINTL_XMM "Sha1-SSE Incoming transposed digest", A, B, C, D, E + ;; load input pointers + mov inp0,[arg1 + _data_ptr_sha1 + 0*PTR_SZ] + mov inp1,[arg1 + _data_ptr_sha1 + 1*PTR_SZ] + mov inp2,[arg1 + _data_ptr_sha1 + 2*PTR_SZ] + mov inp3,[arg1 + _data_ptr_sha1 + 3*PTR_SZ] + DBGPRINTL64 "Sha1-SSE Incoming data ptrs", inp0, inp1, inp2, inp3 + xor IDX, IDX +lloop: + movdqa F, [rel PSHUFFLE_BYTE_FLIP_MASK] +%assign I 0 +%rep 4 + MOVPS T2,[inp0+IDX] + MOVPS T1,[inp1+IDX] + MOVPS T4,[inp2+IDX] + MOVPS T3,[inp3+IDX] + TRANSPOSE T2, T1, T4, T3, T0, T5 + DBGPRINTL_XMM "sha1 incoming data", T0, T1, T2, T3 + pshufb T0, F + movdqa [rsp+(I*4+0)*16],T0 + pshufb T1, F + movdqa [rsp+(I*4+1)*16],T1 + pshufb T2, F + movdqa [rsp+(I*4+2)*16],T2 + pshufb T3, F + movdqa [rsp+(I*4+3)*16],T3 + add IDX, 4*4 +%assign I (I+1) +%endrep + + ; save old digests + movdqa AA, A + movdqa BB, B + movdqa CC, C + movdqa DD, D + movdqa EE, E + +;; +;; perform 0-79 steps +;; + movdqa K, [rel K00_19] +;; do rounds 0...15 +%assign I 0 +%rep 16 + SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 16...19 + movdqa W16, [rsp + ((16 - 16) & 15) * 16] + movdqa W15, [rsp + ((16 - 15) & 15) * 16] +%rep 4 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 20...39 + movdqa K, [rel K20_39] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 40...59 + movdqa K, [rel K40_59] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2 + ROTATE_ARGS +%assign I (I+1) +%endrep + +;; do rounds 60...79 + movdqa K, [rel K60_79] +%rep 20 + SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3 + ROTATE_ARGS +%assign I (I+1) +%endrep + + paddd A,AA + paddd B,BB + paddd C,CC + paddd D,DD + paddd E,EE + + sub arg2, 1 + jne lloop + + ; write out digests + movdqa [arg1 + 0*SHA1_DIGEST_ROW_SIZE], A + movdqa [arg1 + 1*SHA1_DIGEST_ROW_SIZE], B + movdqa [arg1 + 2*SHA1_DIGEST_ROW_SIZE], C + movdqa [arg1 + 3*SHA1_DIGEST_ROW_SIZE], D + movdqa [arg1 + 4*SHA1_DIGEST_ROW_SIZE], E + DBGPRINTL_XMM "Sha1 Outgoing transposed digest", A, B, C, D, E + ; update input pointers + add inp0, IDX + mov [arg1 + _data_ptr_sha1 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [arg1 + _data_ptr_sha1 + 1*PTR_SZ], inp1 + add inp2, IDX + mov [arg1 + _data_ptr_sha1 + 2*PTR_SZ], inp2 + add inp3, IDX + mov [arg1 + _data_ptr_sha1 + 3*PTR_SZ], inp3 + DBGPRINTL64 "Sha1-sse outgoing data ptrs", inp0, inp1, inp2, inp3 + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, FRAMESZ + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm new file mode 100644 index 00000000..e2b0fe85 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha1_ni_x2_sse.asm @@ -0,0 +1,482 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; Stack must be aligned to 32 bytes before call +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RDX R10 R11 +;; Windows preserves: RAX RBX RCX RBP RSI RDI R8 R9 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RDI R10 R11 +;; Linux preserves: RAX RBX RCX RDX RBP RSI R8 R9 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 - xmm15 + +%include "os.asm" +;%define DO_DBGPRINT +%include "dbgprint.asm" + +%include "mb_mgr_datastruct.asm" + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rcx +%define arg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 rdi +%define arg4 rsi +%endif + +%define args arg1 +%define NUM_BLKS arg2 + +; reso = resdq => 16 bytes +struc frame +.ABCD_SAVE reso 1 +.E_SAVE reso 1 +.ABCD_SAVEb reso 1 +.E_SAVEb reso 1 +.align resq 1 +endstruc + +%define INP r10 +%define INPb r11 + +%define ABCD xmm0 +%define E0 xmm1 ; Need two E's b/c they ping pong +%define E1 xmm2 +%define MSG0 xmm3 +%define MSG1 xmm4 +%define MSG2 xmm5 +%define MSG3 xmm6 + +%define ABCDb xmm7 +%define E0b xmm8 ; Need two E's b/c they ping pong +%define E1b xmm9 +%define MSG0b xmm10 +%define MSG1b xmm11 +%define MSG2b xmm12 +%define MSG3b xmm13 + +%define SHUF_MASK xmm14 +%define E_MASK xmm15 + +section .data +default rel +align 64 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x000102030405060708090a0b0c0d0e0f + dq 0x08090a0b0c0d0e0f, 0x0001020304050607 +UPPER_WORD_MASK: ;ddq 0xFFFFFFFF000000000000000000000000 + dq 0x0000000000000000, 0xFFFFFFFF00000000 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha1_ni(SHA1_ARGS *args, UINT32 size_in_blocks) +;; arg1 : pointer to args +;; arg2 : size (in blocks) ;; assumed to be >= 1 + +section .text +MKGLOBAL(sha1_ni,function,internal) +align 32 +sha1_ni: + sub rsp, frame_size + + DBGPRINTL "enter sha1-ni-x2" + + shl NUM_BLKS, 6 ; convert to bytes + jz done_hash + + ;; load input pointers + mov INP, [args + _data_ptr_sha1 + 0*PTR_SZ] + DBGPRINTL64 "jobA: pointer", INP + mov INPb, [args + _data_ptr_sha1 + 1*PTR_SZ] + + add NUM_BLKS, INP ; pointer to end of data block -> loop exit condition + + ;; load initial digest + movdqu ABCD, [args + 0*SHA1NI_DIGEST_ROW_SIZE] + pxor E0, E0 + pinsrd E0, [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3 + pshufd ABCD, ABCD, 0x1B + + DBGPRINTL_XMM "jobA: digest in words[0-3]", ABCD + DBGPRINTL_XMM "jobA: digest in word 4", E0 + + movdqu ABCDb, [args + 1*SHA1NI_DIGEST_ROW_SIZE] + pxor E0b, E0b + pinsrd E0b, [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], 3 + pshufd ABCDb, ABCDb, 0x1B + + movdqa SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + movdqa E_MASK, [rel UPPER_WORD_MASK] + + DBGPRINTL "jobA data:" +loop0: + ;; Copy digests + movdqa [rsp + frame.ABCD_SAVE], ABCD + movdqa [rsp + frame.E_SAVE], E0 + movdqa [rsp + frame.ABCD_SAVEb], ABCDb + movdqa [rsp + frame.E_SAVEb], E0b + + ;; Only needed if not using sha1nexte for rounds 0-3 + pand E0, E_MASK + pand E0b, E_MASK + + ;; Needed if using sha1nexte for rounds 0-3 + ;; Need to rotate E right by 30 + ;movdqa E1, E0 + ;psrld E0, 30 + ;pslld E1, 2 + ;pxor E0, E1 + + ;; Rounds 0-3 + movdqu MSG0, [INP + 0*16] + pshufb MSG0, SHUF_MASK + DBGPRINT_XMM MSG0 + ;sha1nexte E0, MSG0 + paddd E0, MSG0 ; instead of sha1nexte + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + movdqu MSG0b, [INPb + 0*16] + pshufb MSG0b, SHUF_MASK + ;sha1nexte E0b, MSG0b + paddd E0b, MSG0b ; instead of sha1nexte + movdqa E1b, ABCDb + sha1rnds4 ABCDb, E0b, 0 + + ;; Rounds 4-7 + movdqu MSG1, [INP + 1*16] + pshufb MSG1, SHUF_MASK + DBGPRINT_XMM MSG1 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG0, MSG1 + movdqu MSG1b, [INPb + 1*16] + pshufb MSG1b, SHUF_MASK + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1rnds4 ABCDb, E1b, 0 + sha1msg1 MSG0b, MSG1b + + ;; Rounds 8-11 + movdqu MSG2, [INP + 2*16] + pshufb MSG2, SHUF_MASK + DBGPRINT_XMM MSG2 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + movdqu MSG2b, [INPb + 2*16] + pshufb MSG2b, SHUF_MASK + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1rnds4 ABCDb, E0b, 0 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 12-15 + movdqu MSG3, [INP + 3*16] + pshufb MSG3, SHUF_MASK + DBGPRINT_XMM MSG3 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 0 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + movdqu MSG3b, [INPb + 3*16] + pshufb MSG3b, SHUF_MASK + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 0 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + + ;; Rounds 16-19 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 0 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 0 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 20-23 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ;; Rounds 24-27 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 1 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 28-31 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ;; Rounds 32-35 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 1 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 1 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 36-39 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 1 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 1 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ;; Rounds 40-43 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 44-47 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 2 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ;; Rounds 48-51 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 52-55 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 2 + sha1msg1 MSG0, MSG1 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 2 + sha1msg1 MSG0b, MSG1b + pxor MSG3b, MSG1b + + ;; Rounds 56-59 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 2 + sha1msg1 MSG1, MSG2 + pxor MSG0, MSG2 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 2 + sha1msg1 MSG1b, MSG2b + pxor MSG0b, MSG2b + + ;; Rounds 60-63 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1msg2 MSG0, MSG3 + sha1rnds4 ABCD, E1, 3 + sha1msg1 MSG2, MSG3 + pxor MSG1, MSG3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1msg2 MSG0b, MSG3b + sha1rnds4 ABCDb, E1b, 3 + sha1msg1 MSG2b, MSG3b + pxor MSG1b, MSG3b + + ;; Rounds 64-67 + sha1nexte E0, MSG0 + movdqa E1, ABCD + sha1msg2 MSG1, MSG0 + sha1rnds4 ABCD, E0, 3 + sha1msg1 MSG3, MSG0 + pxor MSG2, MSG0 + sha1nexte E0b, MSG0b + movdqa E1b, ABCDb + sha1msg2 MSG1b, MSG0b + sha1rnds4 ABCDb, E0b, 3 + sha1msg1 MSG3b, MSG0b + pxor MSG2b, MSG0b + + ;; Rounds 68-71 + sha1nexte E1, MSG1 + movdqa E0, ABCD + sha1msg2 MSG2, MSG1 + sha1rnds4 ABCD, E1, 3 + pxor MSG3, MSG1 + sha1nexte E1b, MSG1b + movdqa E0b, ABCDb + sha1msg2 MSG2b, MSG1b + sha1rnds4 ABCDb, E1b, 3 + pxor MSG3b, MSG1b + + ;; Rounds 72-75 + sha1nexte E0, MSG2 + movdqa E1, ABCD + sha1msg2 MSG3, MSG2 + sha1rnds4 ABCD, E0, 3 + sha1nexte E0b, MSG2b + movdqa E1b, ABCDb + sha1msg2 MSG3b, MSG2b + sha1rnds4 ABCDb, E0b, 3 + + ;; Rounds 76-79 + sha1nexte E1, MSG3 + movdqa E0, ABCD + sha1rnds4 ABCD, E1, 3 + sha1nexte E1b, MSG3b + movdqa E0b, ABCDb + sha1rnds4 ABCDb, E1b, 3 + + ;; Need to rotate E left by 30 + movdqa E1, E0 + pslld E0, 30 + psrld E1, 2 + pxor E0, E1 + movdqa E1b, E0b + pslld E0b, 30 + psrld E1b, 2 + pxor E0b, E1b + + paddd ABCD, [rsp + frame.ABCD_SAVE] + paddd E0, [rsp + frame.E_SAVE] + paddd ABCDb, [rsp + frame.ABCD_SAVEb] + paddd E0b, [rsp + frame.E_SAVEb] + + add INP, 64 + add INPb, 64 + cmp INP, NUM_BLKS + jne loop0 + + ;; write out digests + pshufd ABCD, ABCD, 0x1B + movdqu [args + 0*SHA1NI_DIGEST_ROW_SIZE], ABCD + pextrd [args + 0*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0, 3 + DBGPRINTL_XMM "jobA: digest out words[0-3]", ABCD + DBGPRINTL_XMM "jobA: digest out word 4", E0 + + pshufd ABCDb, ABCDb, 0x1B + movdqu [args + 1*SHA1NI_DIGEST_ROW_SIZE], ABCDb + pextrd [args + 1*SHA1NI_DIGEST_ROW_SIZE + 4*SHA1_DIGEST_WORD_SIZE], E0b, 3 + + ;; update input pointers + mov [args + _data_ptr_sha1 + 0*PTR_SZ], INP + mov [args + _data_ptr_sha1 + 1*PTR_SZ], INPb + +done_hash: + add rsp, frame_size + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm new file mode 100644 index 00000000..aaa5a58f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha1_one_block_sse.asm @@ -0,0 +1,507 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; SHA1 code, hybrid, rolled, interleaved +; Uses SSE instructions +%include "os.asm" + +section .data +default rel +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b +K00_19: ;ddq 0x5A8279995A8279995A8279995A827999 + dq 0x5A8279995A827999, 0x5A8279995A827999 +K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1 + dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1 +K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC + dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC +K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6 + dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6 + +section .text + +%define MOVDQ movdqu ;; assume buffers not aligned + +%ifdef LINUX +%define INP rdi ; 1st arg +%define CTX rsi ; 2nd arg +%define REG3 ecx +%define REG4 edx +%else +%define INP rcx ; 1st arg +%define CTX rdx ; 2nd arg +%define REG3 edi +%define REG4 esi +%endif + +%define FRAMESZ 3*16 + 1*8 +%define _RSP FRAMESZ-1*8 + rsp + +%define a eax +%define b ebx +%define c REG3 +%define d REG4 +%define e r8d +%define T1 r9d +%define f r10d +%define RND r11d +%define g r12d +%define h r13d + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XK xmm2 + +%xdefine X0 xmm3 +%xdefine X1 xmm4 +%xdefine X2 xmm5 +%xdefine X3 xmm6 +%xdefine X4 xmm7 + +%define XFER xmm8 + +%define SZ 4 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X4 +%xdefine X4 X_ +%endmacro + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + + +;; Magic functions defined in FIPS 180-1 +;; +; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D))) +%macro MAGIC_F0 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regC + xor %%regF,%%regD + and %%regF,%%regB + xor %%regF,%%regD +%endmacro + +; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F1 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regD + xor %%regF,%%regC + xor %%regF,%%regB +%endmacro + +; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D)) +%macro MAGIC_F2 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + mov %%regF,%%regB + mov %%regT,%%regB + or %%regF,%%regC + and %%regT,%%regC + and %%regF,%%regD + or %%regF,%%regT +%endmacro + +; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D) +%macro MAGIC_F3 5 +%define %%regF %1 +%define %%regB %2 +%define %%regC %3 +%define %%regD %4 +%define %%regT %5 + MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT +%endmacro + +;; input is T1 +%macro ROUND 1 +%define %%MAGIC %1 + add e,T1 + mov T1,a + rol T1,5 + add e,T1 + %%MAGIC h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + rol b,30 + add h,e +ROTATE_ARGS +%endmacro + +%macro do_4i 1 + movdqa XFER, XK + paddd XFER, X0 + pextrd T1, XFER, 0 + ;ROUND %1 + add e,T1 + ;SCHEDULE_4 + movdqa XTMP0, X1 + palignr XTMP0, X0, 8 ; XTMP0 = W[-14] + mov T1,a + movdqa XTMP1, X2 + rol T1,5 + pxor XTMP1, X0 ; XTMP1 = W[-8] ^ W[-16] + add e,T1 + pxor XTMP0, XTMP1 ; XTMP0 = W[-8] ^ W[-14] ^ W[-16] + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + + ;; Finish low half + movdqa X4, X3 + rol b,30 + psrldq X4, 4 ; X4 = W[-3] {xxBA} + add h,e +ROTATE_ARGS + pextrd T1, XFER, 1 + ;ROUND %1 + add e,T1 + pxor X4, XTMP0 ; + mov T1,a + movdqa XTMP1, X4 + rol T1,5 + ;; rotate X4 left 1 + psrld XTMP1, (32-1) + add e,T1 + pslld X4, 1 + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + pxor X4, XTMP1 ; X4 = W[0] {xxBA} + rol b,30 + add h,e +ROTATE_ARGS + pextrd T1, XFER, 2 + ;ROUND %1 + add e,T1 + movdqa XTMP1, X4 + mov T1,a + + ;; Finish high half + palignr XTMP1, X3, 4 ; XTMP1 = w[-3] {DCxx} + rol T1,5 + add e,T1 + pxor XTMP0, XTMP1 + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + ;; rotate XTMP0 left 1 + movdqa XTMP1, XTMP0 + psrld XTMP1, (32-1) + rol b,30 + add h,e +ROTATE_ARGS + pextrd T1, XFER, 3 + ;ROUND %1 + add e,T1 + mov T1,a + pslld XTMP0, 1 + rol T1,5 + add e,T1 + pxor XTMP0, XTMP1 ; XTMP0 = W[0] {DCxx} + %1 h,b,c,d,T1 ;; FUN = MAGIC_Fi(B,C,D) + ;; COMBINE HALVES + shufps X4, XTMP0, 11100100b ; X4 = X[0] {DCBA} + rol b,30 + add h,e + + rotate_Xs +ROTATE_ARGS +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha1_one_block_sse(void *input_data, UINT32 digest[8] +;; arg 1 : rcx : pointer to input data +;; arg 2 : rdx : pointer to digest +MKGLOBAL(sha1_one_block_sse,function,) +align 32 +sha1_one_block_sse: + push rbx + push rsi + push rdi + push r12 + push r13 + + ;; byte swap first 16 dwords + movdqa XTMP0, [rel PSHUFFLE_BYTE_FLIP_MASK] + mov rax,rsp ; copy rsp + MOVDQ X0, [INP + 0*16] + sub rsp,FRAMESZ + MOVDQ X1, [INP + 1*16] + and rsp,-64 ; align stack frame + + movdqa [rsp + 0 * 16], xmm6 + movdqa [rsp + 1 * 16], xmm7 + movdqa [rsp + 2 * 16], xmm8 + + MOVDQ X2, [INP + 2*16] + mov [_RSP],rax ; save copy of rsp + MOVDQ X3, [INP + 3*16] + ;; load initial digest + mov a,0x67452301 + pshufb X0, XTMP0 + mov b,0xefcdab89 + pshufb X1, XTMP0 + mov c,0x98badcfe + pshufb X2, XTMP0 + mov d,0x10325476 + pshufb X3, XTMP0 + mov e,0xc3d2e1f0 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 00-19 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa XK, [rel K00_19] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop1_5 +align 16 +loop1: + + do_4i MAGIC_F0 + +loop1_5: + do_4i MAGIC_F0 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + movdqa X0, X2 + movdqa X2, X4 + movdqa X4, X1 + movdqa X1, X3 + + sub RND, 1 + jne loop1 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 00-19 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 20-39 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa XK, [rel K20_39] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop2_5 +align 16 +loop2: + + do_4i MAGIC_F1 + +loop2_5: + do_4i MAGIC_F1 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + movdqa X0, X2 + movdqa X2, X4 + movdqa X4, X1 + movdqa X1, X3 + + sub RND, 1 + jne loop2 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 20-39 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; do rounds 40-59 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + movdqa XK, [rel K40_59] + mov RND, 3 + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + ROTATE_ARGS + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + jmp loop3_5 +align 16 +loop3: + + do_4i MAGIC_F2 + +loop3_5: + do_4i MAGIC_F2 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + movdqa X0, X2 + movdqa X2, X4 + movdqa X4, X1 + movdqa X1, X3 + + sub RND, 1 + jne loop3 + + rotate_Xs + rotate_Xs + rotate_Xs + rotate_Xs + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; end rounds 40-59 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + + + + ;; do rounds 60-79 + movdqa XK, [rel K60_79] + + do_4i MAGIC_F3 + + movdqa XFER, XK + paddd XFER, X0 + pextrd T1, XFER, 0 + ROUND MAGIC_F3 + pextrd T1, XFER, 1 + ROUND MAGIC_F3 + pextrd T1, XFER, 2 + ROUND MAGIC_F3 + pextrd T1, XFER, 3 + ROUND MAGIC_F3 + + movdqa XFER, XK + paddd XFER, X1 + pextrd T1, XFER, 0 + ROUND MAGIC_F3 + pextrd T1, XFER, 1 + ROUND MAGIC_F3 + pextrd T1, XFER, 2 + ROUND MAGIC_F3 + pextrd T1, XFER, 3 + ROUND MAGIC_F3 + + movdqa XFER, XK + paddd XFER, X2 + pextrd T1, XFER, 0 + ROUND MAGIC_F3 + pextrd T1, XFER, 1 + ROUND MAGIC_F3 + pextrd T1, XFER, 2 + ROUND MAGIC_F3 + pextrd T1, XFER, 3 + ROUND MAGIC_F3 + + movdqa XFER, XK + paddd XFER, X3 + pextrd T1, XFER, 0 + ROUND MAGIC_F3 + pextrd T1, XFER, 1 + ROUND MAGIC_F3 + pextrd T1, XFER, 2 + ROUND MAGIC_F3 + pextrd T1, XFER, 3 + ROUND MAGIC_F3 + + add a,0x67452301 + mov [SZ*0 + CTX], a + add b,0xefcdab89 + mov [SZ*1 + CTX], b + add c,0x98badcfe + mov [SZ*2 + CTX], c + add d,0x10325476 + mov [SZ*3 + CTX], d + add e,0xc3d2e1f0 + mov [SZ*4 + CTX], e + + movdqa xmm8, [rsp + 2 * 16] + movdqa xmm7, [rsp + 1 * 16] + movdqa xmm6, [rsp + 0 * 16] + + mov rsp,[_RSP] + pop r13 + pop r12 + pop rdi + pop rsi + pop rbx + + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm new file mode 100644 index 00000000..26c887a9 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha224_one_block_sse.asm @@ -0,0 +1,41 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define H0 0xc1059ed8 +%define H1 0x367cd507 +%define H2 0x3070dd17 +%define H3 0xf70e5939 +%define H4 0xffc00b31 +%define H5 0x68581511 +%define H6 0x64f98fa7 +%define H7 0xbefa4fa4 +%define FUNC sha224_one_block_sse + +%include "sha256_one_block_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm new file mode 100644 index 00000000..d2eab4f1 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha256_ni_x2_sse.asm @@ -0,0 +1,604 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; Stack must be aligned to 32 bytes before call +;; +;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Windows clobbers: RCX RDX RSI RDI R11 +;; Windows preserves: RAX RBX RBP R8 R9 R10 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; Linux clobbers: RCX RDX RSI RDI R11 +;; Linux preserves: RAX RBX RBP R8 R9 R10 R12 R13 R14 R15 +;; ----------------------------------------------------------- +;; +;; Linux/Windows clobbers: xmm0 - xmm15 + +%include "os.asm" +;%define DO_DBGPRINT +%include "dbgprint.asm" + +%include "mb_mgr_datastruct.asm" + +; resdq = res0 => 16 bytes +struc frame +.ABEF_SAVE reso 1 +.CDGH_SAVE reso 1 +.ABEF_SAVEb reso 1 +.CDGH_SAVEb reso 1 +.align resq 1 +endstruc + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rcx +%define arg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 rdi +%define arg4 rsi +%endif + +%define args arg1 +%define NUM_BLKS arg2 + +%define INP arg3 +%define INPb arg4 + + +%define SHA256CONSTANTS r11 + +;; MSG MUST be xmm0 (implicit argument) +%define MSG xmm0 +%define STATE0 xmm1 +%define STATE1 xmm2 +%define MSGTMP0 xmm3 +%define MSGTMP1 xmm4 +%define MSGTMP2 xmm5 +%define MSGTMP3 xmm6 +%define MSGTMP4 xmm7 + +%define STATE0b xmm8 +%define STATE1b xmm9 +%define MSGTMP0b xmm10 +%define MSGTMP1b xmm11 +%define MSGTMP2b xmm12 +%define MSGTMP3b xmm13 +%define MSGTMP xmm14 + +%define SHUF_MASK xmm15 + +section .data +default rel +align 64 +K256: + dd 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + dd 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + dd 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + dd 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + dd 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + dd 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + dd 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + dd 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + dd 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + dd 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + dd 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + dd 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + dd 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + dd 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + dd 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + dd 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha256_ni(SHA256_ARGS *args, UINT32 size_in_blocks) +;; arg1 : pointer to args +;; arg2 : size (in blocks) ;; assumed to be >= 1 +section .text +MKGLOBAL(sha256_ni,function,internal) +align 32 +sha256_ni: + sub rsp, frame_size + + DBGPRINTL "enter sha256-ni-x2" + + shl NUM_BLKS, 6 ; convert to bytes + jz done_hash + + DBGPRINTL64 "jobA/B byte size:", NUM_BLKS + + ;; load input pointers + mov INP, [args + _data_ptr_sha256 + 0*PTR_SZ] + mov INPb, [args + _data_ptr_sha256 + 1*PTR_SZ] + + add NUM_BLKS, INP ; pointer to end of data + + ;; load initial digest + ;; Probably need to reorder these appropriately + ;; DCBA, HGFE -> ABEF, CDGH + + movdqu STATE0, [args + 0*SHA256NI_DIGEST_ROW_SIZE] + movdqu STATE1, [args + 0*SHA256NI_DIGEST_ROW_SIZE + 16] + movdqu STATE0b, [args + 1*SHA256NI_DIGEST_ROW_SIZE] + movdqu STATE1b, [args + 1*SHA256NI_DIGEST_ROW_SIZE + 16] + DBGPRINTL "jobA digest in:" + DBGPRINT_XMM STATE0 + DBGPRINT_XMM STATE1 + DBGPRINTL "jobB digest in:" + DBGPRINT_XMM STATE0b + DBGPRINT_XMM STATE1b + + pshufd STATE0, STATE0, 0xB1 ; CDAB + pshufd STATE1, STATE1, 0x1B ; EFGH + movdqa MSGTMP4, STATE0 + pshufd STATE0b, STATE0b, 0xB1 ; CDAB + pshufd STATE1b, STATE1b, 0x1B ; EFGH + movdqa MSGTMP, STATE0b + palignr STATE0, STATE1, 8 ; ABEF + palignr STATE0b, STATE1b, 8 ; ABEF + pblendw STATE1, MSGTMP4, 0xF0 ; CDGH + pblendw STATE1b, MSGTMP, 0xF0 ; CDGH + + lea SHA256CONSTANTS,[rel K256] + movdqa SHUF_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + +%ifdef DO_DBGPRINT + ;; prin buffer A + push r10 + push NUM_BLKS + DBGPRINTL "jobA data:" + xor r10, r10 + sub NUM_BLKS, INP +.loop_dbgA: + movdqu MSG, [INP + r10 + 0*16] + DBGPRINT_XMM MSG + movdqu MSG, [INP + r10 + 1*16] + DBGPRINT_XMM MSG + movdqu MSG, [INP + r10 + 2*16] + DBGPRINT_XMM MSG + movdqu MSG, [INP + r10 + 3*16] + DBGPRINT_XMM MSG + add r10, 64 + cmp NUM_BLKS, r10 + jne .loop_dbgA + pop NUM_BLKS + pop r10 +%endif + +%ifdef DO_DBGPRINT + ;; prin buffer B + push r10 + push NUM_BLKS + DBGPRINTL "jobB data:" + xor r10, r10 + sub NUM_BLKS, INP +.loop_dbgB: + movdqu MSG, [INPb + r10 + 0*16] + DBGPRINT_XMM MSG + movdqu MSG, [INPb + r10 + 1*16] + DBGPRINT_XMM MSG + movdqu MSG, [INPb + r10 + 2*16] + DBGPRINT_XMM MSG + movdqu MSG, [INPb + r10 + 3*16] + DBGPRINT_XMM MSG + add r10, 64 + cmp NUM_BLKS, r10 + jne .loop_dbgB + pop NUM_BLKS + pop r10 +%endif + +.loop0: + ;; Save digests + movdqa [rsp + frame.ABEF_SAVE], STATE0 + movdqa [rsp + frame.CDGH_SAVE], STATE1 + movdqa [rsp + frame.ABEF_SAVEb], STATE0b + movdqa [rsp + frame.CDGH_SAVEb], STATE1b + + ;; Rounds 0-3 + movdqu MSG, [INP + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0, MSG + paddd MSG, [SHA256CONSTANTS + 0*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqu MSG, [INPb + 0*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP0b, MSG + paddd MSG, [SHA256CONSTANTS + 0*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + + ;; Rounds 4-7 + movdqu MSG, [INP + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1, MSG + paddd MSG, [SHA256CONSTANTS + 1*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqu MSG, [INPb + 1*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP1b, MSG + paddd MSG, [SHA256CONSTANTS + 1*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP0, MSGTMP1 + sha256msg1 MSGTMP0b, MSGTMP1b + + ;; Rounds 8-11 + movdqu MSG, [INP + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2, MSG + paddd MSG, [SHA256CONSTANTS + 2*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqu MSG, [INPb + 2*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP2b, MSG + paddd MSG, [SHA256CONSTANTS + 2*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP1, MSGTMP2 + sha256msg1 MSGTMP1b, MSGTMP2b + + ;; Rounds 12-15 + movdqu MSG, [INP + 3*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP3, MSG + paddd MSG, [SHA256CONSTANTS + 3*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3 + palignr MSGTMP, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqu MSG, [INPb + 3*16] + pshufb MSG, SHUF_MASK + movdqa MSGTMP3b, MSG + paddd MSG, [SHA256CONSTANTS + 3*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3b + palignr MSGTMP, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP2, MSGTMP3 + sha256msg1 MSGTMP2b, MSGTMP3b + + ;; Rounds 16-19 + movdqa MSG, MSGTMP0 + paddd MSG, [SHA256CONSTANTS + 4*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0 + palignr MSGTMP, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP0b + paddd MSG, [SHA256CONSTANTS + 4*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0b + palignr MSGTMP, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP3, MSGTMP0 + sha256msg1 MSGTMP3b, MSGTMP0b + + ;; Rounds 20-23 + movdqa MSG, MSGTMP1 + paddd MSG, [SHA256CONSTANTS + 5*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1 + palignr MSGTMP, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP1b + paddd MSG, [SHA256CONSTANTS + 5*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1b + palignr MSGTMP, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP0, MSGTMP1 + sha256msg1 MSGTMP0b, MSGTMP1b + + ;; Rounds 24-27 + movdqa MSG, MSGTMP2 + paddd MSG, [SHA256CONSTANTS + 6*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2 + palignr MSGTMP, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP2b + paddd MSG, [SHA256CONSTANTS + 6*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2b + palignr MSGTMP, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP1, MSGTMP2 + sha256msg1 MSGTMP1b, MSGTMP2b + + ;; Rounds 28-31 + movdqa MSG, MSGTMP3 + paddd MSG, [SHA256CONSTANTS + 7*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3 + palignr MSGTMP, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP3b + paddd MSG, [SHA256CONSTANTS + 7*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3b + palignr MSGTMP, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP2, MSGTMP3 + sha256msg1 MSGTMP2b, MSGTMP3b + + ;; Rounds 32-35 + movdqa MSG, MSGTMP0 + paddd MSG, [SHA256CONSTANTS + 8*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0 + palignr MSGTMP, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP0b + paddd MSG, [SHA256CONSTANTS + 8*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0b + palignr MSGTMP, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP3, MSGTMP0 + sha256msg1 MSGTMP3b, MSGTMP0b + + ;; Rounds 36-39 + movdqa MSG, MSGTMP1 + paddd MSG, [SHA256CONSTANTS + 9*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1 + palignr MSGTMP, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP1b + paddd MSG, [SHA256CONSTANTS + 9*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1b + palignr MSGTMP, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP0, MSGTMP1 + sha256msg1 MSGTMP0b, MSGTMP1b + + ;; Rounds 40-43 + movdqa MSG, MSGTMP2 + paddd MSG, [SHA256CONSTANTS + 10*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2 + palignr MSGTMP, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP2b + paddd MSG, [SHA256CONSTANTS + 10*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2b + palignr MSGTMP, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP1, MSGTMP2 + sha256msg1 MSGTMP1b, MSGTMP2b + + ;; Rounds 44-47 + movdqa MSG, MSGTMP3 + paddd MSG, [SHA256CONSTANTS + 11*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3 + palignr MSGTMP, MSGTMP2, 4 + paddd MSGTMP0, MSGTMP + sha256msg2 MSGTMP0, MSGTMP3 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP3b + paddd MSG, [SHA256CONSTANTS + 11*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP3b + palignr MSGTMP, MSGTMP2b, 4 + paddd MSGTMP0b, MSGTMP + sha256msg2 MSGTMP0b, MSGTMP3b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP2, MSGTMP3 + sha256msg1 MSGTMP2b, MSGTMP3b + + ;; Rounds 48-51 + movdqa MSG, MSGTMP0 + paddd MSG, [SHA256CONSTANTS + 12*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0 + palignr MSGTMP, MSGTMP3, 4 + paddd MSGTMP1, MSGTMP + sha256msg2 MSGTMP1, MSGTMP0 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP0b + paddd MSG, [SHA256CONSTANTS + 12*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP0b + palignr MSGTMP, MSGTMP3b, 4 + paddd MSGTMP1b, MSGTMP + sha256msg2 MSGTMP1b, MSGTMP0b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + sha256msg1 MSGTMP3, MSGTMP0 + sha256msg1 MSGTMP3b, MSGTMP0b + + ;; Rounds 52-55 + movdqa MSG, MSGTMP1 + paddd MSG, [SHA256CONSTANTS + 13*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1 + palignr MSGTMP, MSGTMP0, 4 + paddd MSGTMP2, MSGTMP + sha256msg2 MSGTMP2, MSGTMP1 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP1b + paddd MSG, [SHA256CONSTANTS + 13*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP1b + palignr MSGTMP, MSGTMP0b, 4 + paddd MSGTMP2b, MSGTMP + sha256msg2 MSGTMP2b, MSGTMP1b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + + ;; Rounds 56-59 + movdqa MSG, MSGTMP2 + paddd MSG, [SHA256CONSTANTS + 14*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2 + palignr MSGTMP, MSGTMP1, 4 + paddd MSGTMP3, MSGTMP + sha256msg2 MSGTMP3, MSGTMP2 + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP2b + paddd MSG, [SHA256CONSTANTS + 14*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + movdqa MSGTMP, MSGTMP2b + palignr MSGTMP, MSGTMP1b, 4 + paddd MSGTMP3b, MSGTMP + sha256msg2 MSGTMP3b, MSGTMP2b + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + + ;; Rounds 60-63 + movdqa MSG, MSGTMP3 + paddd MSG, [SHA256CONSTANTS + 15*16] + sha256rnds2 STATE1, STATE0, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0, STATE1, MSG ; MSG is implicit argument + movdqa MSG, MSGTMP3b + paddd MSG, [SHA256CONSTANTS + 15*16] + sha256rnds2 STATE1b, STATE0b, MSG ; MSG is implicit argument + pshufd MSG, MSG, 0x0E + sha256rnds2 STATE0b, STATE1b, MSG ; MSG is implicit argument + + paddd STATE0, [rsp + frame.ABEF_SAVE] + paddd STATE1, [rsp + frame.CDGH_SAVE] + paddd STATE0b, [rsp + frame.ABEF_SAVEb] + paddd STATE1b, [rsp + frame.CDGH_SAVEb] + + add INP, 64 + add INPb, 64 + cmp INP, NUM_BLKS + jne .loop0 + + ;; update data pointers + mov [args + _data_ptr_sha256 + 0*PTR_SZ], INP + mov [args + _data_ptr_sha256 + 1*PTR_SZ], INPb + + ; Reorder for writeback + pshufd STATE0, STATE0, 0x1B ; FEBA + pshufd STATE1, STATE1, 0xB1 ; DCHG + movdqa MSGTMP4, STATE0 + pshufd STATE0b, STATE0b, 0x1B ; FEBA + pshufd STATE1b, STATE1b, 0xB1 ; DCHG + movdqa MSGTMP, STATE0b + pblendw STATE0, STATE1, 0xF0 ; DCBA + pblendw STATE0b, STATE1b, 0xF0 ; DCBA + palignr STATE1, MSGTMP4, 8 ; HGFE + palignr STATE1b, MSGTMP, 8 ; HGFE + + ;; update digests + movdqu [args + 0*SHA256NI_DIGEST_ROW_SIZE + 0*16], STATE0 + movdqu [args + 0*SHA256NI_DIGEST_ROW_SIZE + 1*16], STATE1 + movdqu [args + 1*SHA256NI_DIGEST_ROW_SIZE + 0*16], STATE0b + movdqu [args + 1*SHA256NI_DIGEST_ROW_SIZE + 1*16], STATE1b + + DBGPRINTL "jobA digest out:" + DBGPRINT_XMM STATE0 + DBGPRINT_XMM STATE1 + DBGPRINTL "jobB digest out:" + DBGPRINT_XMM STATE0b + DBGPRINT_XMM STATE1b + +done_hash: + DBGPRINTL "exit sha256-ni-x2" + + add rsp, frame_size + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm new file mode 100644 index 00000000..9369d188 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha256_one_block_sse.asm @@ -0,0 +1,517 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%include "os.asm" + +section .data +default rel +align 64 +K256: + dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +; shuffle xBxA -> 00BA +_SHUF_00BA: ;ddq 0xFFFFFFFFFFFFFFFF0b0a090803020100 + dq 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF +; shuffle xDxC -> DC00 +_SHUF_DC00: ;ddq 0x0b0a090803020100FFFFFFFFFFFFFFFF + dq 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 + +section .text + + +%define MOVDQ movdqu ;; assume buffers not aligned + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + MOVDQ %1, %2 + pshufb %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XTMP4 xmm8 +%define XFER xmm9 + +%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA +%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00 +%define BYTE_FLIP_MASK xmm12 + +%ifdef LINUX +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c ecx +%define d r8d +%define e edx +%else +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c edi +%define d esi +%define e r8d + +%endif +%define TBL rbp +%define a eax +%define b ebx + +%define f r9d +%define g r10d +%define h r11d + +%define y0 r13d +%define y1 r14d +%define y2 r15d + + +struc STACK +%ifndef LINUX +_XMM_SAVE: reso 7 +%endif +_XFER: reso 1 +endstruc + +%ifndef H0 +%define H0 0x6a09e667 +%define H1 0xbb67ae85 +%define H2 0x3c6ef372 +%define H3 0xa54ff53a +%define H4 0x510e527f +%define H5 0x9b05688c +%define H6 0x1f83d9ab +%define H7 0x5be0cd19 +%define FUNC sha256_one_block_sse +%endif + +; rotate_Xs +; Rotate values of symbols X0...X3 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro FOUR_ROUNDS_AND_SCHED 0 + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + movdqa XTMP0, X3 + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + palignr XTMP0, X2, 4 ; XTMP0 = W[-7] + ror y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + movdqa XTMP1, X1 + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + ;; compute s0 + palignr XTMP1, X0, 4 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 0*4] ; y2 = k + w + S1 + CH + movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pslld XTMP1, (32-7) + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + psrld XTMP2, 7 + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] + mov y0, e ; y0 = e + mov y1, a ; y1 = a + movdqa XTMP4, XTMP3 ; XTMP4 = W[-15] + ror y0, (25-11) ; y0 = e >> (25-11) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y1, (22-13) ; y1 = a >> (22-13) + pslld XTMP3, (32-18) + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + psrld XTMP2, 18 + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP1, XTMP3 + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3 + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 1*4] ; y2 = k + w + S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18 + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pxor XTMP1, XTMP4 ; XTMP1 = s0 + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + ;; compute low s1 + pshufd XTMP2, X3, 11111010b ; XTMP2 = W[-2] {BBAA} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA} + mov y0, e ; y0 = e + mov y1, a ; y1 = a + ror y0, (25-11) ; y0 = e >> (25-11) + movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA} + xor y0, e ; y0 = e ^ (e >> (25-11)) + ror y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA} + xor y2, g ; y2 = f^g + psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA} + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA} + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + pxor XTMP2, XTMP3 + add y2, y0 ; y2 = S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + 2*4] ; y2 = k + w + S1 + CH + pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + ;; compute high s1 + pshufd XTMP2, XTMP0, 01010000b ; XTMP2 = W[-2] {DDCC} + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC} + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + movdqa X0, XTMP2 ; X0 = W[-2] {DDCC} + ror y1, (22-13) ; y1 = a >> (22-13) + xor y0, e ; y0 = e ^ (e >> (25-11)) + mov y2, f ; y2 = f + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC} + xor y1, a ; y1 = a ^ (a >> (22-13) + xor y2, g ; y2 = f^g + psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC} + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + and y2, e ; y2 = (f^g)&e + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC} + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + pxor XTMP2, XTMP3 + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 3*4] ; y2 = k + w + S1 + CH + pxor X0, XTMP2 ; X0 = s1 {xDxC} + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + pshufb X0, SHUF_DC00 ; X0 = s1 {DC00} + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]} + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 4] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + ror y0, (25-11) ; y0 = e >> (25-11) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (25-11)) + ror y1, (22-13) ; y1 = a >> (22-13) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (22-13) + ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6)) + ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2)) + ror y0, 6 ; y0 = S1 = (e>>6) & (e>>11) ^ (e>>25) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22) + add y2, [rsp + _XFER + %1 * 4] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + h + S1 + CH + k + w + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = h + S1 + CH + k + w + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ + ROTATE_ARGS +%endm + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void FUNC(void *input_data, UINT32 digest[8]) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +section .text +MKGLOBAL(FUNC,function,) +align 32 +FUNC: + push rbx +%ifndef LINUX + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_size +%ifndef LINUX + movdqa [rsp + _XMM_SAVE + 0*16],xmm6 + movdqa [rsp + _XMM_SAVE + 1*16],xmm7 + movdqa [rsp + _XMM_SAVE + 2*16],xmm8 + movdqa [rsp + _XMM_SAVE + 3*16],xmm9 + movdqa [rsp + _XMM_SAVE + 4*16],xmm10 + movdqa [rsp + _XMM_SAVE + 5*16],xmm11 + movdqa [rsp + _XMM_SAVE + 6*16],xmm12 +%endif + + ;; load initial digest + mov a,H0 + mov b,H1 + mov c,H2 + mov d,H3 + mov e,H4 + mov f,H5 + mov g,H6 + mov h,H7 + + movdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + movdqa SHUF_00BA, [rel _SHUF_00BA] + movdqa SHUF_DC00, [rel _SHUF_DC00] + + lea TBL,[rel K256] + + ;; byte swap first 16 dwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + + ;; schedule 48 input dwords, by doing 3 rounds of 16 each + mov SRND, 3 +align 16 +loop1: + movdqa XFER, [TBL + 0*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 1*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 2*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + FOUR_ROUNDS_AND_SCHED + + movdqa XFER, [TBL + 3*16] + paddd XFER, X0 + movdqa [rsp + _XFER], XFER + add TBL, 4*16 + FOUR_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 +loop2: + paddd X0, [TBL + 0*16] + movdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + paddd X1, [TBL + 1*16] + movdqa [rsp + _XFER], X1 + add TBL, 2*16 + DO_ROUND 0 + DO_ROUND 1 + DO_ROUND 2 + DO_ROUND 3 + + movdqa X0, X2 + movdqa X1, X3 + + sub SRND, 1 + jne loop2 + + add a,H0 + add b,H1 + add c,H2 + add d,H3 + add e,H4 + add f,H5 + add g,H6 + mov [4*0 + CTX],a + mov [4*1 + CTX],b + mov [4*2 + CTX],c + mov [4*3 + CTX],d + mov [4*4 + CTX],e + mov [4*5 + CTX],f + mov [4*6 + CTX],g + add h,H7 + mov [4*7 + CTX],h + +done_hash: +%ifndef LINUX + movdqa xmm6,[rsp + _XMM_SAVE + 0*16] + movdqa xmm7,[rsp + _XMM_SAVE + 1*16] + movdqa xmm8,[rsp + _XMM_SAVE + 2*16] + movdqa xmm9,[rsp + _XMM_SAVE + 3*16] + movdqa xmm10,[rsp + _XMM_SAVE + 4*16] + movdqa xmm11,[rsp + _XMM_SAVE + 5*16] + movdqa xmm12,[rsp + _XMM_SAVE + 6*16] +%endif + + add rsp, STACK_size + + pop r15 + pop r14 + pop r13 + pop rbp +%ifndef LINUX + pop rdi + pop rsi +%endif + pop rbx + + ret + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm new file mode 100644 index 00000000..51a031fd --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha384_one_block_sse.asm @@ -0,0 +1,41 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define H0 0xcbbb9d5dc1059ed8 +%define H1 0x629a292a367cd507 +%define H2 0x9159015a3070dd17 +%define H3 0x152fecd8f70e5939 +%define H4 0x67332667ffc00b31 +%define H5 0x8eb44a8768581511 +%define H6 0xdb0c2e0d64f98fa7 +%define H7 0x47b5481dbefa4fa4 +%define FUNC sha384_one_block_sse + +%include "sha512_one_block_sse.asm" diff --git a/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm new file mode 100644 index 00000000..a42e942f --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha512_one_block_sse.asm @@ -0,0 +1,495 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +; This code schedules 1 blocks at a time, with 4 lanes per block +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%include "os.asm" + +%define MOVDQ movdqu ;; assume buffers not aligned + +%ifndef FUNC +%define FUNC sha512_one_block_sse +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros + +; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask +; Load xmm with mem and byte swap each dword +%macro COPY_XMM_AND_BSWAP 3 + MOVDQ %1, %2 + pshufb %1, %3 +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define X0 xmm4 +%define X1 xmm5 +%define X2 xmm6 +%define X3 xmm7 +%define X4 xmm8 +%define X5 xmm9 +%define X6 xmm10 +%define X7 xmm11 + +%define XTMP0 xmm0 +%define XTMP1 xmm1 +%define XTMP2 xmm2 +%define XTMP3 xmm3 +%define XFER xmm13 + +%define BYTE_FLIP_MASK xmm12 + +%ifdef LINUX +%define CTX rsi ; 2nd arg +%define INP rdi ; 1st arg + +%define SRND rdi ; clobbers INP +%define c rcx +%define d r8 +%define e rdx +%else +%define CTX rdx ; 2nd arg +%define INP rcx ; 1st arg + +%define SRND rcx ; clobbers INP +%define c rdi +%define d rsi +%define e r8 + +%endif +%define TBL rbp +%define a rax +%define b rbx + +%define f r9 +%define g r10 +%define h r11 + +%define y0 r13 +%define y1 r14 +%define y2 r15 + + +struc STACK +%ifndef LINUX +_XMM_SAVE: reso 8 +%endif +_XFER: reso 1 +endstruc + +%ifndef H0 +%define H0 0x6a09e667f3bcc908 +%define H1 0xbb67ae8584caa73b +%define H2 0x3c6ef372fe94f82b +%define H3 0xa54ff53a5f1d36f1 +%define H4 0x510e527fade682d1 +%define H5 0x9b05688c2b3e6c1f +%define H6 0x1f83d9abfb41bd6b +%define H7 0x5be0cd19137e2179 +%endif + +; rotate_Xs +; Rotate values of symbols X0...X7 +%macro rotate_Xs 0 +%xdefine X_ X0 +%xdefine X0 X1 +%xdefine X1 X2 +%xdefine X2 X3 +%xdefine X3 X4 +%xdefine X4 X5 +%xdefine X5 X6 +%xdefine X6 X7 +%xdefine X7 X_ +%endm + +; ROTATE_ARGS +; Rotate values of symbols a...h +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +%macro TWO_ROUNDS_AND_SCHED 0 + + ;; compute s0 four at a time and s1 two at a time + ;; compute W[-16] + W[-7] 4 at a time + movdqa XTMP0, X5 + mov y0, e ; y0 = e + mov y1, a ; y1 = a + ror y0, (41-18) ; y0 = e >> (41-18) + palignr XTMP0, X4, 8 ; XTMP0 = W[-7] + xor y0, e ; y0 = e ^ (e >> (41-18)) + mov y2, f ; y2 = f + ror y1, (39-34) ; y1 = a >> (39-34) + xor y1, a ; y1 = a ^ (a >> (39-34) + movdqa XTMP1, X1 + ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + xor y2, g ; y2 = f^g + paddq XTMP0, X0 ; XTMP0 = W[-7] + W[-16] + ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) + and y2, e ; y2 = (f^g)&e + ;; compute s0 + palignr XTMP1, X0, 8 ; XTMP1 = W[-15] + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + xor y2, g ; y2 = CH = ((f^g)&e)^g + movdqa XTMP2, XTMP1 ; XTMP2 = W[-15] + ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 0*8] ; y2 = k + w + S1 + CH + ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + movdqa XTMP3, XTMP1 ; XTMP3 = W[-15] + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + psllq XTMP1, (64-1) + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + psrlq XTMP2, 1 + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = t1 + S0 + movdqa XTMP2, XTMP3 ; XTMP2 = W[-15] + psrlq XTMP2, 8 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = t1 + S0 + MAJ + movdqa X0, XTMP3 ; X0 = W[-15] + psllq XTMP3, (64-8) + + +ROTATE_ARGS + pxor XTMP1, XTMP3 + psrlq X0, 7 ; X0 = W[-15] >> 7 + mov y0, e ; y0 = e + mov y1, a ; y1 = a + pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 1 ^ W[-15] ror 8 + ror y0, (41-18) ; y0 = e >> (41-18) + xor y0, e ; y0 = e ^ (e >> (41-18)) + mov y2, f ; y2 = f + pxor XTMP1, X0 ; XTMP1 = s0 + ror y1, (39-34) ; y1 = a >> (39-34) + xor y1, a ; y1 = a ^ (a >> (39-34) + ;; compute s1 + movdqa XTMP2, X7 ; XTMP2 = W[-2] + ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + xor y2, g ; y2 = f^g + paddq XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0 + ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (41-14)) + movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] + movdqa X0, XTMP2 ; X0 = W[-2] + and y2, e ; y2 = (f^g)&e + ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + psllq XTMP3, (64-19) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + add y2, [rsp + _XFER + 1*8] ; y2 = k + w + S1 + CH + psrlq X0, 19 + ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + por XTMP3, X0 ; XTMP3 = W[-2] ror 19 + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + movdqa X0, XTMP2 ; X0 = W[-2] + movdqa XTMP1, XTMP2 ; XTMP1 = W[-2] + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + psllq X0, (64-61) + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = t1 + S0 + psrlq XTMP1, 61 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = t1 + S0 + MAJ + por X0, XTMP1 ; X0 = W[-2] ror 61 + psrlq XTMP2, 6 ; XTMP2 = W[-2] >> 6 + pxor XTMP2, XTMP3 + pxor X0, XTMP2 ; X0 = s1 + paddq X0, XTMP0 ; X0 = {W[1], W[0]} + +ROTATE_ARGS +rotate_Xs +%endm + +;; input is [rsp + _XFER + %1 * 8] +%macro DO_ROUND 1 + mov y0, e ; y0 = e + ror y0, (41-18) ; y0 = e >> (41-18) + mov y1, a ; y1 = a + xor y0, e ; y0 = e ^ (e >> (41-18)) + ror y1, (39-34) ; y1 = a >> (39-34) + mov y2, f ; y2 = f + xor y1, a ; y1 = a ^ (a >> (39-34) + ror y0, (18-14) ; y0 = (e >> (18-14)) ^ (e >> (41-14)) + xor y2, g ; y2 = f^g + xor y0, e ; y0 = e ^ (e >> (18-14)) ^ (e >> (25-6)) + ror y1, (34-28) ; y1 = (a >> (34-28)) ^ (a >> (39-28)) + and y2, e ; y2 = (f^g)&e + xor y1, a ; y1 = a ^ (a >> (34-28)) ^ (a >> (39-28)) + ror y0, 14 ; y0 = S1 = (e>>14) & (e>>18) ^ (e>>41) + xor y2, g ; y2 = CH = ((f^g)&e)^g + add y2, y0 ; y2 = S1 + CH + ror y1, 28 ; y1 = S0 = (a>>28) ^ (a>>34) ^ (a>>39) + add y2, [rsp + _XFER + %1*8] ; y2 = k + w + S1 + CH + mov y0, a ; y0 = a + add h, y2 ; h = h + S1 + CH + k + w + mov y2, a ; y2 = a + or y0, c ; y0 = a|c + add d, h ; d = d + t1 + and y2, c ; y2 = a&c + and y0, b ; y0 = (a|c)&b + add h, y1 ; h = t1 + S0 + or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c) + add h, y0 ; h = t1 + S0 + MAJ + ROTATE_ARGS +%endm + +section .data +default rel +align 64 +K512: + dq 0x428a2f98d728ae22,0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + dq 0x3956c25bf348b538,0x59f111f1b605d019 + dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242,0x12835b0145706fbe + dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235,0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + dq 0x983e5152ee66dfab,0xa831c66d2db43210 + dq 0xb00327c898fb213f,0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2,0xd5a79147930aa725 + dq 0x06ca6351e003826f,0x142929670a0e6e70 + dq 0x27b70a8546d22ffc,0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + dq 0x650a73548baf63de,0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6,0x92722c851482353b + dq 0xa2bfe8a14cf10364,0xa81a664bbc423001 + dq 0xc24b8b70d0f89791,0xc76c51a30654be30 + dq 0xd192e819d6ef5218,0xd69906245565a910 + dq 0xf40e35855771202a,0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc,0x78a5636f43172f60 + dq 0x84c87814a1f0ab72,0x8cc702081a6439ec + dq 0x90befffa23631e28,0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915,0xc67178f2e372532b + dq 0xca273eceea26619c,0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba,0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae,0x1b710b35131c471b + dq 0x28db77f523047d84,0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +h0: dq H0 +h1: dq H1 +h2: dq H2 +h3: dq H3 +h4: dq H4 +h5: dq H5 +h6: dq H6 +h7: dq H7 + +align 16 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void FUNC(void *input_data, UINT64 digest[8]) +;; arg 1 : pointer to input data +;; arg 2 : pointer to digest +section .text +MKGLOBAL(FUNC,function,) +align 32 +FUNC: + push rbx +%ifndef LINUX + push rsi + push rdi +%endif + push rbp + push r13 + push r14 + push r15 + + sub rsp,STACK_size +%ifndef LINUX + movdqa [rsp + _XMM_SAVE + 0*16],xmm6 + movdqa [rsp + _XMM_SAVE + 1*16],xmm7 + movdqa [rsp + _XMM_SAVE + 2*16],xmm8 + movdqa [rsp + _XMM_SAVE + 3*16],xmm9 + movdqa [rsp + _XMM_SAVE + 4*16],xmm10 + movdqa [rsp + _XMM_SAVE + 5*16],xmm11 + movdqa [rsp + _XMM_SAVE + 6*16],xmm12 + movdqa [rsp + _XMM_SAVE + 7*16],xmm13 +%endif + + ;; load initial digest + mov a,[rel h0] + mov b,[rel h1] + mov c,[rel h2] + mov d,[rel h3] + mov e,[rel h4] + mov f,[rel h5] + mov g,[rel h6] + mov h,[rel h7] + + movdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + + lea TBL,[rel K512] + + ;; byte swap first 16 qwords + COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X4, [INP + 4*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X5, [INP + 5*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X6, [INP + 6*16], BYTE_FLIP_MASK + COPY_XMM_AND_BSWAP X7, [INP + 7*16], BYTE_FLIP_MASK + + ;; schedule 64 input qwords, by doing 4 iterations of 16 rounds + mov SRND, 4 +align 16 +loop1: + +%assign i 0 +%rep 7 + movdqa XFER, X0 + paddq XFER, [TBL + i*16] + movdqa [rsp + _XFER], XFER + TWO_ROUNDS_AND_SCHED +%assign i (i+1) +%endrep + + movdqa XFER, X0 + paddq XFER, [TBL + 7*16] + movdqa [rsp + _XFER], XFER + add TBL, 8*16 + TWO_ROUNDS_AND_SCHED + + sub SRND, 1 + jne loop1 + + mov SRND, 2 + jmp loop2a +loop2: + movdqa X0, X4 + movdqa X1, X5 + movdqa X2, X6 + movdqa X3, X7 + +loop2a: + paddq X0, [TBL + 0*16] + movdqa [rsp + _XFER], X0 + DO_ROUND 0 + DO_ROUND 1 + + paddq X1, [TBL + 1*16] + movdqa [rsp + _XFER], X1 + DO_ROUND 0 + DO_ROUND 1 + + paddq X2, [TBL + 2*16] + movdqa [rsp + _XFER], X2 + DO_ROUND 0 + DO_ROUND 1 + + paddq X3, [TBL + 3*16] + movdqa [rsp + _XFER], X3 + add TBL, 4*16 + DO_ROUND 0 + DO_ROUND 1 + + sub SRND, 1 + jne loop2 + + add a,[rel h0] + add b,[rel h1] + add c,[rel h2] + add d,[rel h3] + add e,[rel h4] + add f,[rel h5] + add g,[rel h6] + mov [8*0 + CTX],a + mov [8*1 + CTX],b + mov [8*2 + CTX],c + mov [8*3 + CTX],d + mov [8*4 + CTX],e + mov [8*5 + CTX],f + mov [8*6 + CTX],g + add h,[rel h7] + mov [8*7 + CTX],h + +done_hash: +%ifndef LINUX + movdqa xmm6,[rsp + _XMM_SAVE + 0*16] + movdqa xmm7,[rsp + _XMM_SAVE + 1*16] + movdqa xmm8,[rsp + _XMM_SAVE + 2*16] + movdqa xmm9,[rsp + _XMM_SAVE + 3*16] + movdqa xmm10,[rsp + _XMM_SAVE + 4*16] + movdqa xmm11,[rsp + _XMM_SAVE + 5*16] + movdqa xmm12,[rsp + _XMM_SAVE + 6*16] + movdqa xmm13,[rsp + _XMM_SAVE + 7*16] +%endif + + add rsp, STACK_size + + pop r15 + pop r14 + pop r13 + pop rbp +%ifndef LINUX + pop rdi + pop rsi +%endif + pop rbx + + ret + + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm new file mode 100644 index 00000000..e4533053 --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha512_x2_sse.asm @@ -0,0 +1,439 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; code to compute SHA512 by-2 using SSE +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15 +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rdx r8 r9 r10 r11 +;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15 +;; +;; Linux clobbers: rax rsi r8 r9 r10 r11 +;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15 +;; +;; clobbers xmm0-15 + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +;%define DO_DBGPRINT +%include "dbgprint.asm" + +section .data +default rel +align 64 +MKGLOBAL(K512_2,data,internal) +K512_2: + dq 0x428a2f98d728ae22, 0x428a2f98d728ae22 + dq 0x7137449123ef65cd, 0x7137449123ef65cd + dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f + dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc + dq 0x3956c25bf348b538, 0x3956c25bf348b538 + dq 0x59f111f1b605d019, 0x59f111f1b605d019 + dq 0x923f82a4af194f9b, 0x923f82a4af194f9b + dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118 + dq 0xd807aa98a3030242, 0xd807aa98a3030242 + dq 0x12835b0145706fbe, 0x12835b0145706fbe + dq 0x243185be4ee4b28c, 0x243185be4ee4b28c + dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2 + dq 0x72be5d74f27b896f, 0x72be5d74f27b896f + dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1 + dq 0x9bdc06a725c71235, 0x9bdc06a725c71235 + dq 0xc19bf174cf692694, 0xc19bf174cf692694 + dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2 + dq 0xefbe4786384f25e3, 0xefbe4786384f25e3 + dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5 + dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65 + dq 0x2de92c6f592b0275, 0x2de92c6f592b0275 + dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483 + dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4 + dq 0x76f988da831153b5, 0x76f988da831153b5 + dq 0x983e5152ee66dfab, 0x983e5152ee66dfab + dq 0xa831c66d2db43210, 0xa831c66d2db43210 + dq 0xb00327c898fb213f, 0xb00327c898fb213f + dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4 + dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2 + dq 0xd5a79147930aa725, 0xd5a79147930aa725 + dq 0x06ca6351e003826f, 0x06ca6351e003826f + dq 0x142929670a0e6e70, 0x142929670a0e6e70 + dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc + dq 0x2e1b21385c26c926, 0x2e1b21385c26c926 + dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed + dq 0x53380d139d95b3df, 0x53380d139d95b3df + dq 0x650a73548baf63de, 0x650a73548baf63de + dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8 + dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6 + dq 0x92722c851482353b, 0x92722c851482353b + dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364 + dq 0xa81a664bbc423001, 0xa81a664bbc423001 + dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791 + dq 0xc76c51a30654be30, 0xc76c51a30654be30 + dq 0xd192e819d6ef5218, 0xd192e819d6ef5218 + dq 0xd69906245565a910, 0xd69906245565a910 + dq 0xf40e35855771202a, 0xf40e35855771202a + dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8 + dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8 + dq 0x1e376c085141ab53, 0x1e376c085141ab53 + dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99 + dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8 + dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63 + dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb + dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373 + dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3 + dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc + dq 0x78a5636f43172f60, 0x78a5636f43172f60 + dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72 + dq 0x8cc702081a6439ec, 0x8cc702081a6439ec + dq 0x90befffa23631e28, 0x90befffa23631e28 + dq 0xa4506cebde82bde9, 0xa4506cebde82bde9 + dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915 + dq 0xc67178f2e372532b, 0xc67178f2e372532b + dq 0xca273eceea26619c, 0xca273eceea26619c + dq 0xd186b8c721c0c207, 0xd186b8c721c0c207 + dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e + dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178 + dq 0x06f067aa72176fba, 0x06f067aa72176fba + dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6 + dq 0x113f9804bef90dae, 0x113f9804bef90dae + dq 0x1b710b35131c471b, 0x1b710b35131c471b + dq 0x28db77f523047d84, 0x28db77f523047d84 + dq 0x32caab7b40c72493, 0x32caab7b40c72493 + dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc + dq 0x431d67c49c100d4c, 0x431d67c49c100d4c + dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6 + dq 0x597f299cfc657e2a, 0x597f299cfc657e2a + dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec + dq 0x6c44198c4a475817, 0x6c44198c4a475817 + +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +section .text + +%ifdef LINUX ; Linux definitions + %define arg1 rdi + %define arg2 rsi +%else ; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND r8 +%define TBL r11 + +%define inp0 r9 +%define inp1 r10 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + + + +%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 80*SZ2 + +; Define stack usage + +struc STACK +_DATA: resb SZ2 * 16 +_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define MOVPD movupd + +; transpose r0, r1, t0 +; Input looks like {r0 r1} +; r0 = {a1 a0} +; r1 = {b1 b0} +; +; output looks like +; r0 = {b0, a0} +; t0 = {b1, a1} + +%macro TRANSPOSE 3 +%define %%r0 %1 +%define %%r1 %2 +%define %%t0 %3 + movapd %%t0, %%r0 ; t0 = a1 a0 + shufpd %%r0, %%r1, 00b ; r0 = b0 a0 + shufpd %%t0, %%r1, 11b ; t0 = b1 a1 +%endm + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORQ reg, imm, tmp +; packed-rotate-right-double +; does a rotate by doing two shifts and an or +%macro PRORQ 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psllq %%tmp, (64-(%%imm)) + psrlq %%reg, %%imm + por %%reg, %%tmp +%endmacro + +; PRORQ dst/src, amt +%macro PRORQ 2 + PRORQ %1, %2, TMP +%endmacro + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORQ a0, (18-14) ; sig1: a0 = (e >> 4) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORQ a1, 41 ; sig1: a1 = (e >> 41) + movdqa [SZ2*(%%i&0xf) + rsp],%%T1 + paddq %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18) + paddq h, a2 ; h = h + ch + movdqa a2, a ; sig0: a2 = a + PRORQ a2, (34-28) ; sig0: a2 = (a >> 6) + paddq h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORQ a1, 39 ; sig0: a1 = (a >> 39) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ2 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddq h, a0 + + paddq d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddq h, a1 ; h = h + ch + W + K + maj + paddq h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + movdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp] + movdqa a1, [SZ2*((%%i-2)&0xf) + rsp] + movdqa a0, %%T1 + PRORQ %%T1, 8-1 + movdqa a2, a1 + PRORQ a1, 61-19 + pxor %%T1, a0 + PRORQ %%T1, 1 + pxor a1, a2 + PRORQ a1, 19 + psrlq a0, 7 + pxor %%T1, a0 + psrlq a2, 6 + pxor a1, a2 + paddq %%T1, [SZ2*((%%i-16)&0xf) + rsp] + paddq a1, [SZ2*((%%i-7)&0xf) + rsp] + paddq %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + + + +;; SHA512_ARGS: +;; UINT128 digest[8]; // transposed digests +;; UINT8 *data_ptr[2]; +;; + +;; void sha512_x2_sse(SHA512_ARGS *args, UINT64 num_blocks); +;; arg 1 : STATE : pointer args +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +MKGLOBAL(sha512_x2_sse,function,internal) +align 32 +sha512_x2_sse: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. + movdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE] + movdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE] + movdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE] + movdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE] + movdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE] + movdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE] + movdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE] + movdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE] + + DBGPRINTL_XMM "incoming transposed sha512 digest", a, b, c, d, e, f, g, h + lea TBL,[rel K512_2] + + ;; load the address of each of the 2 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ] + + xor IDX, IDX +lloop: + xor ROUND, ROUND + DBGPRINTL64 "lloop enter INP_SIZE ", INP_SIZE + DBGPRINTL64 " IDX = ", IDX + ;; save old digest + movdqa [rsp + _DIGEST + 0*SZ2], a + movdqa [rsp + _DIGEST + 1*SZ2], b + movdqa [rsp + _DIGEST + 2*SZ2], c + movdqa [rsp + _DIGEST + 3*SZ2], d + movdqa [rsp + _DIGEST + 4*SZ2], e + movdqa [rsp + _DIGEST + 5*SZ2], f + movdqa [rsp + _DIGEST + 6*SZ2], g + movdqa [rsp + _DIGEST + 7*SZ2], h + + DBGPRINTL "incoming data[" +%assign i 0 +%rep 8 + ;; load up the shuffler for little-endian to big-endian format + movdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] + MOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits + MOVPD TT2,[inp1+IDX+i*16] + DBGPRINTL_XMM "input message block", TT0 + TRANSPOSE TT0, TT2, TT1 + pshufb TT0, TMP + pshufb TT1, TMP + ROUND_00_15 TT0,(i*2+0) + ROUND_00_15 TT1,(i*2+1) +%assign i (i+1) +%endrep + DBGPRINTL "]" + add IDX, 8 * 16 ;; increment by a message block + + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + paddq a, [rsp + _DIGEST + 0*SZ2] + paddq b, [rsp + _DIGEST + 1*SZ2] + paddq c, [rsp + _DIGEST + 2*SZ2] + paddq d, [rsp + _DIGEST + 3*SZ2] + paddq e, [rsp + _DIGEST + 4*SZ2] + paddq f, [rsp + _DIGEST + 5*SZ2] + paddq g, [rsp + _DIGEST + 6*SZ2] + paddq h, [rsp + _DIGEST + 7*SZ2] + + sub INP_SIZE, 1 ;; unit is blocks + jne lloop + + ; write back to memory (state object) the transposed digest + movdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE],a + movdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE],b + movdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE],c + movdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE],d + movdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE],e + movdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE],f + movdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE],g + movdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE],h + DBGPRINTL_XMM "exit transposed digest ", a, b, c, d, e, f, g, h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, STACK_size +DBGPRINTL "====================== exit sha512_x2_sse code =====================\n" + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif diff --git a/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm b/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm new file mode 100644 index 00000000..3af8511d --- /dev/null +++ b/src/spdk/intel-ipsec-mb/sse/sha_256_mult_sse.asm @@ -0,0 +1,447 @@ +;; +;; Copyright (c) 2012-2018, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; code to compute quad SHA256 using SSE +;; outer calling routine takes care of save and restore of XMM registers +;; Logic designed/laid out by JDG + +;; Stack must be aligned to 16 bytes before call +;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12 +;; Windows preserves: rcx rsi rdi rbp r12 r14 r15 +;; +;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12 +;; Linux preserves: rcx rdx rdi rbp r13 r14 r15 +;; +;; clobbers xmm0-15 + +%include "os.asm" +%include "mb_mgr_datastruct.asm" + +;%define DO_DBGPRINT +%include "dbgprint.asm" + +section .data +default rel +align 64 +MKGLOBAL(K256_4,data,internal) +K256_4: + dq 0x428a2f98428a2f98, 0x428a2f98428a2f98 + dq 0x7137449171374491, 0x7137449171374491 + dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf + dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5 + dq 0x3956c25b3956c25b, 0x3956c25b3956c25b + dq 0x59f111f159f111f1, 0x59f111f159f111f1 + dq 0x923f82a4923f82a4, 0x923f82a4923f82a4 + dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5 + dq 0xd807aa98d807aa98, 0xd807aa98d807aa98 + dq 0x12835b0112835b01, 0x12835b0112835b01 + dq 0x243185be243185be, 0x243185be243185be + dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3 + dq 0x72be5d7472be5d74, 0x72be5d7472be5d74 + dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe + dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7 + dq 0xc19bf174c19bf174, 0xc19bf174c19bf174 + dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1 + dq 0xefbe4786efbe4786, 0xefbe4786efbe4786 + dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6 + dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc + dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f + dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa + dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc + dq 0x76f988da76f988da, 0x76f988da76f988da + dq 0x983e5152983e5152, 0x983e5152983e5152 + dq 0xa831c66da831c66d, 0xa831c66da831c66d + dq 0xb00327c8b00327c8, 0xb00327c8b00327c8 + dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7 + dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3 + dq 0xd5a79147d5a79147, 0xd5a79147d5a79147 + dq 0x06ca635106ca6351, 0x06ca635106ca6351 + dq 0x1429296714292967, 0x1429296714292967 + dq 0x27b70a8527b70a85, 0x27b70a8527b70a85 + dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138 + dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc + dq 0x53380d1353380d13, 0x53380d1353380d13 + dq 0x650a7354650a7354, 0x650a7354650a7354 + dq 0x766a0abb766a0abb, 0x766a0abb766a0abb + dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e + dq 0x92722c8592722c85, 0x92722c8592722c85 + dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1 + dq 0xa81a664ba81a664b, 0xa81a664ba81a664b + dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70 + dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3 + dq 0xd192e819d192e819, 0xd192e819d192e819 + dq 0xd6990624d6990624, 0xd6990624d6990624 + dq 0xf40e3585f40e3585, 0xf40e3585f40e3585 + dq 0x106aa070106aa070, 0x106aa070106aa070 + dq 0x19a4c11619a4c116, 0x19a4c11619a4c116 + dq 0x1e376c081e376c08, 0x1e376c081e376c08 + dq 0x2748774c2748774c, 0x2748774c2748774c + dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5 + dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3 + dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a + dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f + dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3 + dq 0x748f82ee748f82ee, 0x748f82ee748f82ee + dq 0x78a5636f78a5636f, 0x78a5636f78a5636f + dq 0x84c8781484c87814, 0x84c8781484c87814 + dq 0x8cc702088cc70208, 0x8cc702088cc70208 + dq 0x90befffa90befffa, 0x90befffa90befffa + dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb + dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7 + dq 0xc67178f2c67178f2, 0xc67178f2c67178f2 +PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203 + dq 0x0405060700010203, 0x0c0d0e0f08090a0b + +section .text + +%ifdef LINUX ; Linux definitions + %define arg1 rdi + %define arg2 rsi +%else ; Windows definitions + %define arg1 rcx + %define arg2 rdx +%endif + +; Common definitions +%define STATE arg1 +%define INP_SIZE arg2 + +%define IDX rax +%define ROUND rbx +%define TBL r12 + +%define inp0 r8 +%define inp1 r9 +%define inp2 r10 +%define inp3 r11 + +%define a xmm0 +%define b xmm1 +%define c xmm2 +%define d xmm3 +%define e xmm4 +%define f xmm5 +%define g xmm6 +%define h xmm7 + +%define a0 xmm8 +%define a1 xmm9 +%define a2 xmm10 + +%define TT0 xmm14 +%define TT1 xmm13 +%define TT2 xmm12 +%define TT3 xmm11 +%define TT4 xmm10 +%define TT5 xmm9 + +%define T1 xmm14 +%define TMP xmm15 + +%define SZ4 4*SHA256_DIGEST_WORD_SIZE ; Size of one vector register +%define ROUNDS 64*SZ4 + +; Define stack usage +struc STACK +_DATA: resb SZ4 * 16 +_DIGEST: resb SZ4 * NUM_SHA256_DIGEST_WORDS + resb 8 ; for alignment, must be odd multiple of 8 +endstruc + +%define MOVPS movups + +; transpose r0, r1, r2, r3, t0, t1 +; "transpose" data in {r0..r3} using temps {t0..t3} +; Input looks like: {r0 r1 r2 r3} +; r0 = {a3 a2 a1 a0} +; r1 = {b3 b2 b1 b0} +; r2 = {c3 c2 c1 c0} +; r3 = {d3 d2 d1 d0} +; +; output looks like: {t0 r1 r0 r3} +; t0 = {d0 c0 b0 a0} +; r1 = {d1 c1 b1 a1} +; r0 = {d2 c2 b2 a2} +; r3 = {d3 c3 b3 a3} +; +%macro TRANSPOSE 6 +%define %%r0 %1 +%define %%r1 %2 +%define %%r2 %3 +%define %%r3 %4 +%define %%t0 %5 +%define %%t1 %6 + movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0} + shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0} + shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2} + + movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0} + shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0} + shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2} + + movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0} + shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1} + + movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2} + shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3} + + shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2} + shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0} +%endmacro + + + +%macro ROTATE_ARGS 0 +%xdefine TMP_ h +%xdefine h g +%xdefine g f +%xdefine f e +%xdefine e d +%xdefine d c +%xdefine c b +%xdefine b a +%xdefine a TMP_ +%endm + +; PRORD reg, imm, tmp +%macro PRORD 3 +%define %%reg %1 +%define %%imm %2 +%define %%tmp %3 + movdqa %%tmp, %%reg + psrld %%reg, %%imm + pslld %%tmp, (32-(%%imm)) + por %%reg, %%tmp +%endmacro + +%macro PRORD 2 + PRORD %1, %2, TMP +%endmacro + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_00_15 2 +%define %%T1 %1 +%define %%i %2 + movdqa a0, e ; sig1: a0 = e + movdqa a1, e ; sig1: s1 = e + PRORD a0, (11-6) ; sig1: a0 = (e >> 5) + + movdqa a2, f ; ch: a2 = f + pxor a2, g ; ch: a2 = f^g + pand a2, e ; ch: a2 = (f^g)&e + pxor a2, g ; a2 = ch + + PRORD a1, 25 ; sig1: a1 = (e >> 25) + movdqa [SZ4*(%%i&0xf) + rsp],%%T1 + paddd %%T1,[TBL + ROUND] ; T1 = W + K + pxor a0, e ; sig1: a0 = e ^ (e >> 5) + PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11) + paddd h, a2 ; h = h + ch + movdqa a2, a ; sig0: a2 = a + PRORD a2, (13-2) ; sig0: a2 = (a >> 11) + paddd h, %%T1 ; h = h + ch + W + K + pxor a0, a1 ; a0 = sigma1 + movdqa a1, a ; sig0: a1 = a + movdqa %%T1, a ; maj: T1 = a + PRORD a1, 22 ; sig0: a1 = (a >> 22) + pxor %%T1, c ; maj: T1 = a^c + add ROUND, SZ4 ; ROUND++ + pand %%T1, b ; maj: T1 = (a^c)&b + paddd h, a0 + + paddd d, h + + pxor a2, a ; sig0: a2 = a ^ (a >> 11) + PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13) + pxor a2, a1 ; a2 = sig0 + movdqa a1, a ; maj: a1 = a + pand a1, c ; maj: a1 = a&c + por a1, %%T1 ; a1 = maj + paddd h, a1 ; h = h + ch + W + K + maj + paddd h, a2 ; h = h + ch + W + K + maj + sigma0 + + ROTATE_ARGS +%endm + + +;; arguments passed implicitly in preprocessor symbols i, a...h +%macro ROUND_16_XX 2 +%define %%T1 %1 +%define %%i %2 + movdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp] + movdqa a1, [SZ4*((%%i-2)&0xf) + rsp] + movdqa a0, %%T1 + PRORD %%T1, 18-7 + movdqa a2, a1 + PRORD a1, 19-17 + pxor %%T1, a0 + PRORD %%T1, 7 + pxor a1, a2 + PRORD a1, 17 + psrld a0, 3 + pxor %%T1, a0 + psrld a2, 10 + pxor a1, a2 + paddd %%T1, [SZ4*((%%i-16)&0xf) + rsp] + paddd a1, [SZ4*((%%i-7)&0xf) + rsp] + paddd %%T1, a1 + + ROUND_00_15 %%T1, %%i +%endm + + + +;; SHA256_ARGS: +;; UINT128 digest[8]; // transposed digests +;; UINT8 *data_ptr[4]; +;; + +;; void sha_256_mult_sse(SHA256_ARGS *args, UINT64 num_blocks); +;; arg 1 : STATE : pointer args +;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1) +;; +MKGLOBAL(sha_256_mult_sse,function,internal) +align 32 +sha_256_mult_sse: + ; general registers preserved in outer calling routine + ; outer calling routine saves all the XMM registers + sub rsp, STACK_size + + ;; Load the pre-transposed incoming digest. + movdqa a,[STATE + 0 * SHA256_DIGEST_ROW_SIZE ] + movdqa b,[STATE + 1 * SHA256_DIGEST_ROW_SIZE ] + movdqa c,[STATE + 2 * SHA256_DIGEST_ROW_SIZE ] + movdqa d,[STATE + 3 * SHA256_DIGEST_ROW_SIZE ] + movdqa e,[STATE + 4 * SHA256_DIGEST_ROW_SIZE ] + movdqa f,[STATE + 5 * SHA256_DIGEST_ROW_SIZE ] + movdqa g,[STATE + 6 * SHA256_DIGEST_ROW_SIZE ] + movdqa h,[STATE + 7 * SHA256_DIGEST_ROW_SIZE ] + + DBGPRINTL_XMM "incoming transposed sha256 digest", a, b, c, d, e, f, g, h + lea TBL,[rel K256_4] + + ;; load the address of each of the 4 message lanes + ;; getting ready to transpose input onto stack + mov inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ] + mov inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ] + mov inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ] + mov inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ] + DBGPRINTL64 "incoming input data ptrs ", inp0, inp1, inp2, inp3 + xor IDX, IDX +lloop: + xor ROUND, ROUND + + ;; save old digest + movdqa [rsp + _DIGEST + 0*SZ4], a + movdqa [rsp + _DIGEST + 1*SZ4], b + movdqa [rsp + _DIGEST + 2*SZ4], c + movdqa [rsp + _DIGEST + 3*SZ4], d + movdqa [rsp + _DIGEST + 4*SZ4], e + movdqa [rsp + _DIGEST + 5*SZ4], f + movdqa [rsp + _DIGEST + 6*SZ4], g + movdqa [rsp + _DIGEST + 7*SZ4], h + +%assign i 0 +%rep 4 + movdqa TMP, [rel PSHUFFLE_BYTE_FLIP_MASK] + MOVPS TT2,[inp0+IDX+i*16] + MOVPS TT1,[inp1+IDX+i*16] + MOVPS TT4,[inp2+IDX+i*16] + MOVPS TT3,[inp3+IDX+i*16] + TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5 + pshufb TT0, TMP + pshufb TT1, TMP + pshufb TT2, TMP + pshufb TT3, TMP + ROUND_00_15 TT0,(i*4+0) + ROUND_00_15 TT1,(i*4+1) + ROUND_00_15 TT2,(i*4+2) + ROUND_00_15 TT3,(i*4+3) +%assign i (i+1) +%endrep + add IDX, 4*4*4 + + +%assign i (i*4) + + jmp Lrounds_16_xx +align 16 +Lrounds_16_xx: +%rep 16 + ROUND_16_XX T1, i +%assign i (i+1) +%endrep + + cmp ROUND,ROUNDS + jb Lrounds_16_xx + + ;; add old digest + paddd a, [rsp + _DIGEST + 0*SZ4] + paddd b, [rsp + _DIGEST + 1*SZ4] + paddd c, [rsp + _DIGEST + 2*SZ4] + paddd d, [rsp + _DIGEST + 3*SZ4] + paddd e, [rsp + _DIGEST + 4*SZ4] + paddd f, [rsp + _DIGEST + 5*SZ4] + paddd g, [rsp + _DIGEST + 6*SZ4] + paddd h, [rsp + _DIGEST + 7*SZ4] + + sub INP_SIZE, 1 ;; unit is blocks + jne lloop + + ; write back to memory (state object) the transposed digest + movdqa [STATE+0*SHA256_DIGEST_ROW_SIZE ],a + movdqa [STATE+1*SHA256_DIGEST_ROW_SIZE ],b + movdqa [STATE+2*SHA256_DIGEST_ROW_SIZE ],c + movdqa [STATE+3*SHA256_DIGEST_ROW_SIZE ],d + movdqa [STATE+4*SHA256_DIGEST_ROW_SIZE ],e + movdqa [STATE+5*SHA256_DIGEST_ROW_SIZE ],f + movdqa [STATE+6*SHA256_DIGEST_ROW_SIZE ],g + movdqa [STATE+7*SHA256_DIGEST_ROW_SIZE ],h + DBGPRINTL_XMM "updated transposed sha256 digest", a, b, c, d, e, f, g, h + + ; update input pointers + add inp0, IDX + mov [STATE + _data_ptr_sha256 + 0*8], inp0 + add inp1, IDX + mov [STATE + _data_ptr_sha256 + 1*8], inp1 + add inp2, IDX + mov [STATE + _data_ptr_sha256 + 2*8], inp2 + add inp3, IDX + mov [STATE + _data_ptr_sha256 + 3*8], inp3 + + DBGPRINTL64 "updated input data ptrs ", inp0, inp1, inp2, inp3 + + ;;;;;;;;;;;;;;;; + ;; Postamble + + add rsp, STACK_size + ; outer calling routine restores XMM and other GP registers + ret + +%ifdef LINUX +section .note.GNU-stack noalloc noexec nowrite progbits +%endif |