Diffstat (limited to 'src/spdk/intel-ipsec-mb/include/transpose_avx2.asm')
-rw-r--r-- | src/spdk/intel-ipsec-mb/include/transpose_avx2.asm | 218
1 file changed, 218 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm b/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm
new file mode 100644
index 000000000..fed12cf4b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm
@@ -0,0 +1,218 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef _TRANSPOSE_AVX2_ASM_
+%define _TRANSPOSE_AVX2_ASM_
+
+%include "include/reg_sizes.asm"
+
+; LOAD ALL 8 LANES FOR 8x8 32-BIT TRANSPOSE
+;
+; r0-r7       [out] ymm registers which will contain the data to be transposed
+; addr0-addr7 [in] pointers to the next 32-byte block of data to be fetched for all 8 lanes
+; ptr_offset  [in] offset to be applied on all pointers (addr0-addr7)
+%macro TRANSPOSE8_U32_LOAD8 17
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%addr0 %9
+%define %%addr1 %10
+%define %%addr2 %11
+%define %%addr3 %12
+%define %%addr4 %13
+%define %%addr5 %14
+%define %%addr6 %15
+%define %%addr7 %16
+%define %%ptr_offset %17
+
+; Expected output data
+;
+; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
+; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
+; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
+; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
+; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
+; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
+; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
+
+        vmovups XWORD(%%r0),[%%addr0+%%ptr_offset]
+        vmovups XWORD(%%r1),[%%addr1+%%ptr_offset]
+        vmovups XWORD(%%r2),[%%addr2+%%ptr_offset]
+        vmovups XWORD(%%r3),[%%addr3+%%ptr_offset]
+        vmovups XWORD(%%r4),[%%addr0+%%ptr_offset+16]
+        vmovups XWORD(%%r5),[%%addr1+%%ptr_offset+16]
+        vmovups XWORD(%%r6),[%%addr2+%%ptr_offset+16]
+        vmovups XWORD(%%r7),[%%addr3+%%ptr_offset+16]
+
+        vinserti128 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01
+        vinserti128 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01
+        vinserti128 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01
+        vinserti128 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01
+        vinserti128 %%r4, %%r4, [%%addr4+%%ptr_offset+16], 0x01
+        vinserti128 %%r5, %%r5, [%%addr5+%%ptr_offset+16], 0x01
+        vinserti128 %%r6, %%r6, [%%addr6+%%ptr_offset+16], 0x01
+        vinserti128 %%r7, %%r7, [%%addr7+%%ptr_offset+16], 0x01
+
+%endmacro
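Editorial note, not part of this diff: each destination register above is assembled with the same two-instruction gather idiom, a 16-byte load into the low half of the ymm register followed by a vinserti128 that fills the upper half from another lane. A minimal sketch, with rax and rbx as hypothetical pointers to lanes a and e:

        vmovups     xmm0, [rax]               ; low 128 bits:  a3 a2 a1 a0
        vinserti128 ymm0, ymm0, [rbx], 0x01   ; high 128 bits: e3 e2 e1 e0
        ;; ymm0 = {e3 e2 e1 e0 a3 a2 a1 a0}

This lane interleaving is what lets the transpose below finish with in-lane shuffles only.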
+
+; 8x8 32-BIT TRANSPOSE
+;
+; Before calling this macro, TRANSPOSE8_U32_LOAD8 must be called.
+;
+; r0-r3 [in/out] ymm registers containing bytes 0-15 of each 32B block (e.g. ymm0 = [e3-e0 a3-a0])
+; r4-r7 [in/out] ymm registers containing bytes 16-31 of each 32B block (e.g. ymm4 = [e7-e4 a7-a4])
+; t0-t1 [clobbered] ymm temporary registers
+%macro TRANSPOSE8_U32 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
+; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
+; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
+; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
+; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
+; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
+; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+        ;; process top half (r0..r3)
+        vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {f1 f0 e1 e0 b1 b0 a1 a0}
+        vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {f3 f2 e3 e2 b3 b2 a3 a2}
+        vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {h1 h0 g1 g0 d1 d0 c1 c0}
+        vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {h3 h2 g3 g2 d3 d2 c3 c2}
+
+        vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+        vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+        vshufps %%r2, %%r0, %%r2, 0x88 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+        vshufps %%r0, %%t0, %%t1, 0x88 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+
+        ;; process bottom half (r4..r7)
+        vshufps %%t0, %%r4, %%r5, 0x44 ; t0 = {f5 f4 e5 e4 b5 b4 a5 a4}
+        vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 b7 b6 a7 a6}
+        vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 d5 d4 c5 c4}
+        vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 d7 d6 c7 c6}
+
+        vshufps %%r5, %%t0, %%t1, 0xDD ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+        vshufps %%r7, %%r4, %%r6, 0xDD ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+        vshufps %%r6, %%r4, %%r6, 0x88 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+        vshufps %%r4, %%t0, %%t1, 0x88 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+%endmacro
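How a caller combines the two macros above, as an editorial sketch (not part of this diff); the register choices ymm0-ymm9 and lane pointers r8-r15 are assumptions, and the offset is taken as 0:

        ;; gather 32 bytes from each of 8 lanes, then transpose in place
        TRANSPOSE8_U32_LOAD8 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, \
                             r8, r9, r10, r11, r12, r13, r14, r15, 0
        TRANSPOSE8_U32 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9
        ;; ymm0 now holds dword 0 of lanes a-h, ymm1 dword 1, ..., ymm7 dword 7

Because the load macro already interleaves two lanes per register, the transpose needs no cross-lane permutes (vperm2i128/vpermq), only in-lane vshufps.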
+
+; LOAD ALL 4 LANES FOR 4x4 64-BIT TRANSPOSE
+;
+; r0-r3       [out] ymm registers which will contain the data to be transposed
+; addr0-addr3 [in] pointers to the next 32-byte block of data to be fetched for the 4 lanes
+; ptr_offset  [in] offset to be applied on all pointers (addr0-addr3)
+%macro TRANSPOSE4_U64_LOAD4 9
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%addr0 %5
+%define %%addr1 %6
+%define %%addr2 %7
+%define %%addr3 %8
+%define %%ptr_offset %9
+
+; Expected output data
+;
+; r0 = {c1 c0 a1 a0}
+; r1 = {d1 d0 b1 b0}
+; r2 = {c3 c2 a3 a2}
+; r3 = {d3 d2 b3 b2}
+
+        vmovupd XWORD(%%r0),[%%addr0+%%ptr_offset]
+        vmovupd XWORD(%%r1),[%%addr1+%%ptr_offset]
+        vmovupd XWORD(%%r2),[%%addr0+%%ptr_offset+16]
+        vmovupd XWORD(%%r3),[%%addr1+%%ptr_offset+16]
+
+        vinserti128 %%r0, %%r0, [%%addr2+%%ptr_offset], 0x01
+        vinserti128 %%r1, %%r1, [%%addr3+%%ptr_offset], 0x01
+        vinserti128 %%r2, %%r2, [%%addr2+%%ptr_offset+16], 0x01
+        vinserti128 %%r3, %%r3, [%%addr3+%%ptr_offset+16], 0x01
+
+%endmacro
+
+; 4x4 64-BIT TRANSPOSE
+;
+; Before calling this macro, TRANSPOSE4_U64_LOAD4 must be called.
+;
+; This macro takes 4 registers as input (r0-r3)
+; and transposes their content (64-bit elements),
+; outputting the data in registers (o0, r1, o2, r3)
+; and using two additional registers.
+%macro TRANSPOSE4_U64 6
+%define %%r0 %1 ; [in] ymm register for row 0 input (c1-c0 a1-a0)
+%define %%r1 %2 ; [in/out] ymm register for row 1 input (d1-d0 b1-b0) and output
+%define %%r2 %3 ; [in] ymm register for row 2 input (c3-c2 a3-a2)
+%define %%r3 %4 ; [in/out] ymm register for row 3 input (d3-d2 b3-b2) and output
+%define %%o0 %5 ; [out] ymm register for row 0 output
+%define %%o2 %6 ; [out] ymm register for row 2 output
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {c1 c0 a1 a0}
+; r1 = {d1 d0 b1 b0}
+; r2 = {c3 c2 a3 a2}
+; r3 = {d3 d2 b3 b2}
+;
+; Output looks like: {o0 r1 o2 r3}
+; o0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; o2 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+        ;; vshufps does not cross the mid-way (128-bit lane) boundary and hence is cheaper
+        vshufps %%o0, %%r0, %%r1, 0x44 ; o0 = {d0 c0 b0 a0}
+        vshufps %%r1, %%r0, %%r1, 0xEE ; r1 = {d1 c1 b1 a1}
+
+        vshufps %%o2, %%r2, %%r3, 0x44 ; o2 = {d2 c2 b2 a2}
+        vshufps %%r3, %%r2, %%r3, 0xEE ; r3 = {d3 c3 b3 a3}
+%endmacro
+
+%endif ;; _TRANSPOSE_AVX2_ASM_
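For completeness, a hedged usage sketch of the 4x4 pair (editorial, not part of this diff); ymm0-ymm5 and lane pointers r8-r11 are assumed free, with offset 0:

        ;; gather 32 bytes from each of 4 lanes, then transpose
        TRANSPOSE4_U64_LOAD4 ymm0, ymm1, ymm2, ymm3, r8, r9, r10, r11, 0
        TRANSPOSE4_U64 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
        ;; rows end up in ymm4 (row 0), ymm1 (row 1), ymm5 (row 2), ymm3 (row 3)

Note the split output convention: rows 0 and 2 land in the extra registers o0/o2, while rows 1 and 3 overwrite their inputs r1/r3; r0 and r2 are left holding intermediate values.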