Diffstat (limited to 'src/spdk/intel-ipsec-mb/include/transpose_avx2.asm')
-rw-r--r--  src/spdk/intel-ipsec-mb/include/transpose_avx2.asm | 218
1 file changed, 218 insertions, 0 deletions
diff --git a/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm b/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm
new file mode 100644
index 000000000..fed12cf4b
--- /dev/null
+++ b/src/spdk/intel-ipsec-mb/include/transpose_avx2.asm
@@ -0,0 +1,218 @@
+;;
+;; Copyright (c) 2012-2019, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;; * Redistributions of source code must retain the above copyright notice,
+;; this list of conditions and the following disclaimer.
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;; * Neither the name of Intel Corporation nor the names of its contributors
+;; may be used to endorse or promote products derived from this software
+;; without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+%ifndef _TRANSPOSE_AVX2_ASM_
+%define _TRANSPOSE_AVX2_ASM_
+
+%include "include/reg_sizes.asm"
+
+; LOAD ALL 8 LANES FOR 8x8 32-BIT TRANSPOSE
+;
+; r0-r7 [out] ymm registers which will contain the data to be transposed
+; addr0-addr7 [in] pointers to the next 32-byte block of data to be fetched for all 8 lanes
+; ptr_offset [in] offset to be applied on all pointers (addr0-addr7)
+%macro TRANSPOSE8_U32_LOAD8 17
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%addr0 %9
+%define %%addr1 %10
+%define %%addr2 %11
+%define %%addr3 %12
+%define %%addr4 %13
+%define %%addr5 %14
+%define %%addr6 %15
+%define %%addr7 %16
+%define %%ptr_offset %17
+
+; Expected output data
+;
+; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
+; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
+; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
+; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
+; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
+; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
+; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
+
+ vmovups XWORD(%%r0),[%%addr0+%%ptr_offset]
+ vmovups XWORD(%%r1),[%%addr1+%%ptr_offset]
+ vmovups XWORD(%%r2),[%%addr2+%%ptr_offset]
+ vmovups XWORD(%%r3),[%%addr3+%%ptr_offset]
+ vmovups XWORD(%%r4),[%%addr0+%%ptr_offset+16]
+ vmovups XWORD(%%r5),[%%addr1+%%ptr_offset+16]
+ vmovups XWORD(%%r6),[%%addr2+%%ptr_offset+16]
+ vmovups XWORD(%%r7),[%%addr3+%%ptr_offset+16]
+
+ vinserti128 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01
+ vinserti128 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01
+ vinserti128 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01
+ vinserti128 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01
+ vinserti128 %%r4, %%r4, [%%addr4+%%ptr_offset+16], 0x01
+ vinserti128 %%r5, %%r5, [%%addr5+%%ptr_offset+16], 0x01
+ vinserti128 %%r6, %%r6, [%%addr6+%%ptr_offset+16], 0x01
+ vinserti128 %%r7, %%r7, [%%addr7+%%ptr_offset+16], 0x01
+
+%endmacro
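+
+;; Illustrative usage sketch (the ymm and pointer register assignments below
+;; are arbitrary examples, not mandated by the macro):
+;;
+;;      TRANSPOSE8_U32_LOAD8 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, \
+;;                           r8, r9, r10, r11, r12, r13, r14, r15, 0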
+
+; 8x8 32-BIT TRANSPOSE
+;
+; Before calling this macro, TRANSPOSE8_U32_LOAD8 must be called.
+;
+; r0-r3 [in/out] ymm registers containing bytes 0-15 of each 32B block (e.g. ymm0 = [e3-e0 a3-a0])
+; r4-r7 [in/out] ymm registers containing bytes 16-31 of each 32B block (e.g. ymm4 = [e7-e4 a7-a4])
+; t0-t1 [clobbered] ymm temporary registers
+%macro TRANSPOSE8_U32 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
+; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
+; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
+; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
+; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
+; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
+; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+ ; process top half (r0..r3)
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {f1 f0 e1 e0 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {f3 f2 e3 e2 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {h1 h0 g1 g0 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {h3 h2 g3 g2 d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+ vshufps %%r2, %%r0, %%r2, 0x88 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+ vshufps %%r0, %%t0, %%t1, 0x88 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+
+ ;; process bottom half (r4..r7)
+ vshufps %%t0, %%r4, %%r5, 0x44 ; t0 = {f5 f4 e5 e4 b5 b4 a5 a4}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 b7 b6 a7 a6}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 d5 d4 c5 c4}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 d7 d6 c7 c6}
+
+ vshufps %%r5, %%t0, %%t1, 0xDD ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ vshufps %%r7, %%r4, %%r6, 0xDD ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+ vshufps %%r6, %%r4, %%r6, 0x88 ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+ vshufps %%r4, %%t0, %%t1, 0x88 ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+%endmacro
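+
+;; Illustrative usage sketch (register assignments are arbitrary): after the
+;; load macro and this transpose, ymm0 holds dword 0 of all 8 lanes,
+;; ymm1 holds dword 1, and so on up to ymm7 holding dword 7.
+;;
+;;      TRANSPOSE8_U32_LOAD8 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, \
+;;                           r8, r9, r10, r11, r12, r13, r14, r15, 0
+;;      TRANSPOSE8_U32 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, \
+;;                     ymm8, ymm9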
+
+; LOAD ALL 4 LANES FOR 4x4 64-BIT TRANSPOSE
+;
+; r0-r3 [out] ymm registers which will contain the data to be transposed
+; addr0-addr3 [in] pointers to the next 32-byte block of data to be fetched for the 4 lanes
+; ptr_offset [in] offset to be applied on all pointers (addr0-addr3)
+%macro TRANSPOSE4_U64_LOAD4 9
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%addr0 %5
+%define %%addr1 %6
+%define %%addr2 %7
+%define %%addr3 %8
+%define %%ptr_offset %9
+
+; Expected output data
+;
+; r0 = {c1 c0 a1 a0}
+; r1 = {d1 d0 b1 b0}
+; r2 = {c3 c2 a3 a2}
+; r3 = {d3 d2 b3 b2}
+
+ vmovupd XWORD(%%r0),[%%addr0+%%ptr_offset]
+ vmovupd XWORD(%%r1),[%%addr1+%%ptr_offset]
+ vmovupd XWORD(%%r2),[%%addr0+%%ptr_offset+16]
+ vmovupd XWORD(%%r3),[%%addr1+%%ptr_offset+16]
+
+ vinserti128 %%r0, %%r0, [%%addr2+%%ptr_offset], 0x01
+ vinserti128 %%r1, %%r1, [%%addr3+%%ptr_offset], 0x01
	vinserti128 %%r2, %%r2, [%%addr2+%%ptr_offset+16], 0x01
+ vinserti128 %%r3, %%r3, [%%addr3+%%ptr_offset+16], 0x01
+
+%endmacro
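+
+;; Illustrative usage sketch (register assignments are arbitrary):
+;;
+;;      TRANSPOSE4_U64_LOAD4 ymm0, ymm1, ymm2, ymm3, r8, r9, r10, r11, 0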
+
+; 4x4 64-BIT TRANSPOSE
+;
+; Before calling this macro, TRANSPOSE4_U64_LOAD4 must be called.
+;
+; This macro takes 4 registers as input (r0-r3)
+; and transposes their content (64-bit elements),
+; outputting the data in registers (o0, r1, o2, r3),
+; using two additional output registers (o0, o2)
+%macro TRANSPOSE4_U64 6
+%define %%r0 %1 ; [in] ymm register for row 0 input (c1-c0 a1-a0)
+%define %%r1 %2 ; [in/out] ymm register for row 1 input (d0-d1 b1-b0) and output
+%define %%r2 %3 ; [in] ymm register for row 2 input (c3-c2 a3-a2)
+%define %%r3 %4 ; [in/out] ymm register for row 3 input (d3-d2 b3-b2) and output
+%define %%o0 %5 ; [out] ymm register for row 0 output
+%define %%o2 %6 ; [out] ymm register for row 2 output
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {c1 c0 a1 a0}
+; r1 = {d1 d0 b1 b0}
+; r2 = {c3 c2 a3 a2}
+; r3 = {d3 d2 b3 b2}
+;
+; output looks like: {o0 r1 o2 r3}
+; o0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; o2 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
	; vshufps does not cross the 128-bit lane boundary and hence is cheaper than a cross-lane shuffle
+ vshufps %%o0, %%r0, %%r1, 0x44 ; o0 = {d0 c0 b0 a0}
	vshufps %%r1, %%r0, %%r1, 0xEE ; r1 = {d1 c1 b1 a1}
+
	vshufps %%o2, %%r2, %%r3, 0x44 ; o2 = {d2 c2 b2 a2}
+ vshufps %%r3, %%r2, %%r3, 0xEE ; r3 = {d3 c3 b3 a3}
+%endmacro
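+
+;; Illustrative usage sketch (register assignments are arbitrary): after the
+;; load macro and this transpose, qword 0 of the 4 lanes is in ymm4 (o0),
+;; qword 1 in ymm1 (r1), qword 2 in ymm5 (o2) and qword 3 in ymm3 (r3).
+;;
+;;      TRANSPOSE4_U64_LOAD4 ymm0, ymm1, ymm2, ymm3, r8, r9, r10, r11, 0
+;;      TRANSPOSE4_U64 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5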
+
+%endif ;; _TRANSPOSE_AVX2_ASM_