;;
;; Copyright (c) 2012-2019, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

%ifndef _TRANSPOSE_AVX2_ASM_
%define _TRANSPOSE_AVX2_ASM_

%include "include/reg_sizes.asm"

; LOAD ALL 8 LANES FOR 8x8 32-BIT TRANSPOSE
;
; r0-r7       [out] ymm registers which will contain the data to be transposed
; addr0-addr7 [in]  pointers to the next 32-byte block of data to be fetched for all 8 lanes
; ptr_offset  [in]  offset to be applied on all pointers (addr0-addr7)
%macro TRANSPOSE8_U32_LOAD8 17
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%addr0 %9
%define %%addr1 %10
%define %%addr2 %11
%define %%addr3 %12
%define %%addr4 %13
%define %%addr5 %14
%define %%addr6 %15
%define %%addr7 %16
%define %%ptr_offset %17

; Expected output data
;
; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}

        vmovups XWORD(%%r0),[%%addr0+%%ptr_offset]
        vmovups XWORD(%%r1),[%%addr1+%%ptr_offset]
        vmovups XWORD(%%r2),[%%addr2+%%ptr_offset]
        vmovups XWORD(%%r3),[%%addr3+%%ptr_offset]
        vmovups XWORD(%%r4),[%%addr0+%%ptr_offset+16]
        vmovups XWORD(%%r5),[%%addr1+%%ptr_offset+16]
        vmovups XWORD(%%r6),[%%addr2+%%ptr_offset+16]
        vmovups XWORD(%%r7),[%%addr3+%%ptr_offset+16]

        vinserti128 %%r0, %%r0, [%%addr4+%%ptr_offset], 0x01
        vinserti128 %%r1, %%r1, [%%addr5+%%ptr_offset], 0x01
        vinserti128 %%r2, %%r2, [%%addr6+%%ptr_offset], 0x01
        vinserti128 %%r3, %%r3, [%%addr7+%%ptr_offset], 0x01
        vinserti128 %%r4, %%r4, [%%addr4+%%ptr_offset+16], 0x01
        vinserti128 %%r5, %%r5, [%%addr5+%%ptr_offset+16], 0x01
        vinserti128 %%r6, %%r6, [%%addr6+%%ptr_offset+16], 0x01
        vinserti128 %%r7, %%r7, [%%addr7+%%ptr_offset+16], 0x01
%endmacro
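;; Illustrative usage sketch (not part of the original macro definitions):
;; gather the first 32 bytes of eight lanes into ymm0-ymm7. The register
;; choices (ymm0-ymm7 for data, r8-r15 holding the eight lane pointers) and
;; the offset of 0 are assumptions made for this example only.
;;
;;      TRANSPOSE8_U32_LOAD8 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, \
;;                           r8, r9, r10, r11, r12, r13, r14, r15, 0
;;
;; Afterwards ymm0 holds dwords 0-3 of lanes 0 and 4, ymm4 holds dwords 4-7 of
;; the same lanes, and so on, ready for TRANSPOSE8_U32 below.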
; 8x8 32-BIT TRANSPOSE
;
; Before calling this macro, TRANSPOSE8_U32_LOAD8 must be called.
;
; r0-r3 [in/out]    ymm registers containing bytes 0-15 of each 32B block
;                   (e.g. ymm0 = [e3-e0 a3-a0])
; r4-r7 [in/out]    ymm registers containing bytes 16-31 of each 32B block
;                   (e.g. ymm4 = [e4-e7 a4-a7])
; t0-t1 [clobbered] ymm temporary registers
%macro TRANSPOSE8_U32 10
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%r4 %5
%define %%r5 %6
%define %%r6 %7
%define %%r7 %8
%define %%t0 %9
%define %%t1 %10

; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {e3 e2 e1 e0 a3 a2 a1 a0}
; r1 = {f3 f2 f1 f0 b3 b2 b1 b0}
; r2 = {g3 g2 g1 g0 c3 c2 c1 c0}
; r3 = {h3 h2 h1 h0 d3 d2 d1 d0}
; r4 = {e7 e6 e5 e4 a7 a6 a5 a4}
; r5 = {f7 f6 f5 f4 b7 b6 b5 b4}
; r6 = {g7 g6 g5 g4 c7 c6 c5 c4}
; r7 = {h7 h6 h5 h4 d7 d6 d5 d4}
;
; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}

        ;; process top half (r0..r3)
        vshufps %%t0, %%r0, %%r1, 0x44  ; t0 = {f1 f0 e1 e0 b1 b0 a1 a0}
        vshufps %%r0, %%r0, %%r1, 0xEE  ; r0 = {f3 f2 e3 e2 b3 b2 a3 a2}
        vshufps %%t1, %%r2, %%r3, 0x44  ; t1 = {h1 h0 g1 g0 d1 d0 c1 c0}
        vshufps %%r2, %%r2, %%r3, 0xEE  ; r2 = {h3 h2 g3 g2 d3 d2 c3 c2}

        vshufps %%r1, %%t0, %%t1, 0xDD  ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
        vshufps %%r3, %%r0, %%r2, 0xDD  ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
        vshufps %%r2, %%r0, %%r2, 0x88  ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
        vshufps %%r0, %%t0, %%t1, 0x88  ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}

        ;; process bottom half (r4..r7)
        vshufps %%t0, %%r4, %%r5, 0x44  ; t0 = {f5 f4 e5 e4 b5 b4 a5 a4}
        vshufps %%r4, %%r4, %%r5, 0xEE  ; r4 = {f7 f6 e7 e6 b7 b6 a7 a6}
        vshufps %%t1, %%r6, %%r7, 0x44  ; t1 = {h5 h4 g5 g4 d5 d4 c5 c4}
        vshufps %%r6, %%r6, %%r7, 0xEE  ; r6 = {h7 h6 g7 g6 d7 d6 c7 c6}

        vshufps %%r5, %%t0, %%t1, 0xDD  ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
        vshufps %%r7, %%r4, %%r6, 0xDD  ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
        vshufps %%r6, %%r4, %%r6, 0x88  ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
        vshufps %%r4, %%t0, %%t1, 0x88  ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
%endmacro
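;; Illustrative usage sketch, continuing the example above (register choices
;; are assumptions for the example only): transpose the 8x8 block of dwords
;; loaded into ymm0-ymm7 by TRANSPOSE8_U32_LOAD8, using ymm8 and ymm9 as
;; scratch.
;;
;;      TRANSPOSE8_U32 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9
;;
;; On exit, ymm0 holds dword 0 of all eight lanes, ymm1 holds dword 1, ...,
;; ymm7 holds dword 7.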
; LOAD ALL 4 LANES FOR 4x4 64-BIT TRANSPOSE
;
; r0-r3       [out] ymm registers which will contain the data to be transposed
; addr0-addr3 [in]  pointers to the next 32-byte block of data to be fetched for the 4 lanes
; ptr_offset  [in]  offset to be applied on all pointers (addr0-addr3)
%macro TRANSPOSE4_U64_LOAD4 9
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%addr0 %5
%define %%addr1 %6
%define %%addr2 %7
%define %%addr3 %8
%define %%ptr_offset %9

; Expected output data
;
; r0 = {c1 c0 a1 a0}
; r1 = {d1 d0 b1 b0}
; r2 = {c3 c2 a3 a2}
; r3 = {d3 d2 b3 b2}

        vmovupd XWORD(%%r0),[%%addr0+%%ptr_offset]
        vmovupd XWORD(%%r1),[%%addr1+%%ptr_offset]
        vmovupd XWORD(%%r2),[%%addr0+%%ptr_offset+16]
        vmovupd XWORD(%%r3),[%%addr1+%%ptr_offset+16]

        vinserti128 %%r0, %%r0, [%%addr2+%%ptr_offset], 0x01
        vinserti128 %%r1, %%r1, [%%addr3+%%ptr_offset], 0x01
        vinserti128 %%r2, %%r2, [%%addr2+%%ptr_offset+16], 0x01
        vinserti128 %%r3, %%r3, [%%addr3+%%ptr_offset+16], 0x01
%endmacro

; 4x4 64-BIT TRANSPOSE
;
; Before calling this macro, TRANSPOSE4_U64_LOAD4 must be called.
;
; This macro takes 4 registers as input (r0-r3)
; and transposes their content (64-bit elements),
; outputting the data in registers (o0, r1, o2, r3),
; using two additional registers.
%macro TRANSPOSE4_U64 6
%define %%r0 %1 ; [in]     ymm register for row 0 input (c1-c0 a1-a0)
%define %%r1 %2 ; [in/out] ymm register for row 1 input (d1-d0 b1-b0) and output
%define %%r2 %3 ; [in]     ymm register for row 2 input (c3-c2 a3-a2)
%define %%r3 %4 ; [in/out] ymm register for row 3 input (d3-d2 b3-b2) and output
%define %%o0 %5 ; [out]    ymm register for row 0 output
%define %%o2 %6 ; [out]    ymm register for row 2 output

; Input looks like: {r0 r1 r2 r3}
; r0 = {c1 c0 a1 a0}
; r1 = {d1 d0 b1 b0}
; r2 = {c3 c2 a3 a2}
; r3 = {d3 d2 b3 b2}
;
; Output looks like: {o0 r1 o2 r3}
; o0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; o2 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}

        ; vshufps does not cross the mid-way (128-bit lane) boundary and hence is cheaper
        vshufps %%o0, %%r0, %%r1, 0x44  ; o0 = {d0 c0 b0 a0}
        vshufps %%r1, %%r0, %%r1, 0xEE  ; r1 = {d1 c1 b1 a1}

        vshufps %%o2, %%r2, %%r3, 0x44  ; o2 = {d2 c2 b2 a2}
        vshufps %%r3, %%r2, %%r3, 0xEE  ; r3 = {d3 c3 b3 a3}
%endmacro

%endif ;; _TRANSPOSE_AVX2_ASM_
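;; Illustrative usage sketch for the 4x4 64-bit path (register choices are
;; assumptions for the example only): load 32 bytes from four lanes whose
;; pointers are held in r8-r11, then transpose the 4x4 block of qwords.
;; ymm4 and ymm5 receive rows 0 and 2 of the result; ymm1 and ymm3 are
;; updated in place with rows 1 and 3.
;;
;;      TRANSPOSE4_U64_LOAD4 ymm0, ymm1, ymm2, ymm3, r8, r9, r10, r11, 0
;;      TRANSPOSE4_U64 ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
;;
;; Result: ymm4 = qword 0 of the four lanes, ymm1 = qword 1, ymm5 = qword 2,
;; ymm3 = qword 3.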