diff options
Diffstat (limited to 'third_party/dav1d/src/x86/itx_avx2.asm')
-rw-r--r-- | third_party/dav1d/src/x86/itx_avx2.asm | 5542 |
1 files changed, 5542 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/itx_avx2.asm b/third_party/dav1d/src/x86/itx_avx2.asm new file mode 100644 index 0000000000..a67f053a61 --- /dev/null +++ b/third_party/dav1d/src/x86/itx_avx2.asm @@ -0,0 +1,5542 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 16 + +; Note: The order of (at least some of) those constants matter! + +const deint_shuf, db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +%macro COEF_PAIR 2 +pw_%1_%2: dw %1, %2 +pw_m%2_%1: dw -%2, %1 +%endmacro + +; ADST-only +pw_3803_1321: dw 3803, 1321 +pw_m1321_2482: dw -1321, 2482 +pw_2482_3344: dw 2482, 3344 +pw_m3344_3344: dw -3344, 3344 +pw_m3803_3344: dw -3803, 3344 +pw_m3803_m6688: dw -3803, -6688 +pw_2896_m2896: dw 2896, -2896 + +const pw_5, times 2 dw 5 +const pw_2048, times 2 dw 2048 +const pw_4096, times 2 dw 4096 +const pw_8192, times 2 dw 8192 +const pw_16384, times 2 dw 16384 +const pw_1697x16, times 2 dw 1697*16 +const pw_1697x8, times 2 dw 1697*8 +const pw_2896x8, times 2 dw 2896*8 +const pd_2048, dd 2048 + +const pw_2896_2896, dw 2896, 2896 +const pw_m2896_2896, dw -2896, 2896 +const pw_1567_3784, dw 1567, 3784 +const pw_m3784_1567, dw -3784, 1567 +COEF_PAIR 3784, 1567 +COEF_PAIR 201, 4091 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 2440, 3290 +COEF_PAIR 3035, 2751 +COEF_PAIR 3513, 2106 +COEF_PAIR 3857, 1380 +COEF_PAIR 4052, 601 +COEF_PAIR 401, 4076 +COEF_PAIR 1931, 3612 +COEF_PAIR 3166, 2598 +COEF_PAIR 3920, 1189 +COEF_PAIR 799, 4017 +COEF_PAIR 3406, 2276 +pw_m799_m4017: dw -799, -4017 +const pw_m1567_m3784, dw -1567, -3784 +pw_m3406_m2276: dw -3406, -2276 +pw_m401_m4076: dw -401, -4076 +pw_m3166_m2598: dw -3166, -2598 +pw_m1931_m3612: dw -1931, -3612 +pw_m3920_m1189: dw -3920, -1189 +COEF_PAIR 2276, 3406 +COEF_PAIR 4017, 799 + +%macro COEF_X8 1-* +%rep %0 + dw %1*8, %1*8 + %rotate 1 +%endrep +%endmacro + +pw_3703x8: COEF_X8 3703 +pw_1751x8: COEF_X8 1751 +pw_m1380x8: COEF_X8 -1380 +pw_3857x8: COEF_X8 3857 +pw_3973x8: COEF_X8 3973 +pw_995x8: COEF_X8 995 +pw_m2106x8: COEF_X8 -2106 +pw_3513x8: COEF_X8 3513 +pw_3290x8: COEF_X8 3290 +pw_2440x8: COEF_X8 2440 +pw_m601x8: COEF_X8 -601 +pw_4052x8: COEF_X8 4052 + +const idct64_mul +COEF_X8 4095, 101, 4065, 501, 2967, -2824, 3229, -2520 +COEF_X8 3745, 1660, 3564, 2019, 3822, -1474, 3948, -1092 +COEF_X8 3996, 897, 3889, 1285, 3461, -2191, 3659, -1842 +COEF_X8 3349, 2359, 3102, 2675, 4036, -700, 4085, -301 + +pw_201_4091x8: dw 201*8, 4091*8 +pw_m601_4052x8: dw -601*8, 4052*8 +pw_995_3973x8: dw 995*8, 3973*8 +pw_m1380_3857x8: dw -1380*8, 3857*8 +pw_1751_3703x8: dw 1751*8, 3703*8 +pw_m2106_3513x8: dw -2106*8, 3513*8 +pw_2440_3290x8: dw 2440*8, 3290*8 +pw_m2751_3035x8: dw -2751*8, 3035*8 + +%define o_idct64_offset idct64_mul - (o_base) - 8 + +SECTION .text + +; Code size reduction trickery: Instead of using rip-relative loads with +; mandatory 4-byte offsets everywhere, we can set up a base pointer with a +; single rip-relative lea and then address things relative from that with +; 1-byte offsets as long as data is within +-128 bytes of the base pointer. +%define o_base deint_shuf + 128 +%define o(x) (r6 - (o_base) + (x)) +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +; flags: 1 = swap, 2 = interleave, 4: coef_regs +%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags +%if %7 & 4 + pmaddwd m%2, m%5, m%1 + pmaddwd m%1, m%6 +%else +%if %7 & 1 + vpbroadcastd m%2, [o(pw_%5_%6)] + vpbroadcastd m%3, [o(pw_m%6_%5)] +%else + vpbroadcastd m%2, [o(pw_m%6_%5)] + vpbroadcastd m%3, [o(pw_%5_%6)] +%endif + pmaddwd m%2, m%1 + pmaddwd m%1, m%3 +%endif + paddd m%2, m%4 + paddd m%1, m%4 +%if %7 & 2 + pslld m%2, 4 + psrld m%1, 12 + pblendw m%1, m%2, 0xaa +%else + psrad m%2, 12 + psrad m%1, 12 + packssdw m%1, m%2 +%endif +%endmacro + +; flags: 1 = swap, 2 = interleave, 4 = coef_regs +%macro ITX_MUL4X_PACK 9-10 0 ; dst/src, tmp[1-3], rnd, coef[1-4], flags +%if %10 & 1 + vpbroadcastd m%3, [o(pw_%8_%9)] + vpbroadcastd m%4, [o(pw_m%9_%8)] + vpbroadcastd xm%2, [o(pw_%6_%7)] + vpblendd m%2, m%3, 0xf0 + vpbroadcastd xm%3, [o(pw_m%7_%6)] +%else + vpbroadcastd m%3, [o(pw_m%9_%8)] + vpbroadcastd m%4, [o(pw_%8_%9)] + vpbroadcastd xm%2, [o(pw_m%7_%6)] + vpblendd m%2, m%3, 0xf0 + vpbroadcastd xm%3, [o(pw_%6_%7)] +%endif + vpblendd m%3, m%4, 0xf0 + ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) +%endmacro + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 + punpckhwd m%3, m%2, m%1 + punpcklwd m%2, m%1 +%if %7 < 32 + pmaddwd m%1, m%7, m%2 + pmaddwd m%4, m%7, m%3 +%else + vpbroadcastd m%1, [o(pw_m%7_%6)] + pmaddwd m%4, m%3, m%1 + pmaddwd m%1, m%2 +%endif + paddd m%4, m%5 + paddd m%1, m%5 + psrad m%4, 12 + psrad m%1, 12 + packssdw m%1, m%4 +%if %7 < 32 + pmaddwd m%3, m%6 + pmaddwd m%2, m%6 +%else + vpbroadcastd m%4, [o(pw_%6_%7)] + pmaddwd m%3, m%4 + pmaddwd m%2, m%4 +%endif + paddd m%3, m%5 + paddd m%2, m%5 + psrad m%3, 12 + psrad m%2, 12 +%if %0 == 8 + packssdw m%8, m%2, m%3 +%else + packssdw m%2, m%3 +%endif +%endmacro + +%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, %5 ; t2, t3 + ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, %4 ; t1, t0 + psubsw m%3, m%1, m%2 + paddsw m%2, m%1 + paddsw m%1, m%4, m%5 + psubsw m%4, m%5 +%endmacro + +%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048 + ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a + ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a + ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3 + paddsw m%9, m%2, m%6 ; t4 + psubsw m%2, m%6 ; t5a + paddsw m%10, m%8, m%4 ; t7 + psubsw m%8, m%4 ; t6a + ITX_MULSUB_2W %1, %5, %4, %6, %11, 2896, 2896 ; t1, t0 + ITX_MULSUB_2W %8, %2, %4, %6, %11, 2896, 2896 ; t5, t6 + psubsw m%6, m%1, m%3 ; dct4 out2 + paddsw m%3, m%1 ; dct4 out1 + paddsw m%1, m%5, m%7 ; dct4 out0 + psubsw m%5, m%7 ; dct4 out3 + psubsw m%7, m%3, m%2 ; out6 + paddsw m%2, m%3 ; out1 + paddsw m%3, m%6, m%8 ; out2 + psubsw m%6, m%8 ; out5 + psubsw m%8, m%1, m%10 ; out7 + paddsw m%1, m%10 ; out0 + paddsw m%4, m%5, m%9 ; out3 + psubsw m%5, m%9 ; out4 +%endmacro + +; in1 = %1, in3 = %2, in5 = %3, in7 = %4 +; in9 = %5, in11 = %6, in13 = %7, in15 = %8 +%macro IDCT16_1D_ODDHALF 11 ; src[1-8], tmp[1-2], pd_2048 + ITX_MULSUB_2W %1, %8, %9, %10, %11, 401, 4076 ; t8a, t15a + ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a + psubsw m%9, m%2, m%6 ; t13 + paddsw m%6, m%2 ; t12 + psubsw m%2, m%8, m%4 ; t14 + paddsw m%8, m%4 ; t15 + psubsw m%4, m%7, m%3 ; t10 + paddsw m%3, m%7 ; t11 + psubsw m%7, m%1, m%5 ; t9 + paddsw m%1, m%5 ; t8 + ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a + ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a + psubsw m%5, m%1, m%3 ; t11a + paddsw m%1, m%3 ; t8a + psubsw m%3, m%7, m%4 ; t13 + paddsw m%7, m%4 ; t14 + psubsw m%4, m%8, m%6 ; t12a + paddsw m%8, m%6 ; t15a + psubsw m%6, m%2, m%9 ; t10 + paddsw m%2, m%9 ; t9 + ITX_MULSUB_2W %3, %6, %9, %10, %11, 2896, 2896 ; t10a, t13a + ITX_MULSUB_2W %4, %5, %9, %10, %11, 2896, 2896 ; t11, t12 +%endmacro + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + vpbroadcastd m2, [o(pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + movd m2, [%%row_adr1] + pinsrd m2, [%%row_adr2], 1 + movd m3, [%%row_adr3] + pinsrd m3, [%%row_adr4], 1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + movd [%%row_adr1], m0 + pextrd [%%row_adr2], m0, 1 + pextrd [%%row_adr3], m0, 2 + pextrd [%%row_adr4], m0, 3 + ret +%endmacro + +%macro IWHT4_1D_PACKED 0 + punpckhqdq m3, m0, m1 ; in1 in3 + punpcklqdq m0, m1 ; in0 in2 + psubw m2, m0, m3 + paddw m0, m3 + punpckhqdq m2, m2 ; t2 t2 + punpcklqdq m0, m0 ; t0 t0 + psubw m1, m0, m2 + psraw m1, 1 + psubw m1, m3 ; t1 t3 + psubw m0, m1 ; ____ out0 + paddw m2, m1 ; out3 ____ +%endmacro + +INIT_XMM avx2 +cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, c + mova m0, [cq+16*0] + mova m1, [cq+16*1] + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + psraw m0, 2 + psraw m1, 2 + IWHT4_1D_PACKED + punpckhwd m0, m1 + punpcklwd m3, m1, m2 + punpckhdq m1, m0, m3 + punpckldq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x03 + ITX4_END 3, 0, 2, 1, 0 + +%macro INV_TXFM_FN 3 ; type1, type2, size +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%3_internal_8bpc) + lea r6, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(i%2_%3_internal_8bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4 +%ifidn %1_%2, dct_dct + vpbroadcastw m0, [cq] + vpbroadcastd m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [cq], eobd ; 0 + pmulhrsw m0, m1 + mova m1, m0 + jmp m(iadst_4x4_internal_8bpc).end2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0 + vpbroadcastd m4, [o(pd_2048)] + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 + ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 +%endmacro + +%macro IADST4_1D_PACKED 0 + punpcklwd m2, m1, m0 + punpckhwd m3, m1, m0 + vpbroadcastd m5, [o(pw_m3344_3344)] + vpbroadcastd m0, [o(pw_3803_1321)] + vpbroadcastd m4, [o(pw_m1321_2482)] + pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 + psrld m5, 16 + pmaddwd m0, m2 + pmaddwd m2, m4 + pmaddwd m5, m3 ; 3344*in0 + paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 + vpbroadcastd m4, [o(pw_2482_3344)] + vpbroadcastd m5, [o(pw_m3803_3344)] + pmaddwd m4, m3 + pmaddwd m5, m3 + paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 + vpbroadcastd m0, [o(pw_m3803_m6688)] + pmaddwd m3, m0 + vpbroadcastd m0, [o(pd_2048)] + paddd m2, m0 + paddd m1, m0 + paddd m0, m4 + paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 + paddd m2, m4 + paddd m2, m3 + REPX {psrad x, 12}, m1, m2, m0, m5 + packssdw m0, m5 ; out0 out1 + packssdw m1, m2 ; out2 out3 +%endmacro + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity + +cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + IDCT4_1D_PACKED + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call .main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 0, 1, 2, 3 +ALIGN function_align +cglobal_label .main + IADST4_1D_PACKED + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call m(iadst_4x4_internal_8bpc).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call m(iadst_4x4_internal_8bpc).main +.end: + pxor m2, m2 + mova [cq+16*0], m2 + mova [cq+16*1], m2 +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8bpc).end + +%macro WRITE_4X8 2 ; coefs[1-2] + movd xm4, [dstq+strideq*0] + pinsrd xm4, [dstq+strideq*1], 1 + movd xm5, [dstq+strideq*2] + pinsrd xm5, [dstq+r3 ], 1 + pinsrd xm4, [r2 +strideq*0], 2 + pinsrd xm4, [r2 +strideq*1], 3 + pinsrd xm5, [r2 +strideq*2], 2 + pinsrd xm5, [r2 +r3 ], 3 + pmovzxbw m4, xm4 + pmovzxbw m5, xm5 + paddw m4, m%1 + paddw m5, m%2 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movd [dstq+strideq*0], xm4 + pextrd [dstq+strideq*1], xm4, 1 + pextrd [dstq+strideq*2], xm4, 2 + pextrd [dstq+r3 ], xm4, 3 + movd [r2 +strideq*0], xm5 + pextrd [r2 +strideq*1], xm5, 1 + pextrd [r2 +strideq*2], xm5, 2 + pextrd [r2 +r3 ], xm5, 3 +%endmacro + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_4x8_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT8_1D_PACKED 0 + vpbroadcastd m6, [o(pd_2048)] + punpckhwd m5, m3, m0 ; in7 in1 + punpckhwd m4, m1, m2 ; in3 in5 + punpcklwd m3, m1 ; in6 in2 + punpcklwd m2, m0 ; in4 in0 + ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 + psubsw m0, m5, m4 ; t5a t6a (interleaved) + paddsw m4, m5 ; t4 t7 (interleaved) + ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 + vpbroadcastd m1, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 0, 1, _, 6, 1, 5, 4 ; t6 t5 +%if mmsize > 16 + vbroadcasti128 m1, [o(deint_shuf)] + pshufb m4, m1 +%else + pshufb m4, [o(deint_shuf)] +%endif + psubsw m1, m2, m3 ; tmp3 tmp2 + paddsw m3, m2 ; tmp0 tmp1 + shufps m2, m4, m0, q1032 ; t7 t6 + vpblendd m4, m0, 0xcc ; t4 t5 + paddsw m0, m3, m2 ; out0 out1 + psubsw m3, m2 ; out7 out6 + psubsw m2, m1, m4 ; out4 out5 + paddsw m1, m4 ; out3 out2 +%endmacro + +%macro IADST8_1D_PACKED 1 ; pass + vpbroadcastd m6, [o(pd_2048)] + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 +%if %1 == 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a + psubsw m4, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a +%if mmsize > 16 + vbroadcasti128 m2, [o(deint_shuf)] +%else + mova m2, [o(deint_shuf)] +%endif + pshuflw m1, m1, q2301 + pshufhw m1, m1, q2301 + psubsw m3, m0, m1 ; t3 t2 + paddsw m0, m1 ; -out7 out0 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + pshufb m0, m2 + pshufb m4, m2 + vpbroadcastd m5, [o(pw_m2896_2896)] + pmaddwd m2, m5, m3 + pmaddwd m5, m1 + paddd m2, m6 + paddd m5, m6 + psrad m2, 12 + psrad m5, 12 + packssdw m2, m5 ; out4 -out5 + vpbroadcastd m5, [o(pw_2896_2896)] + pmaddwd m3, m5 + pmaddwd m1, m5 + paddd m3, m6 + paddd m1, m6 + psrad m3, 12 + psrad m1, 12 + packssdw m1, m3 ; out2 -out3 + punpcklqdq m3, m4, m0 ; out6 -out7 + punpckhqdq m0, m4 ; out0 -out1 +%else + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a + psubsw m4, m0, m2 ; t4 t5 + paddsw m0, m2 ; t0 t1 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + shufps m2, m5, m4, q1032 + punpckhwd m4, m2 + punpcklwd m5, m2 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a + psubsw m2, m0, m1 ; t2 t3 + paddsw m0, m1 ; out0 -out7 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + vpbroadcastd m5, [o(pw_2896x8)] + vpblendd m3, m0, m4, 0x33 ; out6 -out7 + vpblendd m0, m4, 0xcc ; out0 -out1 + shufps m4, m2, m1, q1032 ; t3 t7 + vpblendd m1, m2, 0x33 ; t2 t6 + psubsw m2, m1, m4 ; t2-t3 t6-t7 + paddsw m1, m4 ; t2+t3 t6+t7 + pmulhrsw m2, m5 ; out4 -out5 + pshufd m1, m1, q1032 + pmulhrsw m1, m5 ; out2 -out3 +%endif +%endmacro + +INIT_YMM avx2 +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst +INV_TXFM_4X8_FN dct, identity + +cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + IDCT4_1D_PACKED + vbroadcasti128 m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + call .main + vpbroadcastd m4, [o(pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pshufd m1, m1, q1032 + jmp m(iadst_4x8_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + WRAP_XMM IDCT8_1D_PACKED + ret + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call .main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + pxor m5, m5 + psubw m5, m4 +.end: + vpblendd m4, m5, 0xcc +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 + WIN64_RESTORE_XMM + pxor m2, m2 + mova [cq+32*0], m2 + mova [cq+32*1], m2 +.end3: + lea r2, [dstq+strideq*4] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + RET +ALIGN function_align +.main_pass1: + WRAP_XMM IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + WRAP_XMM IADST8_1D_PACKED 2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + jmp tx2q +.pass2: + vextracti128 xm2, m0, 1 + vextracti128 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call m(iadst_4x8_internal_8bpc).main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 + pxor m4, m4 + psubw m4, m5 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + jmp m(iadst_4x8_internal_8bpc).end + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m2, [cq+32*0], q3120 + vpermq m0, [cq+32*1], q3120 + vpbroadcastd m3, [o(pw_2896x8)] + vpbroadcastd m4, [o(pw_1697x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + pmulhrsw m2, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m2 + paddsw m1, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(pw_4096)] + jmp m(iadst_4x8_internal_8bpc).end2 + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + movd xm3, [o(pw_2048)] + mov [cq], eobd + pmulhrsw xm0, xm2 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm3 + vpbroadcastw m0, xm0 + mova m1, m0 + mova m2, m0 + mova m3, m0 + jmp m(iadst_4x16_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT16_1D_PACKED 0 + vpbroadcastd m10, [o(pd_2048)] +.main2: + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd m6, m2 ; dct4 in3 in1 + ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 3 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 3 ; t9a t14a + ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 3 ; t11a t12a + ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 3 ; t10a t13a + ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2 + psubsw m2, m8, m0 ; t9 t14 + paddsw m8, m0 ; t8 t15 + psubsw m0, m1, m5 ; t10 t13 + paddsw m1, m5 ; t11 t12 + vpbroadcastd m5, [o(pw_m3784_1567)] ; reuse pw_1567_3784 + ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 6 ; t9a t14a + vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567 + ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 6 ; t10a t13a + psubsw m4, m8, m1 ; t11a t12a + paddsw m8, m1 ; t8a t15a + psubsw m1, m7, m3 ; t5a t6a + paddsw m7, m3 ; t4 t7 + paddsw m3, m2, m0 ; t9 t14 + psubsw m2, m0 ; t10 t13 +%if mmsize > 16 + vbroadcasti128 m0, [o(deint_shuf)] +%else + mova m0, [o(deint_shuf)] +%endif + pshufb m8, m0 + pshufb m7, m0 + pshufb m3, m0 + ITX_MUL2X_PACK 9, 0, 5, 10, 2896, 2896 ; t0 t1 + vpbroadcastd m0, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 4, 5, _, 10, 5, 0, 4 ; t11 t12 + vpbroadcastd m5, [o(pw_2896_2896)] + ITX_MUL2X_PACK 1, 0, _, 10, 0, 5, 4 ; t6 t5 + vpbroadcastd m0, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + shufps m5, m4, m2, q1032 ; t12 t13a + vpblendd m4, m2, 0xcc ; t11 t10a + shufps m2, m7, m1, q1032 ; t7 t6 + vpblendd m7, m1, 0xcc ; t4 t5 + psubsw m1, m9, m6 ; dct4 out3 out2 + paddsw m9, m6 ; dct4 out0 out1 + psubsw m3, m9, m2 ; dct8 out7 out6 + paddsw m9, m2 ; dct8 out0 out1 + psubsw m2, m1, m7 ; dct8 out4 out5 + paddsw m1, m7 ; dct8 out3 out2 + psubsw m7, m9, m0 ; out15 out14 + paddsw m0, m9 ; out0 out1 + psubsw m6, m1, m5 ; out12 out13 + paddsw m1, m5 ; out3 out2 + psubsw m5, m2, m4 ; out11 out10 + paddsw m2, m4 ; out4 out5 + psubsw m4, m3, m8 ; out8 out9 + paddsw m3, m8 ; out7 out6 +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst +INV_TXFM_4X16_FN dct, identity + +cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(idct_16x4_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m0, m4, m2, m3 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vextracti128 xm4, m0, 1 + vextracti128 xm5, m1, 1 + vextracti128 xm6, m2, 1 + vextracti128 xm7, m3, 1 + call .main + vinserti128 m0, xm4, 1 + vinserti128 m1, xm5, 1 + vpbroadcastd m5, [o(pw_2048)] + vinserti128 m2, xm6, 1 + vinserti128 m3, xm7, 1 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x16_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + WRAP_XMM IDCT16_1D_PACKED + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + REPX {pmulhrsw x, m5}, m4, m2, m3, m0 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + call .main + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m5, [o(pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m1, m0, 0x33 + vpblendd m0, m2, 0x33 + vpblendd m2, m3, 0x33 + vpblendd m3, m1, 0x33 + vpermq m0, m0, q2031 + vpermq m1, m2, q1302 + vpermq m2, m3, q3120 + vpermq m3, m4, q0213 + psubw m6, m7, m5 +.end: + vpblendd m5, m6, 0xcc +.end2: + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + WIN64_RESTORE_XMM + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 +.end3: + lea r2, [dstq+strideq*8] + lea r3, [strideq*3] + WRITE_4X8 0, 1 + lea dstq, [dstq+strideq*4] + lea r2, [r2 +strideq*4] + WRITE_4X8 2, 3 + RET +ALIGN function_align +.main: + vpblendd m4, m1, m0, 0xcc + vpblendd m1, m0, 0x33 + vpblendd m5, m2, m3, 0xcc + vpblendd m2, m3, 0x33 + vperm2i128 m3, m5, m2, 0x31 + vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 + vperm2i128 m4, m1, m4, 0x31 + vinserti128 m1, m5, xm2, 1 ; in4 in7 in6 in5 + pshufd m3, m3, q1032 ; in15 in12 in13 in14 + pshufd m2, m4, q1032 ; in11 in8 in9 in10 +cglobal_label .main2 + vpbroadcastd m8, [o(pd_2048)] + pxor m7, m7 + punpckhwd m4, m3, m0 ; in12 in3 in14 in1 + punpcklwd m0, m3 ; in0 in15 in2 in13 + punpckhwd m3, m2, m1 ; in8 in7 in10 in5 + punpcklwd m1, m2 ; in4 in11 in6 in9 + ITX_MUL4X_PACK 0, 2, 5, 6, 8, 201, 4091, 995, 3973, 3 + ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 + ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 + ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 + psubsw m2, m0, m3 ; t9a t8a t11a t10a + paddsw m0, m3 ; t1a t0a t3a t2a + psubsw m3, m1, m4 ; t13a t12a t15a t14a + paddsw m1, m4 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3 + psubw m6, m7, m5 + ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 + vpbroadcastd m6, [o(pw_m3784_1567)] + vpbroadcastd m5, [o(pw_1567_3784)] + psubsw m4, m0, m1 ; t5 t4 t7 t6 + paddsw m0, m1 ; t1 t0 t3 t2 + psubsw m1, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + psubw m3, m7, m6 ; pw_3784_m1567 + vpblendd m6, m3, 0xf0 + ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a + ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 + vbroadcasti128 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a + vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a + vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 + vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 + pshufd m2, m2, q1032 ; t6a t7a t14 t15 + psubsw m1, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m4, m2 ; -out3 out12 out2 -out13 + psubsw m4, m2 ; t6 t7 t14a t15a + shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a + vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m5, [o(pw_m2896_2896)] + vpbroadcastd m6, [o(pw_2896_2896)] + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + pmaddwd m2, m5, m4 + pmaddwd m4, m6 + pmaddwd m5, m1 + pmaddwd m1, m6 + REPX {paddd x, m8}, m5, m1, m2, m4 + REPX {psrad x, 12}, m5, m2, m1, m4 + packssdw m2, m5 ; -out11 out8 out10 -out9 + packssdw m1, m4 ; -out7 out4 out6 -out5 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpcklwd m4, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m3, m2 + punpckhwd m3, m2 + REPX {pmulhrsw x, m5}, m4, m1, m0, m3 + punpckldq m2, m3, m1 + punpckhdq m3, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_8bpc).main + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + vpbroadcastd m6, [o(pw_2048)] + pshufd m1, m1, q1032 + vpblendd m4, m0, m2, 0x33 + vpblendd m0, m1, 0xcc + vpblendd m1, m3, 0xcc + vpblendd m2, m3, 0x33 + vpermq m0, m0, q3120 + vpermq m1, m1, q0213 + vpermq m2, m2, q2031 + vpermq m3, m4, q1302 + psubw m5, m7, m6 + jmp m(iadst_4x16_internal_8bpc).end + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova m3, [cq+32*0] + mova m2, [cq+32*1] + mova m4, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m8, [o(pw_1697x8)] + pcmpeqw m0, m0 ; -1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m5 + punpckhwd m4, m5 + pmulhrsw m5, m8, m1 + pmulhrsw m6, m8, m2 + pmulhrsw m7, m8, m3 + pmulhrsw m8, m4 + pcmpeqw m9, m0, m1 ; we want to do a signed avg, but pavgw is + pxor m1, m9 ; unsigned. as long as both signs are equal + pcmpeqw m9, m0, m2 ; it still works, but if the input is -1 the + pxor m2, m9 ; pmulhrsw result will become 0 which causes + pcmpeqw m9, m0, m3 ; pavgw to output -32768 instead of 0 unless + pxor m3, m9 ; we explicitly deal with that case here. + pcmpeqw m0, m4 + pxor m4, m0 + pavgw m1, m5 + pavgw m2, m6 + pavgw m3, m7 + pavgw m4, m8 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(pw_1697x16)] + vpbroadcastd m5, [o(pw_2048)] + pmulhrsw m4, m8, m0 + pmulhrsw m6, m8, m1 + pmulhrsw m7, m8, m2 + pmulhrsw m8, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m6 + paddsw m2, m7 + paddsw m3, m8 + jmp m(iadst_4x16_internal_8bpc).end2 + +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] + movq xm%3, [dstq ] + movhps xm%3, [dstq+%5] + movq xm%4, [dstq+%6] + movhps xm%4, [dstq+%7] + pmovzxbw m%3, xm%3 + pmovzxbw m%4, xm%4 +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vextracti128 xm%4, m%3, 1 + movq [dstq ], xm%3 + movhps [dstq+%6], xm%3 + movq [dstq+%5], xm%4 + movhps [dstq+%7], xm%4 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + mov [cq], eobd + pmulhrsw xm0, xm1 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity + +cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm3, [o(pw_2896x8)] + pmulhrsw xm0, xm3, [cq+16*0] + pmulhrsw xm1, xm3, [cq+16*1] + pmulhrsw xm2, xm3, [cq+16*2] + pmulhrsw xm3, [cq+16*3] + call m(idct_4x8_internal_8bpc).main + vbroadcasti128 m4, [o(deint_shuf)] + vinserti128 m3, m1, xm3, 1 + vinserti128 m1, m0, xm2, 1 + shufps m0, m1, m3, q0220 + shufps m1, m3, q1331 + pshufb m0, m4 + pshufb m1, m4 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pxor m3, m3 + psubsw m3, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 +.end2: + vpbroadcastd m2, [o(pw_2048)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + WIN64_RESTORE_XMM +.end3: + pxor m2, m2 + mova [cq+32*0], m2 + mova [cq+32*1], m2 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + RET +ALIGN function_align +cglobal_label .main + IADST4_1D_PACKED + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 + punpckhwd m1, m3, m2 + punpcklwd m3, m2 + pxor m0, m0 + psubsw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_8bpc).main + mova m2, m1 + vpermq m1, m0, q2031 + vpermq m0, m2, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm0, [cq+16*1] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m0, [cq+16*3], 1 + vpbroadcastd m3, [o(pw_2896x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + paddsw m0, m0 + paddsw m1, m1 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_8x4_internal_8bpc).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + or r3d, 8 +.dconly: + pmulhrsw xm0, xm2 +.dconly2: + movd xm2, [pw_2048] + pmulhrsw xm0, xm1 + lea r2, [strideq*3] + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 +.dconly_loop: + WRITE_8X4 0, 0, 1, 2, strideq*1, strideq*2, r2 + lea dstq, [dstq+strideq*4] + sub r3d, 4 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity + +cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m1, [cq+32*1], q3120 ; 2 3 + call .main + shufps m4, m0, m1, q0220 + shufps m5, m0, m1, q1331 + shufps m1, m2, m3, q0220 + shufps m3, m2, m3, q1331 + vbroadcasti128 m0, [o(deint_shuf)] + vpbroadcastd m2, [o(pw_16384)] + REPX {pshufb x, m0}, m4, m5, m1, m3 + REPX {pmulhrsw x, m2}, m4, m5, m1, m3 + vinserti128 m0, m4, xm1, 1 + vperm2i128 m2, m4, m1, 0x31 + vinserti128 m1, m5, xm3, 1 + vperm2i128 m3, m5, m3, 0x31 + jmp tx2q +.pass2: + call .main + vpbroadcastd m4, [o(pw_2048)] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + jmp m(iadst_8x8_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + IDCT8_1D_PACKED + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call .main_pass1 + vpbroadcastd m5, [o(pw_16384)] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + pxor m3, m3 + psubw m3, m5 ; negate odd elements during rounding + pmulhrsw m4, m5 + pmulhrsw m0, m3 + pmulhrsw m1, m5 + pmulhrsw m2, m3 + punpcklwd m3, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + vperm2i128 m2, m3, m0, 0x31 + vinserti128 m0, m3, xm0, 1 + vperm2i128 m3, m4, m1, 0x31 + vinserti128 m1, m4, xm1, 1 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call .main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vpbroadcastd xm4, [o(pw_4096)] + psubw m4, m5 ; lower half = 2048, upper half = -2048 +.end: + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 +.end3: + pmulhrsw m2, m4 + pmulhrsw m3, m4 + WIN64_RESTORE_XMM +.end4: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 5 + RET +ALIGN function_align +.main_pass1: + IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + IADST8_1D_PACKED 2 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call m(iadst_8x8_internal_8bpc).main_pass1 + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + pxor m0, m0 + psubw m0, m5 + pmulhrsw m4, m0 + pmulhrsw m3, m5 + pmulhrsw m2, m0 + pmulhrsw m1, m5 + punpckhwd m0, m4, m3 + punpcklwd m4, m3 + punpckhwd m3, m2, m1 + punpcklwd m2, m1 + vinserti128 m1, m0, xm3, 1 + vperm2i128 m3, m0, m3, 0x31 + vinserti128 m0, m4, xm2, 1 + vperm2i128 m2, m4, m2, 0x31 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8bpc).main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vpbroadcastd xm5, [o(pw_4096)] + psubw m4, m5 ; lower half = -2048, upper half = 2048 + vpermq m5, m3, q2031 + vpermq m3, m0, q2031 + vpermq m0, m2, q2031 + vpermq m2, m1, q2031 + pmulhrsw m1, m0, m4 + pmulhrsw m0, m5, m4 + jmp m(iadst_8x8_internal_8bpc).end3 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, [cq+16*1] + vinserti128 m3, [cq+16*4], 1 + vinserti128 m2, [cq+16*5], 1 + mova xm4, [cq+16*2] + mova xm0, [cq+16*3] + vinserti128 m4, [cq+16*6], 1 + vinserti128 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(pw_4096)] + jmp m(iadst_8x8_internal_8bpc).end + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 16 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly +%endif +%endmacro + +%macro ITX_8X16_LOAD_COEFS 0 + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m0, m4, [cq+32*0] + add cq, 32*4 + pmulhrsw m7, m4, [cq+32*3] + pmulhrsw m1, m4, [cq-32*3] + pmulhrsw m6, m4, [cq+32*2] + pmulhrsw m2, m4, [cq-32*2] + pmulhrsw m5, m4, [cq+32*1] + pmulhrsw m3, m4, [cq-32*1] + pmulhrsw m4, [cq+32*0] +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, identity + +cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m10, [o(pw_16384)] +.pass1_end: + vperm2i128 m9, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + vperm2i128 m8, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 +.pass1_end2: + punpckhwd m7, m5, m6 + punpcklwd m5, m6 + punpcklwd m6, m8, m9 + punpckhwd m8, m9 + REPX {pmulhrsw x, m10}, m2, m0, m4, m3, m5, m6, m7, m8 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m5, m6 + punpckhdq m5, m6 + punpckldq m6, m7, m8 + punpckhdq m7, m8 + jmp tx2q +.pass2: + call .main + REPX {vpermq x, x, q3120}, m0, m2, m4, m6 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7 +.end: + vpbroadcastd m8, [o(pw_2048)] +.end2: + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +.end3: + pxor m8, m8 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 8, 9 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 0, 1 + lea dstq, [dstq+strideq*4] + WRITE_8X4 4, 5, 0, 1 + lea dstq, [dstq+strideq*4] + WRITE_8X4 6, 7, 0, 1 + RET +ALIGN function_align +cglobal_label .main + IDCT16_1D_PACKED + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity + +cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + vpbroadcastd m10, [o(pw_16384)] + pslld m9, m10, 17 + psubw m10, m9 ; 16384, -16384 + jmp m(idct_8x16_internal_8bpc).pass1_end +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + vpbroadcastd m9, [o(pw_2048)] + vpbroadcastd xm8, [o(pw_4096)] + psubw m8, m9 + REPX {vpermq x, x, q2031}, m0, m1, m2, m3 + REPX {vpermq x, x, q3120}, m4, m5, m6, m7 + jmp m(idct_8x16_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + REPX {pshufd x, x, q1032}, m7, m1, m5, m3 +.main2: + vpbroadcastd m10, [o(pd_2048)] + punpckhwd m8, m7, m0 ; in14 in1 + punpcklwd m0, m7 ; in0 in15 + punpcklwd m7, m6, m1 ; in12 in3 + punpckhwd m1, m6 ; in2 in13 + punpckhwd m6, m5, m2 ; in10 in5 + punpcklwd m2, m5 ; in4 in11 + punpcklwd m5, m4, m3 ; in8 in7 + punpckhwd m3, m4 ; in6 in9 + ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 3 ; t0 t1 + ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 3 ; t2 t3 + ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 3 ; t4 t5 + ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 3 ; t6 t7 + ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 3 ; t8 t9 + ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11 + ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13 + ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15 + psubsw m4, m0, m5 ; t9a t8a + paddsw m0, m5 ; t1a t0a + psubsw m5, m1, m6 ; t11a t10a + paddsw m1, m6 ; t3a t2a + psubsw m6, m2, m7 ; t13a t12a + paddsw m2, m7 ; t5a t4a + psubsw m7, m3, m8 ; t15a t14a + paddsw m3, m8 ; t7a t6a + vpbroadcastd m11, [o(pw_m4017_799)] + vpbroadcastd m12, [o(pw_799_4017)] + pxor m9, m9 + ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9 + psubw m8, m9, m11 ; pw_4017_m799 + ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13 + vpbroadcastd m11, [o(pw_m2276_3406)] + vpbroadcastd m12, [o(pw_3406_2276)] + ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11 + psubw m8, m9, m11 ; pw_2276_m3406 + ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15 + psubsw m8, m1, m3 ; t7 t6 + paddsw m1, m3 ; t3 t2 + psubsw m3, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m2, m5, m7 ; t14a t15a + paddsw m7, m5 ; t10a t11a + psubsw m5, m4, m6 ; t12a t13a + paddsw m4, m6 ; t8a t9a + vpbroadcastd m11, [o(pw_m3784_1567)] + vpbroadcastd m12, [o(pw_1567_3784)] + ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a + psubw m6, m9, m11 ; pw_3784_m1567 + ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a + vpbroadcastd m11, [o(pw_m1567_3784)] + vpbroadcastd m12, [o(pw_3784_1567)] + ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14 + psubw m6, m9, m11 ; pw_1567_m3784 + ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12 + vbroadcasti128 m12, [o(deint_shuf)] + paddsw m6, m4, m7 ; -out1 out14 + psubsw m4, m7 ; t10 t11 + psubsw m11, m3, m8 ; t7 t6 + paddsw m8, m3 ; out12 -out3 + psubsw m3, m0, m1 ; t3a t2a + paddsw m0, m1 ; -out15 out0 + paddsw m1, m2, m5 ; -out13 out2 + psubsw m5, m2 ; t15a t14a + pshufb m0, m12 + pshufb m6, m12 + pshufb m8, m12 + pshufb m1, m12 + shufps m7, m6, m0, q1032 ; out14 -out15 + vpblendd m0, m6, 0x33 ; -out1 out0 + punpcklqdq m6, m8, m1 ; out12 -out13 + punpckhqdq m1, m8, m1 ; -out3 out2 + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m8, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + pmaddwd m9, m8, m11 ; -out11 + pmaddwd m2, m12, m5 ; -out5 + pmaddwd m5, m8 ; out10 + pmaddwd m11, m12 ; out4 + REPX {paddd x, m10}, m9, m5, m2, m11 + REPX {psrad x, 12 }, m9, m5, m2, m11 + packssdw m5, m9 ; out10 -out11 + packssdw m2, m11 ; -out5 out4 + pmaddwd m11, m8, m3 ; out8 + vpbroadcastd m8, [o(pw_2896_m2896)] + pmaddwd m3, m12 ; -out7 + pmaddwd m8, m4 ; -out9 + pmaddwd m4, m12 ; out6 + REPX {paddd x, m10}, m11, m3, m8, m4 + REPX {psrad x, 12 }, m11, m3, m8, m4 + packssdw m3, m4 ; -out7 out6 + packssdw m4, m11, m8 ; out8 -out9 + vpbroadcastd m10, [o(pw_16384)] + pxor m9, m9 + ret +ALIGN function_align +cglobal_label .main_pass2_end + vpbroadcastd m8, [o(pw_2896x8)] + pshufb m2, m11, m12 + pshufb m5, m12 + pshufb m3, m12 + pshufb m4, m12 + punpcklqdq m11, m5, m2 ; t15a t7 + punpckhqdq m5, m2 ; t14a t6 + shufps m2, m3, m4, q1032 ; t2a t10 + vpblendd m3, m4, 0xcc ; t3a t11 + psubsw m4, m2, m3 ; out8 -out9 + paddsw m3, m2 ; -out7 out6 + paddsw m2, m5, m11 ; -out5 out4 + psubsw m5, m11 ; out10 -out11 + REPX {pmulhrsw x, m8}, m2, m3, m4, m5 + ret + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_8X16_LOAD_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + vpbroadcastd m9, [o(pw_16384)] + pslld m10, m9, 17 + psubw m10, m9 ; -16384, 16384 + vperm2i128 m9, m4, m0, 0x31 + vinserti128 m0, m4, xm0, 1 + vperm2i128 m8, m5, m1, 0x31 + vinserti128 m4, m5, xm1, 1 + vperm2i128 m5, m7, m3, 0x31 + vinserti128 m3, m7, xm3, 1 + vinserti128 m1, m6, xm2, 1 + vperm2i128 m6, m6, m2, 0x31 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m3, m1 + punpckhwd m3, m1 + jmp m(idct_8x16_internal_8bpc).pass1_end2 +.pass2: + call m(iadst_8x16_internal_8bpc).main + call m(iadst_8x16_internal_8bpc).main_pass2_end + vpbroadcastd m8, [o(pw_2048)] + vpbroadcastd xm9, [o(pw_4096)] + psubw m8, m9 + vpermq m9, m0, q3120 + vpermq m0, m7, q2031 + vpermq m7, m1, q3120 + vpermq m1, m6, q2031 + vpermq m6, m2, q3120 + vpermq m2, m5, q2031 + vpermq m5, m3, q3120 + vpermq m3, m4, q2031 + pmulhrsw m0, m8 + pmulhrsw m1, m8 + pmulhrsw m2, m8 + pmulhrsw m3, m8 + pmulhrsw m4, m5, m8 + pmulhrsw m5, m6, m8 + pmulhrsw m6, m7, m8 + pmulhrsw m7, m9, m8 + jmp m(idct_8x16_internal_8bpc).end3 + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] + pmulhrsw m%2, m%3, m%1 +%if %0 == 4 ; if downshifting by 1 + pmulhrsw m%2, m%4 +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 +%endmacro + +cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, [cq+16*2] + add cq, 16*8 + vinserti128 m3, [cq+16*0], 1 + vinserti128 m2, [cq+16*2], 1 + vpbroadcastd m9, [o(pw_2896x8)] + mova xm4, [cq-16*4] + mova xm5, [cq-16*2] + vinserti128 m4, [cq+16*4], 1 + vinserti128 m5, [cq+16*6], 1 + mova xm7, [cq-16*7] + mova xm6, [cq-16*5] + vinserti128 m7, [cq+16*1], 1 + vinserti128 m6, [cq+16*3], 1 + mova xm8, [cq-16*3] + mova xm0, [cq-16*1] + vinserti128 m8, [cq+16*5], 1 + vinserti128 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m5 + punpckhwd m4, m5 + punpcklwd m5, m7, m6 + punpckhwd m7, m6 + punpcklwd m6, m8, m0 + punpckhwd m8, m0 + REPX {pmulhrsw x, m9}, m1, m2, m3, m4, m5, m6, m7, m8 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m5, m6 + punpckhdq m5, m6 + punpckldq m6, m7, m8 + punpckhdq m7, m8 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(pw_1697x16)] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(idct_8x16_internal_8bpc).end + +%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] + pmovzxbw m%3, [dstq+%5] +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif + pmovzxbw m%4, [dstq+%6] +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vpermq m%3, m%3, q3120 + mova [dstq+%5], xm%3 + vextracti128 [dstq+%6], m%3, 1 +%endmacro + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + or r3d, 4 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [pw_2048] ; intentionally rip-relative + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m3, m3 +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti128 m1, [dstq+strideq*1], 1 + punpckhbw m2, m1, m3 + punpcklbw m1, m3 + paddw m2, m0 + paddw m1, m0 + packuswb m1, m2 + mova [dstq+strideq*0], xm1 + vextracti128 [dstq+strideq*1], m1, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity + +cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] + mova xm4, [cq+16*4] + mova xm5, [cq+16*5] + mova xm6, [cq+16*6] + mova xm7, [cq+16*7] + call m(idct_4x16_internal_8bpc).main + vinserti128 m6, m2, xm6, 1 + vinserti128 m2, m0, xm4, 1 + vinserti128 m0, m1, xm5, 1 + vinserti128 m1, m3, xm7, 1 + punpcklwd m3, m2, m6 + punpckhwd m2, m6 + vpbroadcastd m6, [o(pw_16384)] + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + mova m1, m6 + jmp m(iadst_16x4_internal_8bpc).pass1_end +.pass2: + call .main + jmp m(iadst_16x4_internal_8bpc).end +ALIGN function_align +cglobal_label .main + vpbroadcastd m6, [o(pd_2048)] + IDCT4_1D 0, 1, 2, 3, 4, 5, 6 + ret + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q1230 + vpermq m3, [cq+32*3], q2103 + vpermq m1, [cq+32*1], q1230 + vpermq m2, [cq+32*2], q2103 + call m(iadst_4x16_internal_8bpc).main2 + call m(iadst_4x16_internal_8bpc).main_pass1_end + punpcklwd m4, m3, m1 + punpcklwd m5, m2, m0 + punpckhwd m0, m1 + punpckhwd m2, m3 + vpbroadcastd m1, [o(pw_16384)] + vinserti128 m3, m0, xm2, 1 + vperm2i128 m2, m0, m2, 0x31 + vinserti128 m0, m4, xm5, 1 + vperm2i128 m4, m4, m5, 0x31 + psubw m6, m7, m1 +.pass1_end: + pmulhrsw m3, m1 + pmulhrsw m2, m6 + pmulhrsw m4, m1 + pmulhrsw m0, m6 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + call .main +.end: + vpbroadcastd m4, [o(pw_2048)] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + WIN64_RESTORE_XMM +.end2: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 +.end3: + WRITE_16X2 0, 1, 4, 5, strideq*0, strideq*1 + lea dstq, [dstq+strideq*2] + WRITE_16X2 2, 3, 4, 5, strideq*0, strideq*1 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m6, [o(pw_m3344_3344)] + vpbroadcastd m7, [o(pw_3803_1321)] + vpbroadcastd m8, [o(pw_m1321_2482)] + vpbroadcastd m9, [o(pw_2482_3344)] + punpcklwd m4, m2, m0 ; in2 in0 l + punpckhwd m2, m0 ; in2 in0 h + psrld m5, m6, 16 + pmaddwd m10, m6, m4 ; t2:02 l + pmaddwd m6, m2 ; t2:02 h + pmaddwd m0, m7, m4 ; t0:02 l + pmaddwd m7, m2 ; t0:02 h + pmaddwd m4, m8 ; t1:02 l + pmaddwd m8, m2 ; t1:02 h + punpckhwd m2, m3, m1 ; in3 in1 h + punpcklwd m3, m1 ; in3 in1 l + pmaddwd m1, m5, m2 ; t2:3 h + pmaddwd m5, m3 ; t2:3 l + paddd m6, m1 + vpbroadcastd m1, [o(pd_2048)] + paddd m10, m5 + pmaddwd m5, m9, m3 + pmaddwd m9, m2 + paddd m0, m1 + paddd m7, m1 + paddd m0, m5 ; t0 + t3 + 2048 l + paddd m7, m9 ; t0 + t3 + 2048 h + vpbroadcastd m9, [o(pw_m3803_3344)] + pmaddwd m5, m9, m2 + pmaddwd m9, m3 + paddd m10, m1 ; t2 + 2048 l + paddd m6, m1 ; t2 + 2048 h + paddd m5, m1 ; t1:13 + 2048 h + paddd m1, m9 ; t1:13 + 2048 l + vpbroadcastd m9, [o(pw_m3803_m6688)] + pmaddwd m2, m9 + pmaddwd m3, m9 + paddd m5, m8 ; t1 + t3 + 2048 h + paddd m1, m4 ; t1 + t3 + 2048 l + paddd m8, m7 + paddd m4, m0 + paddd m2, m8 ; t0 + t1 - t3 + 2048 h + paddd m3, m4 ; t0 + t1 - t3 + 2048 l + REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3 + packssdw m0, m7 + packssdw m1, m5 + packssdw m3, m2 + packssdw m2, m10, m6 + ret + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q1230 + vpermq m3, [cq+32*3], q2103 + vpermq m1, [cq+32*1], q1230 + vpermq m2, [cq+32*2], q2103 + call m(iadst_4x16_internal_8bpc).main2 + call m(iadst_4x16_internal_8bpc).main_pass1_end + punpckhwd m4, m3, m2 + punpckhwd m5, m1, m0 + punpcklwd m0, m2 + punpcklwd m1, m3 + vpbroadcastd m6, [o(pw_16384)] + vinserti128 m3, m0, xm1, 1 + vperm2i128 m2, m0, m1, 0x31 + vinserti128 m0, m4, xm5, 1 + vperm2i128 m4, m4, m5, 0x31 + psubw m1, m7, m6 + jmp m(iadst_16x4_internal_8bpc).pass1_end +ALIGN function_align +.pass2: + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m4, [o(pw_2048)] + REPX {pmulhrsw x, m4}, m3, m2, m1, m0 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + WRITE_16X2 3, 2, 4, 5, strideq*0, strideq*1 + lea dstq, [dstq+strideq*2] + WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1 + RET + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm4, [cq+16*1] + vinserti128 m2, [cq+16*4], 1 + vinserti128 m4, [cq+16*5], 1 + mova xm0, [cq+16*2] + mova xm1, [cq+16*3] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 + vpbroadcastd m7, [o(pw_1697x16)] + vpbroadcastd m8, [o(pw_16384)] + punpcklwd m3, m2, m4 + punpckhwd m2, m4 + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + pmulhrsw m0, m7, m1 + pmulhrsw m5, m7, m2 + pmulhrsw m6, m7, m3 + pmulhrsw m7, m4 + REPX {pmulhrsw x, m8}, m0, m5, m6, m7 + paddsw m1, m0 + paddsw m2, m5 + paddsw m3, m6 + paddsw m4, m7 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_16x4_internal_8bpc).end + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 8 + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +%endif +%endmacro + +%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd + vpbroadcastd m8, [o(pw_2896x8)] + vpermq m0, [cq+32*0], q3120 + add cq, 32*4 + vpermq m7, [cq+32*3], q%1 + vpermq m1, [cq-32*3], q%1 + vpermq m6, [cq+32*2], q3120 + vpermq m2, [cq-32*2], q3120 + vpermq m5, [cq+32*1], q%1 + vpermq m3, [cq-32*1], q%1 + vpermq m4, [cq+32*0], q3120 + REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, identity + +cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 3120 + call m(idct_8x16_internal_8bpc).main + vpbroadcastd m10, [o(pw_16384)] + punpckhwd m8, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m9, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + REPX {pmulhrsw x, m10}, m8, m1, m4, m6 +.pass1_end: + REPX {pmulhrsw x, m10}, m0, m2, m9, m5 + punpckhwd m3, m0, m8 + punpcklwd m0, m8 + punpckhwd m8, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m9, m4 + punpckhwd m9, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m8 + punpckhdq m3, m8 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m8, m9, m5 + punpckhdq m9, m5 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m8, 0x31 + vinserti128 m2, xm8, 1 + vperm2i128 m7, m3, m9, 0x31 + vinserti128 m3, xm9, 1 + jmp tx2q +.pass2: + call .main + vpbroadcastd m8, [o(pw_2048)] +.end: + REPX {pmulhrsw x, m8}, m0, m2, m4, m6 +.end2: + REPX {pmulhrsw x, m8}, m1, m3, m5, m7 + lea r3, [strideq*3] + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 +.end3: + pxor m0, m0 + REPX {mova [cq+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3 +.end4: + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m10, [o(pd_2048)] +.main2: + IDCT8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 1302 + call m(iadst_8x16_internal_8bpc).main2 + call m(iadst_8x16_internal_8bpc).main_pass1_end + psubw m11, m9, m10 + punpcklwd m8, m0, m2 + punpckhwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m9, m4, m6 + punpckhwd m4, m6 + punpckhwd m6, m5, m7 + punpcklwd m5, m7 + REPX {pmulhrsw x, m11}, m8, m1, m4, m6 + jmp m(idct_16x8_internal_8bpc).pass1_end +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + pxor m8, m8 + psubw m8, m9 + REPX {pmulhrsw x, m9}, m0, m2, m4, m6 + jmp m(idct_16x8_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + vpbroadcastd m10, [o(pd_2048)] + ITX_MULSUB_2W 7, 0, 8, 9, 10, 401, 4076 ; t1a, t0a + ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a + ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a + psubsw m8, m2, m6 ; t6 + paddsw m2, m6 ; t2 + psubsw m6, m0, m4 ; t4 + paddsw m0, m4 ; t0 + psubsw m4, m5, m1 ; t7 + paddsw m5, m1 ; t3 + psubsw m1, m7, m3 ; t5 + paddsw m7, m3 ; t1 + ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a + psubsw m9, m6, m8 ; t7 + paddsw m6, m8 ; out6 + psubsw m3, m7, m5 ; t3 + paddsw m7, m5 ; -out7 + psubsw m5, m0, m2 ; t2 + paddsw m0, m2 ; out0 + psubsw m2, m1, m4 ; t6 + paddsw m1, m4 ; -out1 + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m11, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + punpckhwd m4, m3, m5 + punpcklwd m3, m5 + pmaddwd m5, m11, m4 + pmaddwd m4, m12 + pmaddwd m8, m11, m3 + pmaddwd m3, m12 + REPX {paddd x, m10}, m5, m4, m8, m3 + REPX {psrad x, 12 }, m5, m8, m4, m3 + packssdw m3, m4 ; -out3 + packssdw m4, m8, m5 ; out4 + punpcklwd m5, m9, m2 + punpckhwd m9, m2 + pmaddwd m2, m12, m5 + pmaddwd m5, m11 + pmaddwd m12, m9 + pmaddwd m11, m9 + REPX {paddd x, m10}, m2, m5, m12, m11 + REPX {psrad x, 12 }, m2, m12, m5, m11 + packssdw m2, m12 ; out2 + packssdw m5, m11 ; -out5 + ret +ALIGN function_align +cglobal_label .main_pass2_end + vpbroadcastd m8, [o(pw_2896x8)] + psubsw m4, m5, m3 + paddsw m3, m5 + psubsw m5, m2, m9 + paddsw m2, m9 + pmulhrsw m2, m8 ; out2 + pmulhrsw m3, m8 ; -out3 + pmulhrsw m4, m8 ; out4 + pmulhrsw m5, m8 ; -out5 + vpbroadcastd m9, [o(pw_2048)] + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + ITX_16X8_LOAD_COEFS 1302 + call m(iadst_8x16_internal_8bpc).main2 + call m(iadst_8x16_internal_8bpc).main_pass1_end + psubw m9, m10 + punpcklwd m8, m6, m4 + punpckhwd m6, m4 + punpcklwd m4, m7, m5 + punpckhwd m7, m5 + punpckhwd m5, m3, m1 + punpcklwd m3, m1 + punpckhwd m1, m2, m0 + punpcklwd m2, m0 + REPX {pmulhrsw x, m10}, m8, m4, m5, m1 + REPX {pmulhrsw x, m9 }, m6, m7, m3, m2 + punpcklwd m0, m7, m4 + punpckhwd m7, m4 + punpckhwd m4, m6, m8 + punpcklwd m6, m8 + punpckhwd m8, m3, m5 + punpcklwd m3, m5 + punpcklwd m5, m2, m1 + punpckhwd m2, m1 + punpckhdq m1, m0, m6 + punpckldq m0, m6 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckhdq m4, m3, m5 + punpckldq m3, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m4, 0x31 + vinserti128 m1, xm4, 1 + vperm2i128 m4, m0, m3, 0x31 + vinserti128 m0, xm3, 1 + vinserti128 m3, m7, xm8, 1 + vperm2i128 m7, m8, 0x31 + jmp tx2q +.pass2: + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + pxor m8, m8 + psubw m8, m9 + pmulhrsw m10, m7, m8 + pmulhrsw m7, m0, m9 + pmulhrsw m0, m6, m9 + pmulhrsw m6, m1, m8 + pmulhrsw m1, m5, m8 + pmulhrsw m5, m2, m9 + pmulhrsw m2, m4, m9 + pmulhrsw m4, m3, m8 + lea r3, [strideq*3] + WRITE_16X2 10, 0, 8, 9, strideq*0, strideq*1 + WRITE_16X2 1, 2, 0, 1, strideq*2, r3 + jmp m(idct_16x8_internal_8bpc).end3 + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, stride, c, eob, tx2 + mova xm7, [cq+16*0] + mova xm2, [cq+16*1] + add cq, 16*8 + vpbroadcastd m3, [o(pw_2896x8)] + vinserti128 m7, [cq+16*0], 1 + vinserti128 m2, [cq+16*1], 1 + mova xm6, [cq-16*6] + mova xm4, [cq-16*5] + vinserti128 m6, [cq+16*2], 1 + vinserti128 m4, [cq+16*3], 1 + mova xm8, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m8, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm0, [cq-16*2] + mova xm1, [cq-16*1] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 + vpbroadcastd m10, [o(pw_1697x16)] + vpbroadcastd m11, [o(pw_16384)] + REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 + punpcklwd m3, m7, m2 + punpckhwd m7, m2 + punpcklwd m2, m6, m4 + punpckhwd m6, m4 + punpcklwd m4, m8, m5 + punpckhwd m8, m5 + punpcklwd m5, m0, m1 + punpckhwd m0, m1 + punpckldq m1, m3, m2 + punpckhdq m3, m2 + punpckldq m2, m4, m5 + punpckhdq m4, m5 + punpckldq m5, m7, m6 + punpckhdq m7, m6 + punpckldq m6, m8, m0 + punpckhdq m8, m0 + REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8 + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m6 + punpckhqdq m5, m6 + punpcklqdq m6, m7, m8 + punpckhqdq m7, m8 + jmp tx2q +.pass2: + vpbroadcastd m8, [o(pw_4096)] + jmp m(idct_16x8_internal_8bpc).end + +%define o_base pw_5 + 128 + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + or r3d, 16 + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +%endif +%endmacro + +%macro ITX_16X16_LOAD_COEFS 0 + mova m0, [cq+32*0] + mova m1, [cq+32*1] + mova m2, [cq+32*2] + mova m3, [cq+32*3] + add cq, 32*8 + mova m4, [cq-32*4] + mova m5, [cq-32*3] + mova m6, [cq-32*2] + mova m7, [cq-32*1] + mova m8, [cq+32*0] + mova m9, [cq+32*1] + mova m10, [cq+32*2] + mova m11, [cq+32*3] + mova m12, [cq+32*4] + mova m13, [cq+32*5] + mova m14, [cq+32*6] + mova m15, [cq+32*7] + mova [rsp], m15 +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, identity + +cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call .main +.pass1_end: + vpbroadcastd m1, [o(pw_8192)] + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + vextracti128 [rsp+16*5], m8, 1 + mova [rsp+16*1], xm8 +.pass1_end2: + vextracti128 [rsp+16*4], m0, 1 + mova [rsp+16*0], xm0 + REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 + pmulhrsw m1, [rsp+32*1] + vperm2i128 m8, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vperm2i128 m9, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vperm2i128 m10, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + vperm2i128 m11, m4, m12, 0x31 + vinserti128 m4, xm12, 1 + vperm2i128 m12, m5, m13, 0x31 + vinserti128 m5, xm13, 1 + vperm2i128 m13, m6, m14, 0x31 + vinserti128 m6, xm14, 1 + vperm2i128 m14, m7, m15, 0x31 + vinserti128 m7, xm15, 1 + mova m15, [rsp+32*2] +.pass1_end3: + punpcklwd m0, m9, m10 + punpckhwd m9, m10 + punpcklwd m10, m15, m8 + punpckhwd m15, m8 + punpckhwd m8, m11, m12 + punpcklwd m11, m12 + punpckhwd m12, m13, m14 + punpcklwd m13, m14 + punpckhdq m14, m11, m13 + punpckldq m11, m13 + punpckldq m13, m15, m9 + punpckhdq m15, m9 + punpckldq m9, m10, m0 + punpckhdq m10, m0 + punpckhdq m0, m8, m12 + punpckldq m8, m12 + punpcklqdq m12, m13, m8 + punpckhqdq m13, m8 + punpcklqdq m8, m9, m11 + punpckhqdq m9, m11 + punpckhqdq m11, m10, m14 + punpcklqdq m10, m14 + punpcklqdq m14, m15, m0 + punpckhqdq m15, m0 + mova m0, [rsp] + mova [rsp], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + jmp tx2q +.pass2: + call .main +.end: + vpbroadcastd m1, [o(pw_2048)] + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + mova [rsp], m6 +.end2: + REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 + pmulhrsw m1, [rsp+32*1] + lea r3, [strideq*3] + WRITE_16X2 0, 1, 6, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 [rsp], 7, 0, 1, strideq*2, r3 +.end3: + pxor m2, m2 + REPX {mova [cq+32*x], m2}, -8, -7, -6, -5, -4, -3, -2, -1 + lea dstq, [dstq+strideq*4] + WRITE_16X2 8, 9, 0, 1, strideq*0, strideq*1 + WRITE_16X2 10, 11, 0, 1, strideq*2, r3 + REPX {mova [cq+32*x], m2}, 0, 1, 2, 3, 4, 5, 6, 7 + lea dstq, [dstq+strideq*4] + WRITE_16X2 12, 13, 0, 1, strideq*0, strideq*1 + WRITE_16X2 14, 15, 0, 1, strideq*2, r3 + RET +ALIGN function_align +cglobal_label .main + vpbroadcastd m15, [o(pd_2048)] + mova [rsp+gprsize+32*1], m1 + mova [rsp+gprsize+32*2], m9 + IDCT8_1D 0, 2, 4, 6, 8, 10, 12, 14, 1, 9, 15 + mova m1, [rsp+gprsize+32*2] ; in9 + mova [rsp+gprsize+32*2], m14 ; tmp7 + mova m9, [rsp+gprsize+32*1] ; in1 + mova [rsp+gprsize+32*1], m10 ; tmp5 + mova m14, [rsp+gprsize+32*0] ; in15 + mova [rsp+gprsize+32*0], m6 ; tmp3 + IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15 + mova m6, [rsp+gprsize+32*1] ; tmp5 + psubsw m15, m0, m14 ; out15 + paddsw m0, m14 ; out0 + psubsw m14, m2, m13 ; out14 + paddsw m2, m13 ; out1 + mova [rsp+gprsize+32*1], m2 + psubsw m13, m4, m11 ; out13 + paddsw m2, m4, m11 ; out2 + psubsw m11, m8, m7 ; out11 + paddsw m4, m8, m7 ; out4 + mova m7, [rsp+gprsize+32*2] ; tmp7 + psubsw m10, m6, m5 ; out10 + paddsw m5, m6 ; out5 + psubsw m8, m7, m9 ; out8 + paddsw m7, m9 ; out7 + psubsw m9, m12, m3 ; out9 + paddsw m6, m12, m3 ; out6 + mova m3, [rsp+gprsize+32*0] ; tmp3 + psubsw m12, m3, m1 ; out12 + paddsw m3, m1 ; out3 + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call .main + call .main_pass1_end + pmulhrsw m0, m1, [cq+32*0] + pmulhrsw m2, m1, [cq+32*1] + REPX {pmulhrsw x, m1}, m4, m6, m8, m10 + pmulhrsw m12, m1, [cq+32*2] + pmulhrsw m14, m1, [cq+32*3] + vextracti128 [rsp+16*5], m8, 1 + mova [rsp+16*1], xm8 + pxor m8, m8 + psubw m1, m8, m1 + jmp m(idct_16x16_internal_8bpc).pass1_end2 +ALIGN function_align +.pass2: + call .main + call .main_pass2_end + REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + mova [rsp+32*0], m6 + pxor m6, m6 + psubw m1, m6, m1 + jmp m(idct_16x16_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + vpbroadcastd m15, [o(pd_2048)] + mova [rsp+gprsize+32*1], m0 + mova [rsp+gprsize+32*2], m4 + ITX_MULSUB_2W 13, 2, 0, 4, 15, 995, 3973 ; t3, t2 + ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6 + ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10 + ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14 + psubsw m0, m2, m10 ; t10a + paddsw m2, m10 ; t2a + psubsw m10, m13, m5 ; t11a + paddsw m13, m5 ; t3a + psubsw m5, m6, m14 ; t14a + paddsw m6, m14 ; t6a + psubsw m14, m9, m1 ; t15a + paddsw m9, m1 ; t7a + ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10 + ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15 + psubsw m1, m10, m14 ; t14a + paddsw m10, m14 ; t10a + psubsw m14, m0, m5 ; t15a + paddsw m0, m5 ; t11a + psubsw m5, m2, m6 ; t6 + paddsw m2, m6 ; t2 + psubsw m6, m13, m9 ; t7 + paddsw m13, m9 ; t3 + ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a + ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15 + mova m9, [rsp+gprsize+32*0] ; in15 + mova [rsp+gprsize+32*0], m10 ; t10a + mova m4, [rsp+gprsize+32*1] ; in0 + mova [rsp+gprsize+32*1], m6 ; t6a + mova m6, [rsp+gprsize+32*2] ; in4 + mova [rsp+gprsize+32*2], m2 ; t2 + ITX_MULSUB_2W 9, 4, 2, 10, 15, 201, 4091 ; t1, t0 + ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4 + ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8 + ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12 + psubsw m10, m4, m8 ; t8a + paddsw m8, m4 ; t0a + psubsw m4, m9, m7 ; t9a + paddsw m9, m7 ; t1a + psubsw m7, m6, m12 ; t12a + paddsw m6, m12 ; t4a + psubsw m12, m11, m3 ; t13a + paddsw m11, m3 ; t5a + ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8 + ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13 + psubsw m3, m9, m11 ; t5 + paddsw m9, m11 ; t1 + psubsw m11, m4, m12 ; t12a + paddsw m4, m12 ; t8a + paddsw m12, m8, m6 ; t0 + psubsw m8, m6 ; t4 + paddsw m6, m10, m7 ; t9a + psubsw m10, m7 ; t13a + ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12 + mova m7, [rsp+gprsize+32*0] ; t10a + mova m2, [rsp+gprsize+32*1] ; t6a + paddsw m15, m9, m13 ; -out15 + psubsw m9, m13 ; t3a + paddsw m13, m11, m1 ; -out13 + psubsw m11, m1 ; t15a + psubsw m1, m4, m7 ; t10 + paddsw m7, m4 ; -out1 + psubsw m4, m3, m2 ; t6 + paddsw m3, m2 ; -out3 + paddsw m2, m10, m14 ; out2 + psubsw m10, m14 ; t14a + paddsw m14, m6, m0 ; out14 + psubsw m6, m0 ; t11 + mova m0, [rsp+gprsize+32*2] ; t2 + mova [rsp+gprsize+32*1], m7 + psubsw m7, m12, m0 ; t2a + paddsw m0, m12 ; out0 + paddsw m12, m8, m5 ; out12 + psubsw m8, m5 ; t7 + ret +ALIGN function_align +.main_pass1_end: + mova [cq+32*0], m0 + mova [cq+32*1], m2 + mova [cq+32*2], m12 + mova [cq+32*3], m14 + vpbroadcastd m14, [pw_m2896_2896] + vpbroadcastd m12, [pw_2896_2896] + vpbroadcastd m2, [pd_2048] + punpcklwd m5, m11, m10 + punpckhwd m11, m10 + pmaddwd m10, m14, m5 + pmaddwd m0, m14, m11 + pmaddwd m5, m12 + pmaddwd m11, m12 + REPX {paddd x, m2}, m10, m0, m5, m11 + REPX {psrad x, 12}, m10, m0, m5, m11 + packssdw m10, m0 ; out10 + packssdw m5, m11 ; -out5 + punpcklwd m11, m8, m4 + punpckhwd m8, m4 + pmaddwd m4, m12, m11 + pmaddwd m0, m12, m8 + pmaddwd m11, m14 + pmaddwd m8, m14 + REPX {paddd x, m2}, m4, m0, m11, m8 + REPX {psrad x, 12}, m4, m0, m11, m8 + packssdw m4, m0 ; out4 + packssdw m11, m8 ; -out11 + punpcklwd m8, m9, m7 + punpckhwd m9, m7 + pmaddwd m7, m12, m8 + pmaddwd m0, m12, m9 + pmaddwd m8, m14 + pmaddwd m9, m14 + REPX {paddd x, m2}, m7, m0, m8, m9 + REPX {psrad x, 12}, m7, m0, m8, m9 + packssdw m7, m0 ; -out7 + packssdw m8, m9 ; out8 + punpckhwd m0, m6, m1 + punpcklwd m6, m1 + pmaddwd m1, m14, m0 + pmaddwd m9, m14, m6 + pmaddwd m0, m12 + pmaddwd m6, m12 + REPX {paddd x, m2}, m1, m9, m0, m6 + REPX {psrad x, 12}, m1, m9, m0, m6 + packssdw m9, m1 ; -out7 + packssdw m6, m0 ; out8 + vpbroadcastd m1, [o(pw_8192)] + ret +ALIGN function_align +cglobal_label .main_pass2_end + ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to + ; 16-bit here will produce the same result as using 32-bit intermediates. + paddsw m5, m10, m11 ; -out5 + psubsw m10, m11 ; out10 + psubsw m11, m4, m8 ; -out11 + paddsw m4, m8 ; out4 + psubsw m8, m7, m9 ; out8 + paddsw m7, m9 ; -out7 + psubsw m9, m1, m6 ; -out9 + paddsw m6, m1 ; out6 + vpbroadcastd m1, [o(pw_2896x8)] + REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 + vpbroadcastd m1, [o(pw_2048)] + ret + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + ITX_16X16_LOAD_COEFS + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass1_end + pmulhrsw m6, m1 + pmulhrsw m2, m1, m8 + mova [rsp+32*2], m6 + pmulhrsw m6, m1, m4 + pmulhrsw m4, m1, m10 + pmulhrsw m8, m1, [cq+32*3] + pmulhrsw m10, m1, [cq+32*2] + pmulhrsw m12, m1, [cq+32*1] + pmulhrsw m14, m1, [cq+32*0] + pxor m0, m0 + psubw m0, m1 + REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15 + pmulhrsw m1, m0, m9 + pmulhrsw m9, m0, m13 + pmulhrsw m0, [rsp+32*1] + mova [rsp+16*0], xm15 + mova [rsp+16*1], xm7 + vperm2i128 m15, m15, m7, 0x31 + vinserti128 m7, m2, xm14, 1 + vperm2i128 m14, m2, m14, 0x31 + vinserti128 m2, m9, xm5, 1 + vperm2i128 m9, m9, m5, 0x31 + vinserti128 m5, m4, xm12, 1 + vperm2i128 m12, m4, m12, 0x31 + vinserti128 m4, m11, xm3, 1 + vperm2i128 m11, m11, m3, 0x31 + vinserti128 m3, m10, xm6, 1 + vperm2i128 m10, m10, m6, 0x31 + vinserti128 m6, m1, xm0, 1 + vperm2i128 m13, m1, m0, 0x31 + vinserti128 m1, m8, [rsp+32*2], 1 + vperm2i128 m8, m8, [rsp+32*2], 0x31 + jmp m(idct_16x16_internal_8bpc).pass1_end3 +.pass2: + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end + pmulhrsw m0, m1 + pmulhrsw m8, m1 + mova [rsp+32*0], m0 + mova [rsp+32*2], m8 + pxor m0, m0 + psubw m0, m1 + pmulhrsw m8, m0, m7 + pmulhrsw m7, m0, m9 + pmulhrsw m9, m1, m6 + pmulhrsw m6, m1, m10 + pmulhrsw m10, m0, m5 + pmulhrsw m5, m0, m11 + pmulhrsw m11, m1, m4 + pmulhrsw m4, m1, m12 + pmulhrsw m12, m0, m3 + pmulhrsw m3, m0, m13 + pmulhrsw m13, m1, m2 + pmulhrsw m1, m14 + pmulhrsw m14, m0, [rsp+32*1] + pmulhrsw m0, m15 + lea r3, [strideq*3] + WRITE_16X2 0, 1, 2, 0, strideq*0, strideq*1 + mova m15, [rsp+32*0] + WRITE_16X2 3, 4, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 5, 6, 0, 1, strideq*0, strideq*1 + WRITE_16X2 7, [rsp+32*2], 0, 1, strideq*2, r3 + jmp m(idct_16x16_internal_8bpc).end3 + +%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 + pmulhrsw m%2, m%3, m%1 + psraw m%2, 1 + pavgw m%1, m%2 ; signs are guaranteed to be equal +%endmacro + +INV_TXFM_16X16_FN identity, dct +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 + vpbroadcastd m7, [o(pw_1697x16)] + mova xm0, [cq+16* 0] + vinserti128 m0, [cq+16*16], 1 + mova xm15, [cq+16* 1] + vinserti128 m15, [cq+16*17], 1 + mova xm1, [cq+16* 2] + vinserti128 m1, [cq+16*18], 1 + mova xm8, [cq+16* 3] + vinserti128 m8, [cq+16*19], 1 + mova xm2, [cq+16* 4] + vinserti128 m2, [cq+16*20], 1 + mova xm9, [cq+16* 5] + vinserti128 m9, [cq+16*21], 1 + mova xm3, [cq+16* 6] + vinserti128 m3, [cq+16*22], 1 + mova xm10, [cq+16* 7] + add cq, 16*16 + vinserti128 m10, [cq+16* 7], 1 + mova xm4, [cq-16* 8] + vinserti128 m4, [cq+16* 8], 1 + mova xm11, [cq-16* 7] + vinserti128 m11, [cq+16* 9], 1 + mova xm5, [cq-16* 6] + vinserti128 m5, [cq+16*10], 1 + mova xm12, [cq-16* 5] + vinserti128 m12, [cq+16*11], 1 + mova xm13, [cq-16* 3] + vinserti128 m13, [cq+16*13], 1 + mova xm14, [cq-16* 1] + vinserti128 m14, [cq+16*15], 1 + REPX {IDTX16B x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \ + 10, 4, 11, 5, 12, 13, 14 + mova xm6, [cq-16* 4] + vinserti128 m6, [cq+16*12], 1 + mova [rsp], m0 + IDTX16B 6, 0, 7 + mova xm0, [cq-16* 2] + vinserti128 m0, [cq+16*14], 1 + pmulhrsw m7, m0 + psraw m7, 1 + pavgw m7, m0 + jmp m(idct_16x16_internal_8bpc).pass1_end3 +ALIGN function_align +.pass2: + vpbroadcastd m15, [o(pw_1697x16)] + mova [rsp+32*1], m0 + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14 + mova m0, [rsp+32*1] + mova [rsp+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [rsp+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + jmp m(idct_16x16_internal_8bpc).end + +%define o_base deint_shuf + 128 + +%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 +%if %3 + vpbroadcastd m15, [o(pw_2896x8)] + pmulhrsw m0, m15, [%1+%2*0] + pmulhrsw m1, m15, [%1+%2*1] + pmulhrsw m2, m15, [%1+%2*2] + pmulhrsw m3, m15, [%1+%2*3] + pmulhrsw m4, m15, [%1+%2*4] + pmulhrsw m5, m15, [%1+%2*5] + pmulhrsw m6, m15, [%1+%2*6] + pmulhrsw m7, m15, [%1+%2*7] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] + mova m7, [%1+%2*7] +%endif +%endmacro + +%macro LOAD_8ROWS_H 2-3 0 ; src, stride, is_rect2 +%if %3 +%if %3 == 1 + vpbroadcastd m15, [o(pw_2896x8)] +%endif + pmulhrsw m8, m15, [%1+%2*0] + pmulhrsw m9, m15, [%1+%2*1] + pmulhrsw m10, m15, [%1+%2*2] + pmulhrsw m11, m15, [%1+%2*3] + pmulhrsw m12, m15, [%1+%2*4] + pmulhrsw m13, m15, [%1+%2*5] + pmulhrsw m14, m15, [%1+%2*6] + pmulhrsw m15, [%1+%2*7] +%else + mova m8, [%1+%2*0] + mova m9, [%1+%2*1] + mova m10, [%1+%2*2] + mova m11, [%1+%2*3] + mova m12, [%1+%2*4] + mova m13, [%1+%2*5] + mova m14, [%1+%2*6] + mova m15, [%1+%2*7] +%endif +%endmacro + +%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] + vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%4_%5x8] + punpcklwd m%1, m%2, m%2 + pmulhrsw m%1, m%3 + vpbroadcastd m%3, [r5-pw_201_4091x8+pw_%6_%7x8] + punpckhwd m%2, m%2 + pmulhrsw m%2, m%3 +%endmacro + +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob + %undef cmp + cmp eobd, 106 + jle .fast + LOAD_8ROWS cq+32*1, 32*2 + call m(idct_16x8_internal_8bpc).main + vperm2i128 m11, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + vperm2i128 m4, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + pxor m7, m7 + REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpcklwd m3, m11, m4 + punpckhwd m11, m4 + punpckhwd m4, m5, m6 + punpcklwd m5, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m5 + punpckhdq m3, m5 + punpckhdq m5, m11, m4 + punpckldq m11, m4 + punpckldq m4, m7, m1 + punpckhdq m7, m1 + punpckhqdq m12, m6, m0 + punpcklqdq m0, m6 ; out4 + punpckhqdq m13, m7, m4 + punpcklqdq m4, m7 ; out5 + punpckhqdq m14, m3, m2 + punpcklqdq m2, m3 ; out6 + punpckhqdq m15, m5, m11 + punpcklqdq m11, m5 ; out7 + mova [rsp+32*0], m0 + mova [rsp+32*1], m4 + mova [rsp+32*2], m2 +.fast: + LOAD_8ROWS cq+32*0, 32*2 + call m(idct_16x8_internal_8bpc).main + vperm2i128 m8, m0, m4, 0x31 + vinserti128 m0, xm4, 1 + vperm2i128 m4, m1, m5, 0x31 + vinserti128 m1, xm5, 1 + vperm2i128 m5, m2, m6, 0x31 + vinserti128 m2, xm6, 1 + vperm2i128 m6, m3, m7, 0x31 + vinserti128 m3, xm7, 1 + vpbroadcastd m9, [o(pw_8192)] + pxor m7, m7 + REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m8, m4 + punpcklwd m8, m4 + punpckhwd m4, m5, m6 + punpcklwd m5, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckldq m2, m8, m5 + punpckhdq m8, m5 + punpckhdq m5, m3, m4 + punpckldq m3, m4 + punpckhdq m4, m7, m1 + punpckldq m7, m1 + punpcklqdq m1, m7, m4 + punpckhqdq m7, m4 ; out9 + punpckhqdq m4, m2, m8 ; out10 + punpcklqdq m2, m8 + punpckhqdq m8, m3, m5 + punpcklqdq m3, m5 + punpckhqdq m5, m0, m6 ; out8 + punpcklqdq m0, m6 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m7 + cmp eobd, 106 + jg .full + mova [rsp+32*0], m5 + mova [rsp+32*1], m7 + mova [rsp+32*2], m4 + pmulhrsw m11, m9, m8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call .main_fast + jmp .pass2 +.dconly: + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly +.full: + REPX {pmulhrsw x, m9}, m12, m13, m14, m15 + pmulhrsw m6, m9, [rsp+32*2] + mova [rsp+32*2], m4 + pmulhrsw m4, m9, [rsp+32*0] + mova [rsp+32*0], m5 + pmulhrsw m5, m9, [rsp+32*1] + mova [rsp+32*1], m7 + pmulhrsw m7, m9, m11 + pmulhrsw m11, m9, m8 + call .main +.pass2: + vpbroadcastd m12, [o(pw_2048)] + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m13, m14, m15 + pmulhrsw m12, [rsp] + REPX {vpermq x, x, q3120}, m0, m2, m4, m6, m8, m10, m12, m14 + REPX {vpermq x, x, q2031}, m1, m3, m5, m7, m9, m11, m13, m15 + mova [rsp+32*0], m4 + mova [rsp+32*1], m6 + lea r3, [strideq*3] + WRITE_8X4 0, 1, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 [rsp+32*0], 5, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 [rsp+32*1], 7, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 8, 9, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 10, 11, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 12, 13, 4, 6 + lea dstq, [dstq+strideq*4] + WRITE_8X4 14, 15, 4, 6 + RET +ALIGN function_align +cglobal_label .main_fast ; bottom half is zero + call m(idct_8x16_internal_8bpc).main + mova m8, [rsp+gprsize+0*32] + mova [rsp+gprsize+0*32], m0 + mova m9, [rsp+gprsize+1*32] + mova [rsp+gprsize+1*32], m1 + mova m0, [rsp+gprsize+2*32] + mova [rsp+gprsize+2*32], m6 + lea r5, [r6-(o_base)+pw_201_4091x8] + ITX_UNPACK_MULHRSW 1, 8, 6, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 15, 9, 6, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 14, 0, 6, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 13, 11, 6, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a + jmp .main2 +ALIGN function_align +cglobal_label .main + call m(idct_8x16_internal_8bpc).main + mova m8, [rsp+gprsize+0*32] + mova [rsp+gprsize+0*32], m0 + mova m9, [rsp+gprsize+1*32] + mova [rsp+gprsize+1*32], m1 + mova m0, [rsp+gprsize+2*32] + mova [rsp+gprsize+2*32], m6 + punpcklwd m1, m15, m8 ; in31 in1 + punpckhwd m8, m15 ; in3 in29 + punpcklwd m15, m14, m9 ; in27 in5 + punpckhwd m9, m14 ; in7 in25 + punpcklwd m14, m13, m0 ; in23 in9 + punpckhwd m0, m13 ; in11 in21 + punpcklwd m13, m12, m11 ; in19 in13 + punpckhwd m11, m12 ; in15 in17 + ITX_MUL2X_PACK 1, 6, 12, 10, 201, 4091, 3 ; t16a, t31a + ITX_MUL2X_PACK 8, 6, 12, 10, 4052, 601, 3 ; t23a, t24a + ITX_MUL2X_PACK 15, 6, 12, 10, 995, 3973, 3 ; t20a, t27a + ITX_MUL2X_PACK 9, 6, 12, 10, 3857, 1380, 3 ; t19a, t28a + ITX_MUL2X_PACK 14, 6, 12, 10, 1751, 3703, 3 ; t18a, t29a + ITX_MUL2X_PACK 0, 6, 12, 10, 3513, 2106, 3 ; t21a, t26a + ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a + ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a +.main2: + psubsw m6, m1, m11 ; t17 t30 + paddsw m1, m11 ; t16 t31 + psubsw m11, m9, m14 ; t18 t29 + paddsw m9, m14 ; t19 t28 + psubsw m14, m15, m0 ; t21 t26 + paddsw m15, m0 ; t20 t27 + psubsw m0, m8, m13 ; t22 t25 + paddsw m8, m13 ; t23 t24 + ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a + ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a + ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a + ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a + psubsw m13, m1, m9 ; t19a t28a + paddsw m1, m9 ; t16a t31a + psubsw m9, m8, m15 ; t20a t27a + paddsw m8, m15 ; t23a t24a + psubsw m15, m6, m11 ; t18 t29 + paddsw m6, m11 ; t17 t30 + psubsw m11, m0, m14 ; t21 t26 + paddsw m0, m14 ; t22 t25 + ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 3 ; t18a t29a + ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 3 ; t19 t28 + ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 3 ; t20 t27 + ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 3 ; t21a t26a + vbroadcasti128 m12, [o(deint_shuf)] + psubsw m14, m1, m8 ; t23 t24 + paddsw m1, m8 ; t16 t31 + psubsw m8, m6, m0 ; t22a t25a + paddsw m6, m0 ; t17a t30a + psubsw m0, m15, m11 ; t21 t26 + paddsw m15, m11 ; t18 t29 + psubsw m11, m13, m9 ; t20a t27a + paddsw m13, m9 ; t19a t28a + REPX {pshufb x, m12}, m1, m6, m15, m13 + ITX_MUL2X_PACK 14, 9, 12, 10, 2896, 2896 ; t24a t23a + vpbroadcastd m9, [o(pw_m2896_2896)] + ITX_MUL2X_PACK 8, 12, _, 10, 12, 9, 4 ; t22 t25 + vpbroadcastd m12, [o(pw_2896_2896)] + ITX_MUL2X_PACK 0, 12, _, 10, 12, 9, 4 ; t21a t26a + vpbroadcastd m12, [o(pw_2896_2896)] + ITX_MUL2X_PACK 11, 9, _, 10, 9, 12, 4 ; t27 t20 + shufps m9, m14, m8, q1032 ; t23a t22 + vpblendd m14, m8, 0xcc ; t24a t25 + shufps m8, m11, m0, q1032 ; t20 t21a + vpblendd m11, m0, 0xcc ; t27 t26a + punpcklqdq m0, m1, m6 ; t16 t17a + punpckhqdq m1, m6 ; t31 t30a + psubsw m10, m5, m8 ; out20 out21 + paddsw m5, m8 ; out11 out10 + psubsw m6, m3, m14 ; out24 out25 + paddsw m3, m14 ; out7 out6 + psubsw m8, m7, m0 ; out16 out17 + paddsw m7, m0 ; out15 out14 + mova m0, [rsp+gprsize+0*32] + punpcklqdq m12, m13, m15 ; t19a t18 + punpckhqdq m13, m15 ; t28a t29 + psubsw m15, m0, m1 ; out31 out30 + paddsw m0, m1 ; out0 out1 + mova m1, [rsp+gprsize+1*32] + mova [rsp+gprsize+0*32], m6 + mova m6, [rsp+gprsize+2*32] + psubsw m14, m1, m13 ; out28 out29 + paddsw m1, m13 ; out3 out2 + psubsw m13, m2, m11 ; out27 out26 + paddsw m2, m11 ; out4 out5 + psubsw m11, m4, m9 ; out23 out22 + paddsw m4, m9 ; out8 out9 + psubsw m9, m6, m12 ; out19 out18 + paddsw m6, m12 ; out12 out13 + ret + +%macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2] + vbroadcasti128 m%1, [cq+16*%3] + vbroadcasti128 m%2, [cq+16*%4] + shufpd m%1, m%2, 0x0c +%endmacro + +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + or r3d, 8 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [pw_2048] ; intentionally rip-relative + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m3, m3 +.dconly_loop: + mova m1, [dstq] + punpckhbw m2, m1, m3 + punpcklbw m1, m3 + paddw m2, m0 + paddw m1, m0 + packuswb m1, m2 + mova [dstq], m1 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 4, 16, 32*3, dst, stride, c, eob + %undef cmp + LOAD_PACKED_16X2 0, 7, 0, 2 ; in0 in2 + LOAD_PACKED_16X2 4, 7, 1, 3 ; in1 in3 + LOAD_PACKED_16X2 1, 7, 4, 6 ; in4 in6 + LOAD_PACKED_16X2 5, 7, 5, 7 ; in5 in7 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + add cq, 16*16 + LOAD_PACKED_16X2 2, 7, -8, -6 ; in8 in10 + LOAD_PACKED_16X2 6, 7, -7, -5 ; in9 in11 + LOAD_PACKED_16X2 3, 7, -4, -2 ; in12 in14 + LOAD_PACKED_16X2 11, 7, -3, -1 ; in13 in15 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1 + mova [rsp+32*0], m4 + mova [rsp+32*1], m5 + mova [rsp+32*2], m6 + cmp eobd, 106 + jg .full + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast + jmp .pass2 +.full: + LOAD_PACKED_16X2 4, 7, 0, 2 ; in16 in18 + LOAD_PACKED_16X2 12, 7, 3, 1 ; in19 in17 + LOAD_PACKED_16X2 5, 7, 4, 6 ; in20 in22 + LOAD_PACKED_16X2 13, 7, 7, 5 ; in23 in21 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + add cq, 16*8 + LOAD_PACKED_16X2 6, 7, 0, 2 ; in24 in26 + LOAD_PACKED_16X2 14, 7, 3, 1 ; in27 in25 + LOAD_PACKED_16X2 7, 8, 4, 6 ; in28 in30 + LOAD_PACKED_16X2 15, 8, 7, 5 ; in31 in29 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 1, 2, 3 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main +.pass2: + vpbroadcastd m12, [o(pw_8192)] + REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15 + mova [rsp+32*1], m9 + mova [rsp+32*2], m10 + punpckhwd m9, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m3 + punpcklwd m1, m3 + punpcklwd m10, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + punpckhwd m3, m0, m9 + punpcklwd m0, m9 + punpckhwd m9, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m10, m4 + punpckhwd m10, m4 + punpcklwd m4, m5, m6 + punpckhwd m5, m6 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m9 + punpckhdq m3, m9 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m10, m5 + punpckhdq m10, m5 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m6, m7, m9, m10 + pmulhrsw m12, [rsp+32*0] + mova [rsp+32*0], m8 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m9, 0x31 + vinserti128 m2, xm9, 1 + vperm2i128 m7, m3, m10, 0x31 + vinserti128 m3, xm10, 1 + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m8, [o(pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + lea r2, [strideq*3] + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r2 + lea r3, [dstq+strideq*4] + %define dstq r3 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r2 + mova m0, [rsp+32*0] + mova m1, [rsp+32*1] + mova m2, [rsp+32*2] + punpckhwd m7, m0, m2 + punpcklwd m0, m2 + punpckhwd m2, m1, m11 + punpcklwd m1, m11 + punpckhwd m4, m12, m14 + punpcklwd m12, m14 + punpckhwd m5, m13, m15 + punpcklwd m13, m15 + punpckhwd m3, m0, m7 + punpcklwd m0, m7 + punpckhwd m9, m2, m1 + punpcklwd m2, m1 + punpcklwd m7, m12, m4 + punpckhwd m12, m4 + punpcklwd m4, m5, m13 + punpckhwd m5, m13 + punpckhdq m1, m0, m2 + punpckldq m0, m2 + punpckldq m2, m3, m9 + punpckhdq m3, m9 + punpckldq m6, m7, m4 + punpckhdq m7, m4 + punpckldq m9, m12, m5 + punpckhdq m12, m5 + vperm2i128 m4, m0, m6, 0x31 + vinserti128 m0, xm6, 1 + vperm2i128 m5, m1, m7, 0x31 + vinserti128 m1, xm7, 1 + vperm2i128 m6, m2, m9, 0x31 + vinserti128 m2, xm9, 1 + vperm2i128 m7, m3, m12, 0x31 + vinserti128 m3, xm12, 1 + call m(idct_16x8_internal_8bpc).main2 + vpbroadcastd m8, [o(pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + add r0, 16 + add r3, 16 + %define dstq r0 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r2 + %define dstq r3 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r2 + RET + +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 5, 11, dst, stride, c, eob + vpbroadcastd m9, [pw_5] + lea r4, [strideq*3] + sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) +.loop: + mova xm0,[cq+16* 0] + mova xm1, [cq+16* 4] + vinserti128 m0, [cq+16* 1], 1 + vinserti128 m1, [cq+16* 5], 1 + pxor m8, m8 + mova [cq+32*0], m8 + mova [cq+32*2], m8 + add cq, 16*16 + mova xm2, [cq-16* 8] + mova xm3, [cq-16* 4] + vinserti128 m2, [cq-16* 7], 1 + vinserti128 m3, [cq-16* 3], 1 + mova xm4, [cq+16* 0] + mova xm5, [cq+16* 4] + vinserti128 m4, [cq+16* 1], 1 + vinserti128 m5, [cq+16* 5], 1 + mova xm6, [cq+16* 8] + mova xm7, [cq+16*12] + vinserti128 m6, [cq+16* 9], 1 + vinserti128 m7, [cq+16*13], 1 + REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6 + REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose8x8 + REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_8X4 0, 4, 8, 10, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 1, 5, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 2, 6, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + WRITE_8X4 3, 7, 0, 4, strideq*8, strideq*4, r4*4 + add dstq, strideq + sub cq, 16*16-32 + lea dstq, [dstq+r4*4] + add eobd, 0x80000000 + jnc .loop + RET +ALIGN function_align +.transpose8x8: + punpckhwd m8, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m8, m1 + punpckhdq m8, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + ret + +cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 10, dst, stride, c, eob + add cq, 16*8 + vpbroadcastd m9, [pw_4096] + lea r4, [strideq*3] + lea r5, [dstq+strideq*4] + sub eobd, 107 +.loop: + mova xm0, [cq-16*8] + mova xm1, [cq-16*7] + vinserti128 m0, [cq+16*0], 1 + vinserti128 m1, [cq+16*1], 1 + mova xm2, [cq-16*6] + mova xm3, [cq-16*5] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m3, [cq+16*3], 1 + mova xm4, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m4, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm6, [cq-16*2] + mova xm7, [cq-16*1] + vinserti128 m6, [cq+16*6], 1 + vinserti128 m7, [cq+16*7], 1 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r4 + %define dstq r5 + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r4 + add cq, 16*16 + add r0, 16 + add r5, 16 + add eobd, 0x80000000 + jnc .loop + RET + +%define o_base pw_5 + 128 + +%macro LOAD_16ROWS 2-4 0, 1 ; src, stride, is_rect2, zero_coefs +%if %3 + vpbroadcastd m15, [o(pw_2896x8)] + pmulhrsw m0, m15, [%1+%2* 0] + pmulhrsw m1, m15, [%1+%2* 1] + pmulhrsw m2, m15, [%1+%2* 2] + pmulhrsw m3, m15, [%1+%2* 3] + pmulhrsw m4, m15, [%1+%2* 4] + pmulhrsw m5, m15, [%1+%2* 5] + pmulhrsw m6, m15, [%1+%2* 6] + pmulhrsw m7, m15, [%1+%2* 7] + pmulhrsw m8, m15, [%1+%2* 8] + pmulhrsw m9, m15, [%1+%2* 9] + pmulhrsw m10, m15, [%1+%2*10] + pmulhrsw m11, m15, [%1+%2*11] + pmulhrsw m12, m15, [%1+%2*12] + pmulhrsw m13, m15, [%1+%2*13] + pmulhrsw m14, m15, [%1+%2*14] + pmulhrsw m15, [%1+%2*15] +%else + mova m0, [%1+%2* 0] + mova m1, [%1+%2* 1] + mova m2, [%1+%2* 2] + mova m3, [%1+%2* 3] + mova m4, [%1+%2* 4] + mova m5, [%1+%2* 5] + mova m6, [%1+%2* 6] + mova m7, [%1+%2* 7] + mova m8, [%1+%2* 8] + mova m9, [%1+%2* 9] + mova m10, [%1+%2*10] + mova m11, [%1+%2*11] + mova m12, [%1+%2*12] + mova m13, [%1+%2*13] + mova m14, [%1+%2*14] + mova m15, [%1+%2*15] +%endif + mova [rsp], m15 +%if %4 + pxor m15, m15 + REPX {mova [%1+%2*x], m15}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15 +%endif +%endmacro + +%macro IDCT32_PASS2_END 7 ; coefs[1-2], tmp[1-2], rnd, offset[1-2] + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 + pmovzxbw m%4, [dstq+%6] + pmulhrsw m%3, m%5 + pmulhrsw m%1, m%5 + paddw m%3, m%4 + pmovzxbw m%4, [r2+%7] + paddw m%1, m%4 + packuswb m%3, m%1 + vpermq m%3, m%3, q3120 + mova [dstq+%6], xm%3 + vextracti128 [r2+%7], m%3, 1 +%endmacro + +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*35, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3 + %undef cmp + LOAD_16ROWS cq, 64, 1 + call m(idct_16x16_internal_8bpc).main + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + lea tmp3q, [tmp1q+32*16] + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_16384)] + call .transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [tmp3q-32*4+ 0], xm0 + vextracti128 [tmp3q+32*0+ 0], m0, 1 + mova [tmp3q-32*3+ 0], xm2 + vextracti128 [tmp3q+32*1+ 0], m2, 1 + mova [tmp3q-32*2+ 0], xm4 + vextracti128 [tmp3q+32*2+ 0], m4, 1 + mova [tmp3q-32*1+ 0], xm6 + vextracti128 [tmp3q+32*3+ 0], m6, 1 + mova [tmp3q-32*4+16], xm8 + vextracti128 [tmp3q+32*0+16], m8, 1 + mova [tmp3q-32*3+16], xm10 + vextracti128 [tmp3q+32*1+16], m10, 1 + mova [tmp3q-32*2+16], xm12 + vextracti128 [tmp3q+32*2+16], m12, 1 + mova [tmp3q-32*1+16], xm14 + vextracti128 [tmp3q+32*3+16], m14, 1 + cmp eobd, 150 + jg .full + vinserti128 m0, m1, xm9, 1 + vperm2i128 m4, m1, m9, 0x31 + vinserti128 m2, m5, xm13, 1 + vperm2i128 m6, m5, m13, 0x31 + vinserti128 m1, m3, xm11, 1 + vperm2i128 m5, m3, m11, 0x31 + vinserti128 m3, m7, xm15, 1 + vperm2i128 m7, m7, m15, 0x31 + call .main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct16 +.dconly: + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +.full: + mova [tmp1q-32*4], m1 + mova [tmp1q-32*3], m3 + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m7 + mova [tmp1q+32*0], m9 + mova [tmp1q+32*1], m11 + mova [tmp1q+32*2], m13 + mova [tmp1q+32*3], m15 + LOAD_16ROWS cq+32, 64, 1 + call m(idct_16x16_internal_8bpc).main + lea r2, [tmp3q+32*8] + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_16384)] + call .transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [r2-32*4+ 0], xm0 + vextracti128 [r2+32*0+ 0], m0, 1 + mova [r2-32*3+ 0], xm2 + vextracti128 [r2+32*1+ 0], m2, 1 + mova [r2-32*2+ 0], xm4 + vextracti128 [r2+32*2+ 0], m4, 1 + mova [r2-32*1+ 0], xm6 + vextracti128 [r2+32*3+ 0], m6, 1 + mova [r2-32*4+16], xm8 + vextracti128 [r2+32*0+16], m8, 1 + mova [r2-32*3+16], xm10 + vextracti128 [r2+32*1+16], m10, 1 + mova [r2-32*2+16], xm12 + vextracti128 [r2+32*2+16], m12, 1 + mova [r2-32*1+16], xm14 + vextracti128 [r2+32*3+16], m14, 1 + vinserti128 m8, m1, xm9, 1 + vperm2i128 m12, m1, m9, 0x31 + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp1q+32*0], 1 + vinserti128 m1, [tmp1q+32*1], 1 + vinserti128 m10, m5, xm13, 1 + vperm2i128 m14, m5, m13, 0x31 + mova xm4, [tmp1q-32*4+16] + mova xm5, [tmp1q-32*3+16] + vinserti128 m4, [tmp1q+32*0+16], 1 + vinserti128 m5, [tmp1q+32*1+16], 1 + vinserti128 m9, m3, xm11, 1 + vperm2i128 m13, m3, m11, 0x31 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp1q+32*2], 1 + vinserti128 m3, [tmp1q+32*3], 1 + vinserti128 m11, m7, xm15, 1 + vperm2i128 m15, m7, m15, 0x31 + mova xm6, [tmp1q-32*2+16] + mova xm7, [tmp1q-32*1+16] + vinserti128 m6, [tmp1q+32*2+16], 1 + vinserti128 m7, [tmp1q+32*3+16], 1 + call .main_oddhalf + LOAD_8ROWS_H r2-32*4, 32 +.idct16: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +cglobal_label .main_oddhalf_fast ; lower half is zero + mova [rsp+gprsize+32*1], m7 + pxor m7, m7 + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m7 + vpbroadcastd m11, [o(pw_3703x8)] + vpbroadcastd m7, [o(pw_1751x8)] + vpbroadcastd m12, [o(pw_m1380x8)] + vpbroadcastd m8, [o(pw_3857x8)] + vpbroadcastd m13, [o(pw_3973x8)] + vpbroadcastd m15, [o(pw_995x8)] + pmulhrsw m11, m4 ; t29a + pmulhrsw m4, m7 ; t18a + pmulhrsw m12, m3 ; t19a + pmulhrsw m3, m8 ; t28a + pmulhrsw m13, m2 ; t27a + pmulhrsw m2, m15 ; t20a + vpbroadcastd m10, [o(pw_m2106x8)] + vpbroadcastd m7, [o(pw_3513x8)] + vpbroadcastd m9, [o(pw_3290x8)] + vpbroadcastd m8, [o(pw_2440x8)] + vpbroadcastd m14, [o(pw_m601x8)] + vpbroadcastd m15, [o(pw_4052x8)] + pmulhrsw m10, m5 ; t21a + pmulhrsw m5, m7 ; t26a + pmulhrsw m9, m6 ; t25a + pmulhrsw m6, m8 ; t22a + pmulhrsw m14, m1 ; t23a + pmulhrsw m1, m15 ; t24a + vpbroadcastd m15, [o(pd_2048)] + jmp .main2 +ALIGN function_align +cglobal_label .main_oddhalf + mova [rsp+gprsize+32*0], m15 + mova [rsp+gprsize+32*1], m7 + mova [rsp+gprsize+32*2], m8 + vpbroadcastd m15, [o(pd_2048)] + ITX_MULSUB_2W 4, 11, 7, 8, 15, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2W 12, 3, 7, 8, 15, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2W 2, 13, 7, 8, 15, 995, 3973 ; t20a, t27a + ITX_MULSUB_2W 10, 5, 7, 8, 15, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2W 6, 9, 7, 8, 15, 2440, 3290 ; t22a, t25a + ITX_MULSUB_2W 14, 1, 7, 8, 15, 4052, 601 ; t23a, t24a +.main2: + psubsw m7, m12, m4 ; t18 + paddsw m12, m4 ; t19 + psubsw m4, m2, m10 ; t21 + paddsw m2, m10 ; t20 + psubsw m10, m14, m6 ; t22 + paddsw m14, m6 ; t23 + psubsw m6, m1, m9 ; t25 + paddsw m1, m9 ; t24 + psubsw m9, m13, m5 ; t26 + paddsw m13, m5 ; t27 + psubsw m5, m3, m11 ; t29 + paddsw m3, m11 ; t28 + ITX_MULSUB_2W 5, 7, 8, 11, 15, m4017, 799 ; t18a, t29a + ITX_MULSUB_2W 9, 4, 8, 11, 15, 3406, 2276 ; t21a, t26a + ITX_MULSUB_2W 6, 10, 8, 11, 15, m2276, 3406 ; t22a, t25a + psubsw m8, m14, m2 ; t20a + paddsw m14, m2 ; t23a + psubsw m2, m1, m13 ; t27a + paddsw m1, m13 ; t24a + psubsw m13, m6, m9 ; t21 + paddsw m6, m9 ; t22 + psubsw m9, m10, m4 ; t26 + paddsw m10, m4 ; t25 + ITX_MULSUB_2W 2, 8, 4, 11, 15, m3784, 1567 ; t20, t27 + ITX_MULSUB_2W 9, 13, 4, 11, 15, m3784, 1567 ; t21a, t26a + mova m4, [rsp+gprsize+32*0] ; in31 + mova [rsp+gprsize+32*0], m6 ; t22 + mova m6, [rsp+gprsize+32*1] ; in15 + mova [rsp+gprsize+32*1], m14 ; t23a + mova m14, [rsp+gprsize+32*2] ; in17 + mova [rsp+gprsize+32*2], m1 ; t24a + ITX_MULSUB_2W 0, 4, 1, 11, 15, 201, 4091 ; t16a, t31a + ITX_MULSUB_2W 14, 6, 1, 11, 15, 3035, 2751 ; t17a, t30a + psubsw m1, m0, m14 ; t17 + paddsw m0, m14 ; t16 + psubsw m14, m4, m6 ; t30 + paddsw m4, m6 ; t31 + ITX_MULSUB_2W 14, 1, 6, 11, 15, 799, 4017 ; t17a, t30a + psubsw m6, m0, m12 ; t19a + paddsw m0, m12 ; t16a + psubsw m12, m4, m3 ; t28a + paddsw m4, m3 ; t31a + psubsw m3, m14, m5 ; t18 + paddsw m14, m5 ; t17 + psubsw m5, m1, m7 ; t29 + paddsw m1, m7 ; t30 + ITX_MULSUB_2W 5, 3, 7, 11, 15, 1567, 3784 ; t18a, t29a + ITX_MULSUB_2W 12, 6, 7, 11, 15, 1567, 3784 ; t19, t28 + psubsw m7, m1, m10 ; t25a + paddsw m1, m10 ; t30a + psubsw m10, m5, m9 ; t21 + paddsw m5, m9 ; t18 + psubsw m9, m12, m2 ; t20a + paddsw m12, m2 ; t19a + psubsw m2, m3, m13 ; t26 + paddsw m3, m13 ; t29 + psubsw m13, m6, m8 ; t27a + paddsw m6, m8 ; t28a + mova [tmp1q-32*2], m5 + mova [tmp1q-32*1], m12 + mova [tmp2q+32*0], m6 + mova [tmp2q+32*1], m3 + mova [tmp2q+32*2], m1 + mova m5, [rsp+gprsize+32*0] ; t22 + mova m6, [rsp+gprsize+32*1] ; t23 + mova m3, [rsp+gprsize+32*2] ; t24a + psubsw m1, m14, m5 ; t22a + paddsw m14, m5 ; t17a + psubsw m5, m0, m6 ; t23 + paddsw m0, m6 ; t16 + psubsw m6, m4, m3 ; t24 + paddsw m4, m3 ; t31 + vpbroadcastd m8, [o(pw_m2896_2896)] + vpbroadcastd m3, [o(pw_2896_2896)] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m14 + mova [tmp2q+32*3], m4 + ITX_MULSUB_2W 13, 9, 0, 4, 15, 3, 8 ; t20, t27 + ITX_MULSUB_2W 2, 10, 0, 4, 15, 3, 8 ; t21a, t26a + ITX_MULSUB_2W 7, 1, 0, 4, 15, 3, 8 ; t22, t25 + ITX_MULSUB_2W 6, 5, 0, 4, 15, 3, 8 ; t23a, t24a + mova [tmp1q+32*0], m13 + mova [tmp1q+32*1], m2 + mova [tmp1q+32*2], m7 + mova [tmp1q+32*3], m6 + mova [tmp2q-32*4], m5 + mova [tmp2q-32*3], m1 + mova [tmp2q-32*2], m10 + mova [tmp2q-32*1], m9 + ret +ALIGN function_align +.transpose_2x8x8_round: + punpckhwd m6, m12, m13 + punpcklwd m12, m13 + punpckhwd m13, m8, m9 + punpcklwd m8, m9 + punpckhwd m9, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m10, m11 + punpcklwd m10, m11 + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5 + punpckhdq m11, m8, m10 + punpckldq m8, m10 + punpckldq m10, m12, m14 + punpckhdq m12, m14 + punpckhdq m14, m13, m15 + punpckldq m13, m15 + punpckldq m15, m6, m9 + punpckhdq m6, m9 + punpckhqdq m9, m8, m10 + punpcklqdq m8, m10 + punpcklqdq m10, m11, m12 + punpckhqdq m11, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m14, m6 + punpcklqdq m14, m6 + pmulhrsw m6, m7, [rsp+gprsize+32*0] + REPX {pmulhrsw x, m7}, m8, m9, m10, m11, m12, m13, m14, m15 + pmulhrsw m7, [rsp+gprsize+32*1] + mova [rsp+gprsize+32*0], m15 + punpckhwd m15, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m15, m1 + punpckhdq m15, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m15 + punpcklqdq m6, m15 + ret +ALIGN function_align +.pass2_end: + mova [rsp+gprsize+32*0], m7 + mova [rsp+gprsize+32*2], m15 + vpbroadcastd m15, [o(pw_2048)] + IDCT32_PASS2_END 0, tmp2q+32*3, 1, 7, 15, strideq*0, r3*4 + IDCT32_PASS2_END 4, tmp2q-32*1, 0, 7, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 8, tmp1q+32*3, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 12, tmp1q-32*1, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m1, [rsp+gprsize+32*1] + IDCT32_PASS2_END 1, tmp2q+32*2, 0, 4, 15, strideq*0, r3*4 + IDCT32_PASS2_END 5, tmp2q-32*2, 0, 4, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 9, tmp1q+32*2, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 13, tmp1q-32*2, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + IDCT32_PASS2_END 2, tmp2q+32*1, 0, 4, 15, strideq*0, r3*4 + IDCT32_PASS2_END 6, tmp2q-32*3, 0, 4, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 10, tmp1q+32*1, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 14, tmp1q-32*3, 0, 4, 15, r3*4, strideq*0 + add dstq, strideq + sub r2, strideq + mova m7, [rsp+gprsize+32*0] + mova m1, [rsp+gprsize+32*2] + IDCT32_PASS2_END 3, tmp2q+32*0, 0, 4, 15, strideq*0, r3*4 + IDCT32_PASS2_END 7, tmp2q-32*4, 0, 4, 15, strideq*4, strideq*8 + IDCT32_PASS2_END 11, tmp1q+32*0, 0, 4, 15, strideq*8, strideq*4 + IDCT32_PASS2_END 1, tmp1q-32*4, 0, 4, 15, r3*4, strideq*0 + ret + +; Perform the final sumsub step and YMM lane shuffling +%macro IDCT32_PASS1_END 4 ; row[1-2], tmp[1-2] + mova m%3, [tmp2q+32*( 3-%1)] + psubsw m%4, m%1, m%3 + paddsw m%1, m%3 + mova m%3, [tmp1q+32*(11-%2)] + mova [tmp1q+32*(11-%2)+16], xm%4 + vextracti128 [tmp2q+32*( 3-%1)+16], m%4, 1 + paddsw m%4, m%2, m%3 + psubsw m%2, m%3 + mova [tmp1q+32*(11-%2)], xm%2 + vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 + vperm2i128 m%2, m%1, m%4, 0x31 + vinserti128 m%1, xm%4, 1 +%endmacro + +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 16 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly +.normal: + PROLOGUE 0, 6, 16, 32*19, dst, stride, c, eob, tmp1, tmp2 + vpbroadcastd m15, [o(pw_2896x8)] + pmulhrsw m0, m15, [cq+32* 1] + pmulhrsw m1, m15, [cq+32* 3] + pmulhrsw m2, m15, [cq+32* 5] + pmulhrsw m3, m15, [cq+32* 7] + pmulhrsw m4, m15, [cq+32* 9] + pmulhrsw m5, m15, [cq+32*11] + pmulhrsw m6, m15, [cq+32*13] + pmulhrsw m7, m15, [cq+32*15] + pmulhrsw m8, m15, [cq+32*17] + pmulhrsw m9, m15, [cq+32*19] + pmulhrsw m10, m15, [cq+32*21] + pmulhrsw m11, m15, [cq+32*23] + pmulhrsw m12, m15, [cq+32*25] + pmulhrsw m13, m15, [cq+32*27] + pmulhrsw m14, m15, [cq+32*29] + pmulhrsw m15, [cq+32*31] + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + LOAD_16ROWS cq+32*0, 32*2, 1, 0 + pxor m15, m15 + mov r3d, 8 +.zero_loop: + mova [cq+32*0], m15 + mova [cq+32*1], m15 + mova [cq+32*2], m15 + mova [cq+32*3], m15 + add cq, 32*4 + dec r3d + jg .zero_loop + call m(idct_16x16_internal_8bpc).main + call .pass1_end + lea r2, [strideq*3] + mov r3, dstq +.pass2: + vpbroadcastd m7, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + call m(idct_16x16_internal_8bpc).main + mova [rsp+32*2], m15 + vpbroadcastd m15, [o(pw_2048)] + REPX {pmulhrsw x, m15}, m2, m3, m0 + WRITE_16X2 2, 3, 1, 2, strideq*2, r2 + pmulhrsw m1, m15, [rsp+32*1] + WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m4, m5, m6, m7 + WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 + WRITE_16X2 6, 7, 2, 3, strideq*2, r2 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m8, m9, m10, m11 + WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 + WRITE_16X2 10, 11, 2, 3, strideq*2, r2 + lea dstq, [dstq+strideq*4] + REPX {pmulhrsw x, m15}, m11, m12, m13, m14 + pmulhrsw m15, [rsp+32*2] + WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 + WRITE_16X2 14, 15, 2, 3, strideq*2, r2 + test r3, r3 + jnz .right_half + RET +.right_half: + LOAD_8ROWS tmp1q-32*4, 32 + LOAD_8ROWS_H tmp2q-32*4, 32 + lea dstq, [r3+16] + xor r3d, r3d + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + jmp .pass2 +ALIGN function_align +.pass1_end: + mova [rsp+gprsize+32*0], m9 + IDCT32_PASS1_END 0, 8, 1, 9 + IDCT32_PASS1_END 2, 10, 1, 9 + IDCT32_PASS1_END 3, 11, 1, 9 + IDCT32_PASS1_END 4, 12, 1, 9 + IDCT32_PASS1_END 5, 13, 1, 9 + IDCT32_PASS1_END 6, 14, 1, 9 + IDCT32_PASS1_END 7, 15, 1, 9 + mova m1, [rsp+gprsize+32*1] + mova m9, [rsp+gprsize+32*0] + mova [rsp+gprsize+32*0], m6 + mova [rsp+gprsize+32*1], m7 + IDCT32_PASS1_END 1, 9, 6, 7 + ret + +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 5, 13, dst, stride, c, eob +%undef cmp + lea r6, [o_base] + vpbroadcastd m9, [o(pw_2896x8)] + vpbroadcastd m10, [o(pw_1697x16)] + vpbroadcastd m12, [o(pw_8192)] + cmp eobd, 43 ; if (eob > 43) + setg r4b ; iteration_count++ + cmp eobd, 150 ; if (eob > 150) + setg al ; iteration_count++ + add eobd, -279 ; if (eob > 278) + adc r4b, al ; iteration_count++ + lea r3, [strideq*3] + mov r6, cq + paddw m11, m12, m12 ; pw_16384 +.loop: + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 + REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + add cq, 16 + dec r4b + jge .loop + sub cq, 32 + pxor m0, m0 + mov r0d, 8 + cmp cq, r6 + ja .zero_loop +.zero_loop_half: + mova [r6+64*0], m0 + mova [r6+64*1], m0 + add r6, 64*4 + mova [r6-64*2], m0 + mova [r6-64*1], m0 + sub r0d, 2 + jg .zero_loop_half + RET +.zero_loop: + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + mova [r6+32*3], m0 + add r6, 32*4 + dec r0d + jg .zero_loop + RET + +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 12, dst, stride, c, eob +%undef cmp + lea r6, [o_base] + vpbroadcastd m9, [o(pw_2896x8)] + vpbroadcastd m10, [o(pw_1697x16)] + vpbroadcastd m11, [o(pw_2048)] + cmp eobd, 35 ; if (eob > 35) + setg r4b ; iteration_count++ + cmp eobd, 150 ; if (eob > 150) + setg r3b ; iteration_count += 2 + lea r4d, [r4+r3*2] + lea r3, [strideq*3] + mov r5, dstq + mov r6, cq +.loop: + mova xm0, [cq+32* 0] + mova xm1, [cq+32* 1] + vinserti128 m0, [cq+32* 8], 1 + vinserti128 m1, [cq+32* 9], 1 + mova xm2, [cq+32* 2] + mova xm3, [cq+32* 3] + vinserti128 m2, [cq+32*10], 1 + vinserti128 m3, [cq+32*11], 1 + mova xm4, [cq+32* 4] + mova xm5, [cq+32* 5] + vinserti128 m4, [cq+32*12], 1 + vinserti128 m5, [cq+32*13], 1 + mova xm6, [cq+32* 6] + mova xm7, [cq+32* 7] + vinserti128 m6, [cq+32*14], 1 + vinserti128 m7, [cq+32*15], 1 + REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + REPX {IDTX16 x, 8, 10}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r3 + lea dstq, [dstq+strideq*4] + add cq, 16 + dec r4b + jl .ret + test r4b, 1 + jz .loop + add cq, 32*15 + lea dstq, [r5+16] + jmp .loop +.ret: + sub cd, eax + pxor m0, m0 + add cd, 384 +.zero_loop: + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + mova [r6+32*3], m0 + add r6, 32*4 + sub cd, 128 + jge .zero_loop + RET + +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly +.normal: + PROLOGUE 0, 9, 16, 32*67, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3, tmp4 + %undef cmp + lea tmp1q, [rsp+32*7] + lea tmp2q, [tmp1q+32*8] + sub eobd, 136 + mov tmp4d, eobd +.pass1_loop: + LOAD_8ROWS cq+64*1, 64*2 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 + test tmp4d, tmp4d + jl .fast + LOAD_8ROWS_H cq+64*17, 64*2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + LOAD_8ROWS_H cq+64*16, 64*2 + pxor m0, m0 + REPX {mova [cq+64*x], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + mova [rsp], m15 + jmp .idct16 +.fast: + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct16: + LOAD_8ROWS cq+64*0, 64*2 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end + vpbroadcastd m7, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + lea tmp3q, [tmp1q+32*32] + mova m15, [rsp] + mova [tmp3q-32*4], m0 + mova [tmp3q-32*3], m2 + mova [tmp3q-32*2], m4 + mova [tmp3q-32*1], m6 + mova [tmp3q+32*0], m8 + mova [tmp3q+32*1], m10 + mova [tmp3q+32*2], m12 + mova [tmp3q+32*3], m14 + add tmp3q, 32*8 + mova [tmp3q-32*4], m1 + mova [tmp3q-32*3], m3 + mova [tmp3q-32*2], m5 + mova [tmp3q-32*1], m7 + mova [tmp3q+32*0], m9 + mova [tmp3q+32*1], m11 + mova [tmp3q+32*2], m13 + mova [tmp3q+32*3], m15 + vpbroadcastd m9, [o(pw_8192)] + pmulhrsw m0, m9, [tmp1q-32*4] + pmulhrsw m1, m9, [tmp1q-32*3] + pmulhrsw m2, m9, [tmp1q-32*2] + pmulhrsw m3, m9, [tmp1q-32*1] + pmulhrsw m4, m9, [tmp1q+32*0] + pmulhrsw m5, m9, [tmp1q+32*1] + pmulhrsw m6, m9, [tmp1q+32*2] + pmulhrsw m7, m9, [tmp1q+32*3] + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova [tmp1q-32*4], m0 + pmulhrsw m0, m9, [tmp2q-32*4] + mova [tmp2q-32*4], m1 + pmulhrsw m1, m9, [tmp2q-32*3] + mova [tmp1q-32*3], m2 + pmulhrsw m2, m9, [tmp2q-32*2] + mova [tmp2q-32*3], m3 + pmulhrsw m3, m9, [tmp2q-32*1] + mova [tmp1q-32*2], m4 + pmulhrsw m4, m9, [tmp2q+32*0] + mova [tmp2q-32*2], m5 + pmulhrsw m5, m9, [tmp2q+32*1] + mova [tmp1q-32*1], m6 + pmulhrsw m6, m9, [tmp2q+32*2] + mova [tmp2q-32*1], m7 + pmulhrsw m7, m9, [tmp2q+32*3] + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add cq, 32 + add tmp1q, 32*16 + add tmp2q, 32*16 + add eobd, 0x80000000 + jnc .pass1_loop + add tmp1q, 32*24 + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + test tmp4d, tmp4d + jge .pass2_loop + add tmp1q, 32*16 + add tmp2q, 32*16 + add tmp3q, 32*16 +.pass2_loop: + LOAD_8ROWS tmp2q-32*4, 32 + test tmp4d, tmp4d + jl .fast2 + LOAD_8ROWS_H tmp3q-32*4, 32 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + sub tmp3q, 32*8 + LOAD_8ROWS_H tmp3q-32*4, 32 + sub tmp3q, 32*16 + jmp .pass2_loop_end +.fast2: + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + sub tmp3q, 32*24 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 +.pass2_loop_end: + LOAD_8ROWS tmp3q-32*4, 32 + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end + lea tmp3q, [tmp1q-32*32] + cmp tmp2q, tmp3q + jb .ret + sub tmp2q, 32*32 + sub dstq, r3 + lea r2, [r2+r3+16] + add dstq, 16 + jmp .pass2_loop +.ret: + RET + +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 10, dst, stride, c, eob + %undef cmp + vpbroadcastd m9, [pw_8192] + sub eobd, 136 ; if (eob < 136) + shr eobd, 30 ; topleft 16x16 only + lea eobd, [eobq*2-8] + lea r4, [strideq*3] + mov r5, dstq + lea r6, [cq+32] +.loop: + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 + WRITE_16X2 2, 3, 0, 1, strideq*2, r4 + lea dstq, [dstq+strideq*4] + WRITE_16X2 4, 5, 0, 1, strideq*0, strideq*1 + WRITE_16X2 6, 7, 0, 1, strideq*2, r4 + lea dstq, [dstq+strideq*4] + add cq, 16 + inc eobd + jz .ret + test eobd, 3 + jnz .loop + add cq, 64*15 + lea dstq, [r5+16] + jmp .loop +.ret: + pxor m0, m0 + mov r0d, 16 + cmp cq, r6 + jne .zero_loop +.zero_loop_topleft: + mova [r6-32*1], m0 + mova [r6+32*1], m0 + mova [r6+32*3], m0 + mova [r6+32*5], m0 + add r6, 64*4 + sub r0d, 4 + jg .zero_loop_topleft + RET +.zero_loop: + mova [r6-32*1], m0 + mova [r6+32*0], m0 + mova [r6+32*1], m0 + mova [r6+32*2], m0 + add r6, 32*4 + dec r0d + jg .zero_loop + RET + +%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4]) +%if %1 & 1 + mova m%5, [tmp2q-32*(51-%1)] ; idct16 out 0+n + mova m%4, [tmp1q-32*(14+%1)] ; idct32 out31-n +%else + mova m%5, [tmp1q-32*(45-%1)] + mova m%4, [tmp2q-32*(20+%1)] +%endif + psubsw m%6, m%5, m%4 ; idct32 out31-n + paddsw m%5, m%4 ; idct32 out 0+n + psubsw m%4, m%6, m%3 ; out32+n + paddsw m%6, m%3 ; out31-n + psubsw m%3, m%5, m%2 ; out63-n + paddsw m%5, m%2 ; out 0+n +%if %0 == 6 ; pass 1 +%if %1 & 1 + mova [tmp2q-32*(19-%1)], m%4 + mova [tmp1q-32*(14+%1)], m%6 + mova [tmp1q+32*(18-%1)], m%3 + mova [tmp2q-32*(51-%1)], m%5 +%else + mova [tmp1q-32*(13-%1)], m%4 + mova [tmp2q-32*(20+%1)], m%6 + mova [tmp2q+32*(12-%1)], m%3 + mova [tmp1q-32*(45-%1)], m%5 +%endif +%else ; pass 2 + REPX {pmulhrsw x, m14}, m%4, m%6, m%3, m%5 +%if %1 & 1 + %define %%d0 r2 + %define %%d1 dstq +%else + %define %%d0 dstq + %define %%d1 r2 +%endif + pmovzxbw m%2, [%%d0+%9 ] + paddw m%2, m%4 + pmovzxbw m%4, [%%d1+%8 ] + paddw m%4, m%6 + pmovzxbw m%6, [%%d1+%10] + paddw m%3, m%6 + pmovzxbw m%6, [%%d0+%7 ] + paddw m%5, m%6 + packuswb m%2, m%4 + packuswb m%3, m%5 + vpermq m%2, m%2, q3120 + vpermq m%3, m%3, q3120 + mova [%%d0+%9 ], xm%2 + vextracti128 [%%d1+%8 ], m%2, 1 + mova [%%d1+%10], xm%3 + vextracti128 [%%d0+%7 ], m%3, 1 +%endif +%endmacro + +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + or r3d, 64 + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +.normal: + PROLOGUE 0, 10, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 + %undef cmp + lea tmp1q, [rsp+32*23] + lea tmp2q, [tmp1q+32*24] + sub eobd, 151 + mov r7d, eobd +.pass1_loop: + LOAD_16ROWS cq, 64 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + mova m15, [rsp+32*0] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m2 + mova [tmp1q-32*2], m4 + mova [tmp1q-32*1], m6 + mova [tmp1q+32*0], m8 + mova [tmp1q+32*1], m10 + mova [tmp1q+32*2], m12 + mova [tmp1q+32*3], m14 + mova [tmp2q-32*4], m1 + mova [tmp2q-32*3], m3 + mova [tmp2q-32*2], m5 + mova [tmp2q-32*1], m7 + mova [tmp2q+32*0], m9 + mova [tmp2q+32*1], m11 + mova [tmp2q+32*2], m13 + mova [tmp2q+32*3], m15 + add cq, 32 + add tmp1q, 32*8 + add tmp2q, 32*8 + add eobd, 0x80000000 + jnc .pass1_loop + lea r2, [rsp+32*23] + mova xm0, [r2-32*4+ 0] + mova xm1, [r2-32*2+ 0] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m1, [r2+32*2+ 0], 1 + mova xm2, [r2-32*4+16] + mova xm3, [r2-32*2+16] + vinserti128 m2, [r2+32*0+16], 1 + vinserti128 m3, [r2+32*2+16], 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + test r7d, r7d + jl .fast + lea r3, [r2+32*8] + mova xm4, [r3-32*4+ 0] + mova xm5, [r3-32*2+ 0] + vinserti128 m4, [r3+32*0+ 0], 1 + vinserti128 m5, [r3+32*2+ 0], 1 + mova xm6, [r3-32*4+16] + mova xm7, [r3-32*2+16] + vinserti128 m6, [r3+32*0+16], 1 + vinserti128 m7, [r3+32*2+16], 1 +.fast: + mova [rsp], m8 + lea tmp1q, [rsp+32*7] + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova xm0, [r2-32*3+ 0] + mova xm1, [r2-32*1+ 0] + vinserti128 m0, [r2+32*1+ 0], 1 + vinserti128 m1, [r2+32*3+ 0], 1 + mova xm2, [r2-32*3+16] + mova xm3, [r2-32*1+16] + vinserti128 m2, [r2+32*1+16], 1 + vinserti128 m3, [r2+32*3+16], 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r7d, r7d + jl .fast2 + mova xm4, [r3-32*3+ 0] + mova xm5, [r3-32*1+ 0] + vinserti128 m4, [r3+32*1+ 0], 1 + vinserti128 m5, [r3+32*3+ 0], 1 + mova xm6, [r3-32*3+16] + mova xm7, [r3-32*1+16] + vinserti128 m6, [r3+32*1+16], 1 + vinserti128 m7, [r3+32*3+16], 1 +.fast2: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + add r2, 32*24 + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova xm0, [r2-32*4+ 0] + mova xm3, [r2-32*1+16] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m3, [r2+32*3+16], 1 + mova xm4, [r2-32*4+16] + mova xm7, [r2-32*1+ 0] + vinserti128 m4, [r2+32*0+16], 1 + vinserti128 m7, [r2+32*3+ 0], 1 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r7d, r7d + jl .fast3 + add r3, 32*24 + mova xm1, [r3-32*1+16] + mova xm2, [r3-32*4+ 0] + vinserti128 m1, [r3+32*3+16], 1 + vinserti128 m2, [r3+32*0+ 0], 1 + mova xm5, [r3-32*1+ 0] + mova xm6, [r3-32*4+16] + vinserti128 m5, [r3+32*3+ 0], 1 + vinserti128 m6, [r3+32*0+16], 1 +.fast3: + add r6, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova xm0, [r2-32*2+ 0] + mova xm3, [r2-32*3+16] + vinserti128 m0, [r2+32*2+ 0], 1 + vinserti128 m3, [r2+32*1+16], 1 + mova xm4, [r2-32*2+16] + mova xm7, [r2-32*3+ 0] + vinserti128 m4, [r2+32*2+16], 1 + vinserti128 m7, [r2+32*1+ 0], 1 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r7d, r7d + jl .fast4 + mova xm1, [r3-32*3+16] + mova xm2, [r3-32*2+ 0] + vinserti128 m1, [r3+32*1+16], 1 + vinserti128 m2, [r3+32*2+ 0], 1 + mova xm5, [r3-32*3+ 0] + mova xm6, [r3-32*2+16] + vinserti128 m5, [r3+32*1+ 0], 1 + vinserti128 m6, [r3+32*2+16], 1 +.fast4: + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 + RET +ALIGN function_align +%define o_base idct64_mul - 8 +cglobal_label .main_part1 + ; idct64 steps 1-5: + ; in1/31/17/15/ 9/23/25/ 7 -> + ; t32a/33/34a/35/36/37a/38/39a/56a/57/58a/59/60/61a/62/63a + ; in5/27/21/11/13/19/29/ 3 -> + ; t40a/41/42a/43/44/45a/46/47a/48a/49/50a/51/52/53a/54/55a + vpbroadcastd m11, [o(idct64_mul+4* 0)] + vpbroadcastd m13, [o(idct64_mul+4* 1)] + vpbroadcastd m10, [o(idct64_mul+4* 4)] + vpbroadcastd m12, [o(idct64_mul+4* 5)] + pmulhrsw m11, m0 ; t63a + pmulhrsw m0, m13 ; t32a + pmulhrsw m10, m1 ; t62a + pmulhrsw m1, m12 ; t33a + vpbroadcastd m9, [o(idct64_mul+4* 8)] + vpbroadcastd m13, [o(idct64_mul+4* 9)] + vpbroadcastd m8, [o(idct64_mul+4*12)] + vpbroadcastd m12, [o(idct64_mul+4*13)] + pmulhrsw m9, m2 ; t61a + pmulhrsw m2, m13 ; t34a + pmulhrsw m8, m3 ; t60a + pmulhrsw m3, m12 ; t35a + psubsw m12, m0, m1 ; t33 + paddsw m0, m1 ; t32 + psubsw m1, m3, m2 ; t34 + paddsw m3, m2 ; t35 + psubsw m2, m8, m9 ; t61 + paddsw m8, m9 ; t60 + psubsw m9, m11, m10 ; t62 + paddsw m11, m10 ; t63 + ITX_MULSUB_2W 2, 1, 10, 13, 15, m4076, 401 ; t34a, t61a + vpbroadcastd m14, [o(pw_401_4076)] + ITX_MULSUB_2W 9, 12, 10, 13, 15, 14, 13 ; t33a, t62a + psubsw m10, m0, m3 ; t35a + paddsw m0, m3 ; t32a + psubsw m3, m11, m8 ; t60a + paddsw m11, m8 ; t63a + psubsw m8, m9, m2 ; t34 + paddsw m9, m2 ; t33 + psubsw m2, m12, m1 ; t61 + paddsw m12, m1 ; t62 + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m9 + mova [tmp2q+32*2], m12 + mova [tmp2q+32*3], m11 + vpbroadcastd m13, [o(pw_m4017_799)] + vpbroadcastd m14, [o(pw_799_4017)] + ITX_MULSUB_2W 2, 8, 0, 1, 15, 14, 13 ; t34a, t61a + ITX_MULSUB_2W 3, 10, 0, 1, 15, 14, 13 ; t35, t60 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp2q+32*0], m10 + mova [tmp2q+32*1], m8 + vpbroadcastd m3, [o(idct64_mul+4*16)] + vpbroadcastd m11, [o(idct64_mul+4*17)] + vpbroadcastd m2, [o(idct64_mul+4*20)] + vpbroadcastd m10, [o(idct64_mul+4*21)] + vpbroadcastd m1, [o(idct64_mul+4*24)] + vpbroadcastd m9, [o(idct64_mul+4*25)] + vpbroadcastd m0, [o(idct64_mul+4*28)] + vpbroadcastd m8, [o(idct64_mul+4*29)] + pmulhrsw m3, m4 ; t59a + pmulhrsw m4, m11 ; t36a + pmulhrsw m2, m5 ; t58a + pmulhrsw m5, m10 ; t37a + pmulhrsw m1, m6 ; t57a + pmulhrsw m6, m9 ; t38a + pmulhrsw m0, m7 ; t56a + pmulhrsw m7, m8 ; t39a + psubsw m8, m4, m5 ; t37 + paddsw m4, m5 ; t36 + psubsw m5, m7, m6 ; t38 + paddsw m7, m6 ; t39 + psubsw m6, m0, m1 ; t57 + paddsw m0, m1 ; t56 + psubsw m1, m3, m2 ; t58 + paddsw m3, m2 ; t59 + ITX_MULSUB_2W 6, 5, 2, 9, 15, m2598, 3166 ; t38a, t57a + vpbroadcastd m10, [o(pw_3166_2598)] + ITX_MULSUB_2W 1, 8, 2, 9, 15, 10, 9 ; t37a, t58a + psubsw m2, m7, m4 ; t36a + paddsw m7, m4 ; t39a + psubsw m4, m0, m3 ; t59a + paddsw m0, m3 ; t56a + psubsw m3, m6, m1 ; t37 + paddsw m6, m1 ; t38 + psubsw m1, m5, m8 ; t58 + paddsw m5, m8 ; t57 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + mova [tmp2q-32*4], m0 + mova [tmp2q-32*3], m5 + vpbroadcastd m6, [o(pw_m799_m4017)] + vpbroadcastd m7, [o(pw_m4017_799)] + ITX_MULSUB_2W 4, 2, 0, 5, 15, 7, 6 ; t36, t59 + ITX_MULSUB_2W 1, 3, 0, 5, 15, 7, 6 ; t37a, t58a + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m1 + mova [tmp2q-32*2], m3 + mova [tmp2q-32*1], m2 + ret +%define o_base pw_5 + 128 +.main_part2_pass1: ; idct64 steps 6-9 + idct16/32/64 sumsub + sub r6, o_idct64_offset + 8 + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + vpbroadcastd m13, [o(pw_2896_2896)] + vpbroadcastd m14, [o(pw_m2896_2896)] +.main_part2_pass1_loop: + call .main_part2_internal + IDCT64_PART2_END 0, 7, 0, 6, 9, 10 + IDCT64_PART2_END 7, 8, 5, 0, 6, 7 + IDCT64_PART2_END 8, 2, 1, 0, 6, 7 + IDCT64_PART2_END 15, 3, 4, 0, 6, 7 + cmp tmp1q, tmp2q + jne .main_part2_pass1_loop + ret +cglobal_label .main_part2_internal + mova m0, [tmp1q-32*12] ; t32a + mova m6, [tmp2q-32*13] ; t39a + mova m1, [tmp1q-32* 4] ; t40a + mova m5, [tmp2q+32* 3] ; t55a + add tmp1q, 32 + sub tmp2q, 32 + mova m2, [tmp1q+32* 3] ; t48a + mova m4, [tmp2q-32* 4] ; t47a + mova m3, [tmp1q+32*11] ; t56a + mova m7, [tmp2q+32*12] ; t63a + psubsw m8, m0, m6 ; t39 + paddsw m0, m6 ; t32 + psubsw m6, m4, m1 ; t40 + paddsw m4, m1 ; t47 + psubsw m1, m2, m5 ; t55 + paddsw m2, m5 ; t48 + psubsw m5, m7, m3 ; t56 + paddsw m7, m3 ; t63 + ITX_MULSUB_2W 5, 8, 3, 9, 15, 11, 12 ; t39a, t56a + vpbroadcastd m9, [o(pw_m1567_m3784)] + ITX_MULSUB_2W 1, 6, 3, 9, 15, 12, 9 ; t40a, t55a + psubsw m3, m0, m4 ; t47a + paddsw m0, m4 ; t32a + psubsw m4, m7, m2 ; t48a + paddsw m7, m2 ; t63a + psubsw m2, m5, m1 ; t40 + paddsw m5, m1 ; t39 + psubsw m1, m8, m6 ; t55 + paddsw m8, m6 ; t56 + ITX_MULSUB_2W 4, 3, 6, 9, 15, 13, 14 ; t47, t48 + ITX_MULSUB_2W 1, 2, 6, 9, 15, 13, 14 ; t40a, t55a + ret +.main_part2_pass2: + sub r6, o_idct64_offset + 8 + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + vpbroadcastd m13, [o(pw_2896_2896)] + lea r9, [strideq*5] ; stride*5 + lea r3, [r9+strideq*1] ; stride*6 + lea r7, [r9+strideq*2] ; stride*7 + lea r8, [r3+strideq*2] ; stride*8 + lea r2, [dstq+r7] +.main_part2_pass2_loop: + vpbroadcastd m14, [o(pw_m2896_2896)] + call .main_part2_internal + vpbroadcastd m14, [o(pw_2048)] + IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*4, r7*8 + IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*4, r7*8 + IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 + IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*2, r9*8, r3*8 + add dstq, strideq + sub r2, strideq + cmp tmp1q, tmp2q + jne .main_part2_pass2_loop + ret + +cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + or r3d, 16 +.dconly: + pmulhrsw xm0, xm2 + movd xm2, [o(pw_2048)] + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + pxor m1, m1 +.dconly_loop: + mova m2, [dstq+32*0] + mova m3, [dstq+32*1] + punpckhbw m4, m2, m1 + punpcklbw m2, m1 + punpckhbw m5, m3, m1 + punpcklbw m3, m1 + paddw m4, m0 + paddw m2, m0 + paddw m5, m0 + paddw m3, m0 + packuswb m2, m4 + packuswb m3, m5 + mova [dstq+32*0], m2 + mova [dstq+32*1], m3 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + PROLOGUE 0, 7, 16, 32*67, dst, stride, c, eob, tmp1, tmp2 + LOAD_8ROWS cq+32*0, 32*4 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + lea tmp1q, [rsp+32*7] + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+32*2, 32*4 + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [cq+32* 1] + mova m1, [cq+32*31] + mova m2, [cq+32*17] + mova m3, [cq+32*15] + mova m4, [cq+32* 9] + mova m5, [cq+32*23] + mova m6, [cq+32*25] + mova m7, [cq+32* 7] + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add r6, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [cq+32* 5] + mova m1, [cq+32*27] + mova m2, [cq+32*21] + mova m3, [cq+32*11] + mova m4, [cq+32*13] + mova m5, [cq+32*19] + mova m6, [cq+32*29] + mova m7, [cq+32* 3] + pxor m8, m8 + REPX {mova [cq+32*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 + sub tmp1q, 32*36 + lea r2, [strideq*3] + mov tmp2d, 4 +.pass2_loop: + lea r3, [tmp1q-32*8] + mova xm0, [r3 -32*4] + mova xm1, [r3 -32*3] + vinserti128 m0, [tmp1q-32*4], 1 + vinserti128 m1, [tmp1q-32*3], 1 + mova xm2, [r3 -32*2] + mova xm3, [r3 -32*1] + vinserti128 m2, [tmp1q-32*2], 1 + vinserti128 m3, [tmp1q-32*1], 1 + mova xm4, [r3 +32*0] + mova xm5, [r3 +32*1] + vinserti128 m4, [tmp1q+32*0], 1 + vinserti128 m5, [tmp1q+32*1], 1 + mova xm6, [r3 +32*2] + mova xm7, [r3 +32*3] + vinserti128 m6, [tmp1q+32*2], 1 + vinserti128 m7, [tmp1q+32*3], 1 + mova xm8, [r3 -32*4+16] + mova xm9, [r3 -32*3+16] + vinserti128 m8, [tmp1q-32*4+16], 1 + vinserti128 m9, [tmp1q-32*3+16], 1 + mova xm10, [r3 -32*2+16] + mova xm11, [r3 -32*1+16] + vinserti128 m10, [tmp1q-32*2+16], 1 + vinserti128 m11, [tmp1q-32*1+16], 1 + mova xm12, [r3 +32*0+16] + mova xm13, [r3 +32*1+16] + vinserti128 m12, [tmp1q+32*0+16], 1 + vinserti128 m13, [tmp1q+32*1+16], 1 + mova xm14, [r3 +32*2+16] + mova xm15, [r3 +32*3+16] + vinserti128 m14, [tmp1q+32*2+16], 1 + vinserti128 m15, [tmp1q+32*3+16], 1 + mova [rsp+32*0], m6 + mova [rsp+32*1], m7 + vpbroadcastd m7, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + call m(idct_16x16_internal_8bpc).main + mova [rsp+32*0], m15 + vpbroadcastd m15, [o(pw_2048)] + REPX {pmulhrsw x, m15}, m0, m2, m3, m4, m5, m6, m7 + WRITE_16X2 2, 3, 1, 2, strideq*2, r2 + pmulhrsw m1, m15, [rsp+32*1] + WRITE_16X2 0, 1, 2, 3, strideq*0, strideq*1 + lea r3, [dstq+strideq*4] + %define dstq r3 + WRITE_16X2 4, 5, 2, 3, strideq*0, strideq*1 + WRITE_16X2 6, 7, 2, 3, strideq*2, r2 + REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14 + lea r3, [r3+strideq*4] + WRITE_16X2 8, 9, 2, 3, strideq*0, strideq*1 + WRITE_16X2 10, 11, 2, 3, strideq*2, r2 + pmulhrsw m15, [rsp+32*0] + lea r3, [r3+strideq*4] + WRITE_16X2 12, 13, 2, 3, strideq*0, strideq*1 + WRITE_16X2 14, 15, 2, 3, strideq*2, r2 + add tmp1q, 32*16 + add r0, 16 + dec tmp2d + jg .pass2_loop + RET + +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 64 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly +.normal: + PROLOGUE 0, 11, 16, 32*99, dst, stride, c, eob, tmp1, tmp2 + lea tmp1q, [rsp+32*7] + lea r10d, [eobq-136] + sar r10d, 31 +.pass1_loop: + lea tmp2q, [tmp1q+32*16] + LOAD_8ROWS cq+64*1, 64*2, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 3, 5, 7, 9, 11, 13, 15 + test r10b, r10b + jnz .fast + LOAD_8ROWS_H cq+64*17, 64*2, 2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + LOAD_8ROWS_H cq+64*16, 64*2, 1 + mova [rsp], m15 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + jmp .idct16 +.fast: + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct16: + LOAD_8ROWS cq+64*0, 64*2, 1 + pxor m15, m15 + REPX {mova [cq+64*x], m15}, 0, 2, 4, 6, 8, 10, 12, 14 + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_32x16_8bpc).pass1_end + vpbroadcastd m7, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round + lea r3, [tmp1q+32*48] + mova m15, [rsp] + mova [r3-32*4], m0 + mova [r3-32*3], m2 + mova [r3-32*2], m4 + mova [r3-32*1], m6 + mova [r3+32*0], m8 + mova [r3+32*1], m10 + mova [r3+32*2], m12 + mova [r3+32*3], m14 + add r3, 32*24 + mova [r3-32*4], m1 + mova [r3-32*3], m3 + mova [r3-32*2], m5 + mova [r3-32*1], m7 + mova [r3+32*0], m9 + mova [r3+32*1], m11 + mova [r3+32*2], m13 + mova [r3+32*3], m15 + vpbroadcastd m9, [o(pw_16384)] + pmulhrsw m0, m9, [tmp1q-32*4] + pmulhrsw m1, m9, [tmp1q-32*3] + pmulhrsw m2, m9, [tmp1q-32*2] + pmulhrsw m3, m9, [tmp1q-32*1] + pmulhrsw m4, m9, [tmp1q+32*0] + pmulhrsw m5, m9, [tmp1q+32*1] + pmulhrsw m6, m9, [tmp1q+32*2] + pmulhrsw m7, m9, [tmp1q+32*3] + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova [tmp1q-32*4], m0 + pmulhrsw m0, m9, [tmp2q-32*4] + mova [tmp2q-32*4], m1 + pmulhrsw m1, m9, [tmp2q-32*3] + mova [tmp1q-32*3], m2 + pmulhrsw m2, m9, [tmp2q-32*2] + mova [tmp2q-32*3], m3 + pmulhrsw m3, m9, [tmp2q-32*1] + mova [tmp1q-32*2], m4 + pmulhrsw m4, m9, [tmp2q+32*0] + mova [tmp2q-32*2], m5 + pmulhrsw m5, m9, [tmp2q+32*1] + mova [tmp1q-32*1], m6 + pmulhrsw m6, m9, [tmp2q+32*2] + mova [tmp2q-32*1], m7 + pmulhrsw m7, m9, [tmp2q+32*3] + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add cq, 32 + add tmp1q, 32*8 + add r10d, 0x80000000 + jnc .pass1_loop + lea r2, [rsp+32*55] + lea r7, [r2+32*24] +.pass2_loop: + lea r3, [r2+32*8] + lea r8, [r7+32*8] + mova m0, [r2-32*4] + mova m1, [r2-32*2] + mova m2, [r2+32*0] + mova m3, [r2+32*2] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + test r10b, r10b + jnz .fast2 + mova m4, [r3-32*4] + mova m5, [r3-32*2] + mova m6, [r3+32*0] + mova m7, [r3+32*2] +.fast2: + mova [rsp], m8 + lea tmp1q, [rsp+32*39] + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova m0, [r2-32*3] + mova m1, [r2-32*1] + mova m2, [r2+32*1] + mova m3, [r2+32*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r10b, r10b + jnz .fast3 + mova m4, [r3-32*3] + mova m5, [r3-32*1] + mova m6, [r3+32*1] + mova m7, [r3+32*3] +.fast3: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [r7-32*4] + mova m3, [r7+32*3] + mova m4, [r7+32*0] + mova m7, [r7-32*1] + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10b, r10b + jnz .fast4 + mova m1, [r8+32*3] + mova m2, [r8-32*4] + mova m5, [r8-32*1] + mova m6, [r8+32*0] +.fast4: + add r6, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [r7-32*2] + mova m3, [r7+32*1] + mova m4, [r7+32*2] + mova m7, [r7-32*3] + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10b, r10b + jnz .fast5 + mova m1, [r8+32*1] + mova m2, [r8-32*2] + mova m5, [r8-32*3] + mova m6, [r8+32*2] +.fast5: + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 + add r10d, 0x80000000 + jc .ret + lea r2, [rsp+32*7] + lea r7, [r2+32*16] + sub dstq, r8 + lea dstq, [dstq+strideq*4+16] + jmp .pass2_loop +.ret: + RET + +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_16384)] + mov [cq], eobd + pmulhrsw xm0, xm1 + or r3d, 32 + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly +.normal: + PROLOGUE 0, 9, 16, 32*131, dst, stride, c, eob, tmp1, tmp2, \ + base, tmp3, tmp4 + lea tmp1q, [rsp+32*7] + lea tmp4d, [eobq-136] +.pass1_loop: + LOAD_8ROWS cq+64*0, 64*4, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+64*2, 64*4, 1 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + vpbroadcastd m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [cq+64* 1] + pmulhrsw m1, m7, [cq+64*31] + pmulhrsw m2, m7, [cq+64*17] + pmulhrsw m3, m7, [cq+64*15] + pmulhrsw m4, m7, [cq+64* 9] + pmulhrsw m5, m7, [cq+64*23] + pmulhrsw m6, m7, [cq+64*25] + pmulhrsw m7, [cq+64* 7] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add r6, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + vpbroadcastd m7, [o(pw_2896x8-(o_idct64_offset))] + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + pmulhrsw m0, m7, [cq+64* 5] + pmulhrsw m1, m7, [cq+64*27] + pmulhrsw m2, m7, [cq+64*21] + pmulhrsw m3, m7, [cq+64*11] + pmulhrsw m4, m7, [cq+64*13] + pmulhrsw m5, m7, [cq+64*19] + pmulhrsw m6, m7, [cq+64*29] + pmulhrsw m7, [cq+64* 3] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 + sub tmp1q, 32*44 + vpbroadcastd m10, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave + add cq, 32 + add tmp4d, 0x80000000 + jnc .pass1_loop + lea tmp1q, [rsp+32*15] + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + mov tmp4b, 4 +.pass2_loop: + lea tmp2q, [tmp1q+32*64] + LOAD_8ROWS tmp1q-32*4, 32 + test tmp4d, 0x40000000 + jnz .fast + LOAD_8ROWS_H tmp2q-32*4, 32 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + lea tmp3q, [tmp2q-32*8] + LOAD_8ROWS_H tmp3q-32*4, 32 + mova [rsp], m15 + jmp .idct16 +.fast: + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 +.idct16: + lea tmp3q, [tmp1q-32*8] + LOAD_8ROWS tmp3q-32*4, 32 + call m(idct_16x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_16x32_8bpc).pass2_end + add tmp1q, 32*16 + sub dstq, r3 + lea r2, [r2+r3+16] + add dstq, 16 + dec tmp4b + jg .pass2_loop + RET +ALIGN function_align +.transpose_round_interleave: + mov tmp3d, 4 +.loop: + lea tmp2q, [tmp1q+32*8] + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp2q-32*4], 1 + vinserti128 m1, [tmp2q-32*3], 1 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp2q-32*2], 1 + vinserti128 m3, [tmp2q-32*1], 1 + mova xm4, [tmp1q+32*0] + mova xm5, [tmp1q+32*1] + vinserti128 m4, [tmp2q+32*0], 1 + vinserti128 m5, [tmp2q+32*1], 1 + mova xm6, [tmp1q+32*2] + mova xm7, [tmp1q+32*3] + vinserti128 m6, [tmp2q+32*2], 1 + vinserti128 m7, [tmp2q+32*3], 1 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova xm8, [tmp1q-32*4+16] + mova xm9, [tmp1q-32*3+16] + vinserti128 m8, [tmp2q-32*4+16], 1 + vinserti128 m9, [tmp2q-32*3+16], 1 + mova [tmp1q-32*4], m0 + mova [tmp2q-32*4], m1 + mova [tmp1q-32*3], m2 + mova [tmp2q-32*3], m3 + mova xm2, [tmp1q-32*2+16] + mova xm3, [tmp1q-32*1+16] + vinserti128 m2, [tmp2q-32*2+16], 1 + vinserti128 m3, [tmp2q-32*1+16], 1 + mova [tmp1q-32*2], m4 + mova [tmp2q-32*2], m5 + mova [tmp1q-32*1], m6 + mova [tmp2q-32*1], m7 + mova xm4, [tmp1q+32*0+16] + mova xm5, [tmp1q+32*1+16] + vinserti128 m4, [tmp2q+32*0+16], 1 + vinserti128 m5, [tmp2q+32*1+16], 1 + mova xm6, [tmp1q+32*2+16] + mova xm7, [tmp1q+32*3+16] + vinserti128 m6, [tmp2q+32*2+16], 1 + vinserti128 m7, [tmp2q+32*3+16], 1 + pmulhrsw m0, m8, m10 + pmulhrsw m1, m9, m10 + REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_identity_identity_8x32_8bpc).transpose8x8 + mova [tmp1q+32*0], m0 + mova [tmp2q+32*0], m1 + mova [tmp1q+32*1], m2 + mova [tmp2q+32*1], m3 + mova [tmp1q+32*2], m4 + mova [tmp2q+32*2], m5 + mova [tmp1q+32*3], m6 + mova [tmp2q+32*3], m7 + add tmp1q, 32*16 + dec tmp3d + jg .loop + ret + +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 4, 0, dst, stride, c, eob + lea r6, [o_base] + test eobd, eobd + jnz .normal + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_8192)] + mov [cq], eobd + or r3d, 64 + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly +.normal: + PROLOGUE 0, 11, 16, 32*199, dst, stride, c, eob, tmp1, tmp2 + lea tmp1q, [rsp+32*71] + lea r10d, [eobq-136] +.pass1_loop: + LOAD_8ROWS cq+64*0, 64*4 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 0, 4, 8, 12, 16, 20, 24, 28 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14 + mova [rsp], m8 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + LOAD_8ROWS cq+64*2, 64*4 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 2, 6, 10, 14, 18, 22, 26, 30 + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + mova m4, [cq+64* 9] + mova m5, [cq+64*23] + mova m6, [cq+64*25] + mova m7, [cq+64* 7] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 1, 31, 17, 15, 9, 23, 25, 7 + add r6, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + mova m4, [cq+64*13] + mova m5, [cq+64*19] + mova m6, [cq+64*29] + mova m7, [cq+64* 3] + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 5, 27, 21, 11, 13, 19, 29, 3 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass1 + sub tmp1q, 32*44 + vpbroadcastd m10, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_round_interleave + add cq, 32 + add r10d, 0x80000000 + jnc .pass1_loop + lea tmp1q, [rsp+32*7] + mov r10b, 4 +.pass2_loop: + lea r2, [tmp1q+32*64] + mova m0, [r2-32*4] + mova m1, [r2-32*2] + mova m2, [r2+32*0] + mova m3, [r2+32*2] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 + mova [rsp], m4 + test r10d, 0x40000000 + jnz .fast + lea r3, [r2+32*64] + mova m4, [r3-32*4] + mova m5, [r3-32*2] + mova m6, [r3+32*0] + mova m7, [r3+32*2] +.fast: + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] + mova [tmp1q-32*4], m0 + mova [tmp1q-32*3], m1 + mova [tmp1q-32*2], m2 + mova [tmp1q-32*1], m3 + mova [tmp1q+32*0], m4 + mova [tmp1q+32*1], m5 + mova [tmp1q+32*2], m6 + mova [tmp1q+32*3], m7 + add tmp1q, 32*8 + mova [tmp1q-32*4], m8 + mova [tmp1q-32*3], m9 + mova [tmp1q-32*2], m10 + mova [tmp1q-32*1], m11 + mova [tmp1q+32*0], m12 + mova [tmp1q+32*1], m13 + mova [tmp1q+32*2], m14 + mova [tmp1q+32*3], m15 + mova m0, [r2-32*3] + mova m1, [r2-32*1] + mova m2, [r2+32*1] + mova m3, [r2+32*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + test r10d, 0x40000000 + jnz .fast2 + mova m4, [r3-32*3] + mova m5, [r3-32*1] + mova m6, [r3+32*1] + mova m7, [r3+32*3] +.fast2: + add tmp1q, 32*8 + lea tmp2q, [tmp1q+32*8] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m15, [o(pd_2048)] + add r2, 32*8 + add r3, 32*8 + add tmp1q, 32*16 + add tmp2q, 32*32 + mova m0, [r2-32*4] ; 1 + mova m3, [r2+32*3] ; 15 + mova m4, [r2+32*0] ; 9 + mova m7, [r2-32*1] ; 7 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10d, 0x40000000 + jnz .fast3 + mova m1, [r3+32*3] ; 31 + mova m2, [r3-32*4] ; 17 + mova m5, [r3-32*1] ; 23 + mova m6, [r3+32*0] ; 25 +.fast3: + add r6, o_idct64_offset + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + add r6, 8 + add tmp1q, 32*8 + sub tmp2q, 32*8 + mova m0, [r2-32*2] ; 5 + mova m3, [r2+32*1] ; 11 + mova m4, [r2+32*2] ; 13 + mova m7, [r2-32*3] ; 3 + pxor m1, m1 + REPX {mova x, m1}, m2, m5, m6 + test r10d, 0x40000000 + jnz .fast4 + mova m1, [r3+32*1] ; 27 + mova m2, [r3-32*2] ; 21 + mova m5, [r3-32*3] ; 19 + mova m6, [r3+32*2] ; 29 +.fast4: + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_pass2 + sub tmp1q, 32*28 + sub dstq, r8 + lea dstq, [dstq+strideq*4+16] + dec r10b + jg .pass2_loop + RET + +%endif ; ARCH_X86_64 |