diff options
Diffstat (limited to 'third_party/dav1d/src/x86/itx_sse.asm')
-rw-r--r-- | third_party/dav1d/src/x86/itx_sse.asm | 6533 |
1 files changed, 6533 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/itx_sse.asm b/third_party/dav1d/src/x86/itx_sse.asm new file mode 100644 index 0000000000..ec7e3a52f4 --- /dev/null +++ b/third_party/dav1d/src/x86/itx_sse.asm @@ -0,0 +1,6533 @@ +; Copyright © 2018-2021, VideoLAN and dav1d authors +; Copyright © 2018, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + + +SECTION_RODATA 16 + +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 + +deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 + +%macro COEF_PAIR 2-3 0 ; !0 = m%1_m%2, 2 = no %2_%1 +pw_%1_m%2: times 4 dw %1, -%2 +%if %3 != 2 +pw_%2_%1: times 4 dw %2, %1 +%endif +%if %3 +pw_m%1_m%2: times 4 dw -%1, -%2 +%endif +%endmacro + +;adst4 +pw_1321_3803: times 4 dw 1321, 3803 +pw_2482_m1321: times 4 dw 2482, -1321 +pw_3344_2482: times 4 dw 3344, 2482 +pw_3344_m3803: times 4 dw 3344, -3803 +pw_3344_m3344: times 4 dw 3344, -3344 +pw_0_3344 times 4 dw 0, 3344 +pw_m6688_m3803: times 4 dw -6688, -3803 + +COEF_PAIR 2896, 2896 +COEF_PAIR 1567, 3784 +COEF_PAIR 799, 4017 +COEF_PAIR 3406, 2276 +COEF_PAIR 401, 4076 +COEF_PAIR 1931, 3612 +COEF_PAIR 3166, 2598 +COEF_PAIR 3920, 1189 +COEF_PAIR 3784, 1567, 1 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 3513, 2106 +COEF_PAIR 3857, 1380 +COEF_PAIR 4017, 799, 1 +COEF_PAIR 201, 4091 +COEF_PAIR 2440, 3290 +COEF_PAIR 3035, 2751 +COEF_PAIR 4052, 601 +COEF_PAIR 2276, 3406, 1 +COEF_PAIR 4076, 401, 2 +COEF_PAIR 2598, 3166, 2 +COEF_PAIR 3612, 1931, 2 +COEF_PAIR 1189, 3920, 2 + +pd_2048: times 4 dd 2048 +pw_2048: times 8 dw 2048 +pw_m2048: times 8 dw -2048 +pw_4096: times 8 dw 4096 +pw_16384: times 8 dw 16384 +pw_m16384: times 8 dw -16384 +pw_1697x16: times 8 dw 1697*16 +pw_1697x8: times 8 dw 1697*8 +pw_2896x8: times 8 dw 2896*8 +pw_3344x8: times 8 dw 3344*8 +pw_8192: times 8 dw 8192 +pw_m8192: times 8 dw -8192 +pw_5: times 8 dw 5 +pw_201x8: times 8 dw 201*8 +pw_4091x8: times 8 dw 4091*8 +pw_m2751x8: times 8 dw -2751*8 +pw_3035x8: times 8 dw 3035*8 +pw_1751x8: times 8 dw 1751*8 +pw_3703x8: times 8 dw 3703*8 +pw_m1380x8: times 8 dw -1380*8 +pw_3857x8: times 8 dw 3857*8 +pw_995x8: times 8 dw 995*8 +pw_3973x8: times 8 dw 3973*8 +pw_m2106x8: times 8 dw -2106*8 +pw_3513x8: times 8 dw 3513*8 +pw_2440x8: times 8 dw 2440*8 +pw_3290x8: times 8 dw 3290*8 +pw_m601x8: times 8 dw -601*8 +pw_4052x8: times 8 dw 4052*8 + +pw_4095x8: times 8 dw 4095*8 +pw_101x8: times 8 dw 101*8 +pw_2967x8: times 8 dw 2967*8 +pw_m2824x8: times 8 dw -2824*8 +pw_3745x8: times 8 dw 3745*8 +pw_1660x8: times 8 dw 1660*8 +pw_3822x8: times 8 dw 3822*8 +pw_m1474x8: times 8 dw -1474*8 +pw_3996x8: times 8 dw 3996*8 +pw_897x8: times 8 dw 897*8 +pw_3461x8: times 8 dw 3461*8 +pw_m2191x8: times 8 dw -2191*8 +pw_3349x8: times 8 dw 3349*8 +pw_2359x8: times 8 dw 2359*8 +pw_4036x8: times 8 dw 4036*8 +pw_m700x8: times 8 dw -700*8 +pw_4065x8: times 8 dw 4065*8 +pw_501x8: times 8 dw 501*8 +pw_3229x8: times 8 dw 3229*8 +pw_m2520x8: times 8 dw -2520*8 +pw_3564x8: times 8 dw 3564*8 +pw_2019x8: times 8 dw 2019*8 +pw_3948x8: times 8 dw 3948*8 +pw_m1092x8: times 8 dw -1092*8 +pw_3889x8: times 8 dw 3889*8 +pw_1285x8: times 8 dw 1285*8 +pw_3659x8: times 8 dw 3659*8 +pw_m1842x8: times 8 dw -1842*8 +pw_3102x8: times 8 dw 3102*8 +pw_2675x8: times 8 dw 2675*8 +pw_4085x8: times 8 dw 4085*8 +pw_m301x8: times 8 dw -301*8 + +SECTION .text + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%if ARCH_X86_64 +%define o(x) x +%else +%define o(x) r5-$$+x ; PIC +%endif + +%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4] + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rotate 5 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + + movd m%3, [%%row_adr1] ;dst0 + movd m%5, [%%row_adr2] ;dst1 + punpckldq m%3, m%5 ;high: dst1 :low: dst0 + movd m%4, [%%row_adr3] ;dst2 + movd m%5, [%%row_adr4] ;dst3 + punpckldq m%4, m%5 ;high: dst3 :low: dst2 + + pxor m%5, m%5 + punpcklbw m%3, m%5 ;extend byte to word + punpcklbw m%4, m%5 ;extend byte to word + + paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0 + paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2 + + packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 + + movd [%%row_adr1], m%3 ;store dst0 + out0 + pshuflw m%4, m%3, q1032 + movd [%%row_adr2], m%4 ;store dst1 + out1 + punpckhqdq m%3, m%3 + movd [%%row_adr3], m%3 ;store dst2 + out2 + psrlq m%3, 32 + movd [%%row_adr4], m%3 ;store dst3 + out3 +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + mova m2, [o(pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + + WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4 + ret +%endmacro + +; flags: 1 = swap, 2: coef_regs, 4: no_pack +%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags +%if %6 & 2 + pmaddwd m%2, m%4, m%1 + pmaddwd m%1, m%5 +%elif %6 & 1 + pmaddwd m%2, m%1, [o(pw_%5_%4)] + pmaddwd m%1, [o(pw_%4_m%5)] +%else + pmaddwd m%2, m%1, [o(pw_%4_m%5)] + pmaddwd m%1, [o(pw_%5_%4)] +%endif + paddd m%2, m%3 + paddd m%1, m%3 + psrad m%2, 12 + psrad m%1, 12 +%if %6 & 4 == 0 + packssdw m%1, m%2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8 + mova m3, [o(pd_2048)] + punpckhwd m2, m0, m1 ;unpacked in1 in3 + punpcklwd m0, m1 ;unpacked in0 in2 + ITX_MUL2X_PACK 2, 1, 3, 1567, 3784 + ITX_MUL2X_PACK 0, 1, 3, 2896, 2896 + psubsw m1, m0, m2 ;high: out2 ;low: out3 + paddsw m0, m2 ;high: out1 ;low: out0 +%endmacro + +%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2 + %define %%p1 m(i%1_%3_internal_8bpc) +%if ARCH_X86_32 + LEA r5, $$ +%endif +%if has_epilogue +%ifidn %1_%2, dct_dct + test eobd, eobd + jz %%end +%endif + lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] + call %%p1 + RET +%%end: +%else + lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4, 6 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd ;0 + pmulhrsw m0, m1 + mova m1, m0 + TAIL_CALL m(iadst_4x4_internal_8bpc).end2 +%endif +%endmacro + +INIT_XMM ssse3 +; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16. + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity + +cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] ;high: in1 ;low: in0 + mova m1, [coeffq+16*1] ;high: in3 ;low in2 + + IDCT4_1D_PACKED + + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m1, q0220 + pshufb m0, m2 ;high: in1 ;low: in0 + pshufb m1, m3, m2 ;high: in3 ;low :in2 + jmp tx2q + +.pass2: + IDCT4_1D_PACKED + + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw); + + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + call .main + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 ;high: in3 ;low :in2 + punpcklwd m0, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + call .main + +.end: + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + +.end2: + ITX4_END 0, 1, 2, 3 + +ALIGN function_align +cglobal_label .main + punpcklwd m2, m0, m1 ;unpacked in0 in2 + punpckhwd m0, m1 ;unpacked in1 in3 + mova m3, m0 + pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2 + pmaddwd m0, [o(pw_0_3344)] ;3344 * in3 + paddd m1, m0 ;t2 + pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 + paddd m4, m0 ;t0 + t3 + pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + mova m0, [o(pd_2048)] + paddd m1, m0 ;t2 + 2048 + paddd m2, m0 + paddd m0, m4 ;t0 + t3 + 2048 + paddd m5, m2 ;t1 + t3 + 2048 + paddd m2, m4 + paddd m2, m3 ;t0 + t1 - t3 + 2048 + REPX {psrad x, 12}, m1, m0, m5, m2 + packssdw m0, m5 ;high: out1 ;low: out0 + packssdw m1, m2 ;high: out3 ;low: out3 + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + call m(iadst_4x4_internal_8bpc).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 ;high: in3 ;low :in2 + punpckhwd m1, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + call m(iadst_4x4_internal_8bpc).main + +.end: + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + mova m3, [o(pw_1697x8)] + pmulhrsw m2, m0, m3 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 ;high: in3 ;low :in2 + punpcklwd m0, m2 ;high: in1 ;low: in0 + jmp tx2q + +.pass2: + mova m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8bpc).end + +%macro IWHT4_1D_PACKED 0 + punpckhqdq m3, m0, m1 ;low: in1 high: in3 + punpcklqdq m0, m1 ;low: in0 high: in2 + psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3 + paddw m0, m3 ;low: in0 + in1 high: in2 + in3 + punpckhqdq m2, m2 ;t2 t2 + punpcklqdq m0, m0 ;t0 t0 + psubw m1, m0, m2 + psraw m1, 1 ;t4 t4 + psubw m1, m3 ;low: t1/out2 high: t3/out1 + psubw m0, m1 ;high: out0 + paddw m2, m1 ;low: out3 +%endmacro + +INIT_XMM sse2 +cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + pxor m2, m2 + mova [coeffq+16*0], m2 + mova [coeffq+16*1], m2 + psraw m0, 2 + psraw m1, 2 + IWHT4_1D_PACKED + punpckhwd m0, m1 + punpcklwd m3, m1, m2 + punpckhdq m1, m0, m3 + punpckldq m0, m3 + IWHT4_1D_PACKED + shufpd m0, m2, 0x01 + ITX4_END 0, 3, 2, 1, 0 + +%macro IDCT8_1D_PACKED 0 + mova m6, [o(pd_2048)] + punpckhwd m4, m0, m3 ;unpacked in1 in7 + punpcklwd m0, m2 ;unpacked in0 in4 + punpckhwd m2, m1 ;unpacked in5 in3 + punpcklwd m1, m3 ;unpacked in2 in6 + ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a + ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a + ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2 + psubsw m3, m4, m2 ;low: t6a high: t5a + paddsw m4, m2 ;low: t7 high: t4 + pshufb m3, [o(deint_shuf1)] + ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1 + ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5 + psubsw m2, m0, m1 ;low: tmp3 high: tmp2 + paddsw m0, m1 ;low: tmp0 high: tmp1 + punpcklqdq m1, m4, m3 ;low: t7 high: t6 + punpckhqdq m4, m3 ;low: t4 high: t5 + psubsw m3, m0, m1 ;low: out7 high: out6 + paddsw m0, m1 ;low: out0 high: out1 + paddsw m1, m2, m4 ;low: out3 high: out2 + psubsw m2, m4 ;low: out4 high: out5 +%endmacro + +;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 + punpckhwd m%4, m%1, m%2 + punpcklwd m%1, m%2 +%if %7 < 8 + pmaddwd m%2, m%7, m%1 + pmaddwd m%3, m%7, m%4 +%else + mova m%2, [o(pw_%7_%6)] +%if %8 + pmaddwd m%3, m%1, m%2 + pmaddwd m%2, m%4 +%else + pmaddwd m%3, m%4, m%2 + pmaddwd m%2, m%1 +%endif +%endif + paddd m%3, m%5 + paddd m%2, m%5 + psrad m%3, 12 + psrad m%2, 12 +%if %8 + packssdw m%3, m%2 +%else + packssdw m%2, m%3 ;dst2 +%endif +%if %7 < 8 + pmaddwd m%4, m%6 + pmaddwd m%1, m%6 +%elif %8 + mova m%2, [o(pw_%6_m%7)] + pmaddwd m%4, m%2 + pmaddwd m%1, m%2 +%else + mova m%3, [o(pw_%6_m%7)] + pmaddwd m%4, m%3 + pmaddwd m%1, m%3 +%endif + paddd m%4, m%5 + paddd m%1, m%5 + psrad m%4, 12 + psrad m%1, 12 + packssdw m%1, m%4 ;dst1 +%endmacro + +%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3 + ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0 + psubsw m%3, m%1, m%2 ;out2 + paddsw m%2, m%1 ;out1 + paddsw m%1, m%5, m%4 ;out0 + psubsw m%4, m%5 ;out3 +%endmacro + +%macro WRITE_4X8 4 ;row[1-4] + WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4 + lea dstq, [dstq+strideq*4] + WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4 +%endmacro + +%macro INV_4X8 0 + punpckhwd m4, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m2 ;low: in2 high: in3 + punpckldq m0, m2 ;low: in0 high: in1 + punpckldq m2, m3, m4 ;low: in4 high: in5 + punpckhdq m3, m4 ;low: in6 high: in7 +%endmacro + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd + pmulhrsw m0, m1 + pmulhrsw m0, m1 + pmulhrsw m0, [o(pw_2048)] + mova m1, m0 + mova m2, m0 + mova m3, m0 + TAIL_CALL m(iadst_4x8_internal_8bpc).end3 +%endif +%endmacro + +INIT_XMM ssse3 +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst +INV_TXFM_4X8_FN dct, identity + +cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(idct_8x4_internal_8bpc).main + jmp m(iadst_4x8_internal_8bpc).pass1_end + +.pass2: + call .main + shufps m1, m1, q1032 + shufps m3, m3, q1032 + mova m4, [o(pw_2048)] + jmp m(iadst_4x8_internal_8bpc).end2 + +ALIGN function_align +cglobal_label .main + IDCT8_1D_PACKED + ret + + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(iadst_8x4_internal_8bpc).main + +.pass1_end: + INV_4X8 + jmp tx2q + +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call .main + mova m4, [o(pw_2048)] + pxor m5, m5 + psubw m5, m4 + +.end: + punpcklqdq m4, m5 + +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + pxor m5, m5 + mova [coeffq+16*0], m5 + mova [coeffq+16*1], m5 + mova [coeffq+16*2], m5 + mova [coeffq+16*3], m5 + +.end3: + WRITE_4X8 0, 1, 2, 3 + RET + +ALIGN function_align +cglobal_label .main + mova m6, [o(pd_2048)] + punpckhwd m4, m3, m0 ;unpacked in7 in0 + punpckhwd m5, m2, m1 ;unpacked in5 in2 + punpcklwd m1, m2 ;unpacked in3 in4 + punpcklwd m0, m3 ;unpacked in1 in6 + ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a + ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a + ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a + ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a + + psubsw m3, m4, m1 ;low: t4 high: t5 + paddsw m4, m1 ;low: t0 high: t1 + psubsw m2, m5, m0 ;low: t6 high: t7 + paddsw m5, m0 ;low: t2 high: t3 + + shufps m1, m3, m2, q1032 + punpckhwd m2, m1 + punpcklwd m3, m1 + ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a + ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a + + psubsw m1, m4, m5 ;low: t2 high: t3 + paddsw m4, m5 ;low: out0 high: -out7 + psubsw m5, m3, m2 ;low: t7 high: t6 + paddsw m3, m2 ;low: out6 high: -out1 + shufps m0, m4, m3, q3210 ;low: out0 high: -out1 + shufps m3, m4, q3210 ;low: out6 high: -out7 + + mova m2, [o(pw_2896_m2896)] + mova m7, [o(pw_2896_2896)] + shufps m4, m1, m5, q1032 ;low: t3 high: t7 + shufps m1, m5, q3210 ;low: t2 high: t6 + punpcklwd m5, m1, m4 + punpckhwd m1, m4 + pmaddwd m4, m2, m1 ;-out5 + pmaddwd m2, m5 ; out4 + pmaddwd m1, m7 ; out2 + pmaddwd m5, m7 ;-out3 + REPX {paddd x, m6}, m4, m2, m1, m5 + REPX {psrad x, 12}, m4, m2, m1, m5 + packssdw m1, m5 ;low: out2 high: -out3 + packssdw m2, m4 ;low: out4 high: -out5 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + call m(iadst_8x4_internal_8bpc).main + + punpcklwd m4, m3, m2 + punpckhwd m3, m2 + punpcklwd m5, m1, m0 + punpckhwd m1, m0 + punpckldq m2, m3, m1 ;low: in4 high: in5 + punpckhdq m3, m1 ;low: in6 high: in7 + punpckldq m0, m4, m5 ;low: in0 high: in1 + punpckhdq m1, m4, m5 ;low: in2 high: in3 + jmp tx2q + +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal_8bpc).main + + mova m4, m0 + mova m5, m1 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + pshufd m2, m5, q1032 + pshufd m3, m4, q1032 + mova m5, [o(pw_2048)] + pxor m4, m4 + psubw m4, m5 + jmp m(iadst_4x8_internal_8bpc).end + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + +.pass1: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_4x8_internal_8bpc).pass1_end + +.pass2: + mova m4, [o(pw_4096)] + jmp m(iadst_4x8_internal_8bpc).end2 + + +%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] + movq m%3, [dstq ] + movq m%4, [dstq+strideq] + pxor m%5, m%5 + punpcklbw m%3, m%5 ;extend byte to word + punpcklbw m%4, m%5 ;extend byte to word +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + movq [dstq ], m%3 + punpckhqdq m%3, m%3 + movq [dstq+strideq], m%3 +%endmacro + +%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3] + WRITE_8X2 %1, %2, %5, %6, %7 + lea dstq, [dstq+strideq*2] + WRITE_8X2 %3, %4, %5, %6, %7 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklqdq m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + pmulhrsw m0, m1 + mova m2, [o(pw_2048)] + pmulhrsw m0, m1 + pmulhrsw m0, m2 + mova m1, m0 + mova m2, m0 + mova m3, m0 + TAIL_CALL m(iadst_8x4_internal_8bpc).end2 +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity + +cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + call m(idct_4x8_internal_8bpc).main + + mova m4, [o(deint_shuf1)] + mova m5, [o(deint_shuf2)] + pshufb m0, m4 + pshufb m1, m5 + pshufb m2, m4 + pshufb m3, m5 + punpckhdq m4, m0, m1 + punpckldq m0, m1 + punpckhdq m5, m2, m3 + punpckldq m2, m3 + punpckhqdq m1, m0, m2 ;in1 + punpcklqdq m0, m2 ;in0 + punpckhqdq m3, m4, m5 ;in3 + punpcklqdq m2 ,m4, m5 ;in2 + jmp tx2q + +.pass2: + call .main + jmp m(iadst_8x4_internal_8bpc).end + +ALIGN function_align +cglobal_label .main + mova m6, [o(pd_2048)] + IDCT4_1D 0, 1, 2, 3, 4, 5, 6 + ret + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal_8bpc).main + + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + pxor m5, m5 + psubsw m3, m5, m1 + psubsw m5, m4 + punpckhdq m4, m5, m3 + punpckldq m5, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhwd m1, m0, m5 ;in1 + punpcklwd m0, m5 ;in0 + punpcklwd m2, m3, m4 ;in2 + punpckhwd m3, m4 ;in3 + jmp tx2q + +.pass2: + call .main + +.end: + mova m4, [o(pw_2048)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + +.end2: + pxor m6, m6 + mova [coeffq+16*0], m6 + mova [coeffq+16*1], m6 + mova [coeffq+16*2], m6 + mova [coeffq+16*3], m6 +.end3: + WRITE_8X4 0, 1, 2, 3, 4, 5, 6 + RET + +ALIGN function_align +cglobal_label .main + punpckhwd m6, m0, m2 ;unpacked in0 in2 + punpcklwd m0, m2 ;unpacked in0 in2 + punpckhwd m7, m1, m3 ;unpacked in1 in3 + punpcklwd m1, m3 ;unpacked in1 in3 + + mova m2, [o(pw_3344_m3344)] + mova m4, [o(pw_0_3344)] + pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2 + pmaddwd m5, m4, m7 ;3344 * in3 + pmaddwd m2, m0 + pmaddwd m4, m1 + paddd m3, m5 + paddd m2, m4 + mova m4, [o(pd_2048)] + paddd m3, m4 ;t2 + 2048 + paddd m2, m4 + psrad m3, 12 + psrad m2, 12 + packssdw m2, m3 ;out2 + + pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 + paddd m3, m4 ;t0 + t3 + + pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + mova m4, [o(pd_2048)] + paddd m0, m4 + paddd m4, m3 ;t0 + t3 + 2048 + paddd m5, m0 ;t1 + t3 + 2048 + paddd m3, m0 + paddd m3, m1 ;t0 + t1 - t3 + 2048 + + psrad m4, 12 ;out0 + psrad m5, 12 ;out1 + psrad m3, 12 ;out3 + packssdw m0, m4, m5 ;low: out0 high: out1 + + pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 + paddd m1, m4 ;t0 + t3 + pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + + mova m4, [o(pd_2048)] + paddd m6, m4 + paddd m4, m1 ;t0 + t3 + 2048 + paddd m5, m6 ;t1 + t3 + 2048 + paddd m1, m6 + paddd m1, m7 ;t0 + t1 - t3 + 2048 + + psrad m4, 12 ;out0 + psrad m5, 12 ;out1 + psrad m1, 12 ;out3 + packssdw m3, m1 ;out3 + packssdw m4, m5 ;low: out0 high: out1 + + punpckhqdq m1, m0, m4 ;out1 + punpcklqdq m0, m4 ;out0 + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + + shufps m0, m0, q1032 + shufps m1, m1, q1032 + call m(iadst_4x8_internal_8bpc).main + + punpckhwd m5, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + + pxor m0, m0 + psubsw m4, m0, m2 + psubsw m0, m5 + punpckhdq m2, m0, m4 + punpckldq m0, m4 + punpckhdq m4, m3, m1 + punpckldq m3, m1 + punpckhwd m1, m0, m3 ;in1 + punpcklwd m0, m3 ;in0 + punpckhwd m3, m2, m4 ;in3 + punpcklwd m2, m4 ;in2 + jmp tx2q + +.pass2: + call m(iadst_8x4_internal_8bpc).main + mova m4, m0 + mova m5, m1 + mova m0, m3 + mova m1, m2 + mova m2, m5 + mova m3, m4 + jmp m(iadst_8x4_internal_8bpc).end + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [coeffq+16*0] + pmulhrsw m1, m3, [coeffq+16*1] + pmulhrsw m2, m3, [coeffq+16*2] + pmulhrsw m3, [coeffq+16*3] + paddsw m0, m0 + paddsw m1, m1 + paddsw m2, m2 + paddsw m3, m3 + + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhdq m5, m4, m1 + punpckldq m4, m1 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckhwd m1, m0, m4 ;in1 + punpcklwd m0, m4 ;in0 + punpcklwd m2, m3, m5 ;in2 + punpckhwd m3, m5 ;in3 + jmp tx2q + +.pass2: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(iadst_8x4_internal_8bpc).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8, 8, 16*4 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mova m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m2 + psrlw m2, 3 + pmulhrsw m0, m1 + pmulhrsw m0, m2 +.end: + mov r3d, 2 + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)] +.loop: + WRITE_8X4 0, 0, 0, 0, 1, 2, 3 + lea dstq, [dstq+strideq*2] + dec r3d + jg .loop + jmp tx2q +.end3: + RET +%endif +%endmacro + +%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 +%if %3 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [%1+%2*0] + pmulhrsw m1, m7, [%1+%2*1] + pmulhrsw m2, m7, [%1+%2*2] + pmulhrsw m3, m7, [%1+%2*3] + pmulhrsw m4, m7, [%1+%2*4] + pmulhrsw m5, m7, [%1+%2*5] + pmulhrsw m6, m7, [%1+%2*6] + pmulhrsw m7, [%1+%2*7] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] + mova m7, [%1+%2*7] +%endif +%endmacro + +%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 + ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a + ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a + psubsw m%2, m%4, m%5 ;t6a + paddsw m%4, m%5 ;t7 + psubsw m%5, m%1, m%3 ;t5a + paddsw m%1, m%3 ;t4 + ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6 +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity + +cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call .main + +.pass1_end: + mova m7, [o(pw_16384)] + +.pass1_end1: + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + +.pass1_end2: + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + +cglobal_label .pass1_end3 + punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53 + punpckhwd m1, m5 ;14 54 15 55 16 56 17 57 + punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47 + punpcklwd m0, m4 ;00 40 01 41 02 42 03 43 + punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77 + punpcklwd m3, m7 ;30 70 31 71 32 72 33 73 + punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77 + punpcklwd m1, m4 ;14 34 54 74 15 35 55 75 + punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73 + punpcklwd m6, m3 ;10 30 50 70 11 31 51 71 + mova [rsp+gprsize+16*2], m6 + mova m6, [rsp+gprsize+16*1] + punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67 + punpcklwd m2, m6 ;20 60 21 61 22 62 23 63 + punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67 + punpcklwd m5, m3 ;04 24 44 64 05 25 45 65 + punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63 + punpcklwd m0, m2 ;00 20 40 60 01 21 41 61 + + punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77 + punpcklwd m6, m7 ;06 16 26 36 46 56 66 76 + mova [rsp+gprsize+16*0], m2 + punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72 + punpckhwd m3, m4 ;03 13 23 33 43 53 63 73 + punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74 + punpckhwd m5, m1 ;05 15 25 35 45 55 65 75 + mova m7, [rsp+gprsize+16*2] + punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71 + punpcklwd m0, m7 ;00 10 20 30 40 50 60 70 + mova m7, [rsp+gprsize+16*0] + jmp tx2q + +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.pass2_main: + call .main + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + +.end2: + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize+16*0] + mova [rsp+gprsize+16*2], m5 + mova [rsp+gprsize+16*0], m7 + +.end3: + WRITE_8X4 0, 1, 2, 3, 5, 6, 7 + lea dstq, [dstq+strideq*2] + WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7 + jmp tx2q + +.end4: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*0], m7 + mova [rsp+gprsize*2+16*1], m3 + mova [rsp+gprsize*2+16*2], m1 + mova m7, [o(pd_2048)] + IDCT4_1D 0, 2, 4, 6, 1, 3, 7 + mova m3, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m2 + mova m2, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*1], m4 + mova m4, [rsp+gprsize*2+16*0] + mova [rsp+gprsize*2+16*0], m6 + IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 + mova m6, [rsp+gprsize*2+16*0] + psubsw m7, m0, m4 ;out7 + paddsw m0, m4 ;out0 + mova [rsp+gprsize*2+16*0], m7 + mova m1, [rsp+gprsize*2+16*2] + psubsw m4, m6, m3 ;out4 + paddsw m3, m6 ;out3 + mova m7, [rsp+gprsize*2+16*1] + psubsw m6, m1, m5 ;out6 + paddsw m1, m5 ;out1 + psubsw m5, m7, m2 ;out5 + paddsw m2, m7 ;out2 + ret + + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call .main + call .main_pass1_end + +.pass1_end: + mova m7, [o(pw_16384)] + +.pass1_end1: + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + pxor m6, m6 + psubw m6, m7 + mova m7, m6 + jmp m(idct_8x8_internal_8bpc).pass1_end2 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.pass2_main: + call .main + call .main_pass2_end + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*1], m6 + pxor m6, m6 + psubw m6, m7 + mova m7, m6 + jmp m(idct_8x8_internal_8bpc).end2 + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*0], m7 + mova [rsp+gprsize*2+16*1], m3 + mova [rsp+gprsize*2+16*2], m4 + mova m7, [o(pd_2048)] + ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a + ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a + paddsw m3, m2, m6 ;t2 + psubsw m2, m6 ;t6 + paddsw m4, m5, m1 ;t3 + psubsw m5, m1 ;t7 + ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a + + mova m6, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m5 + mova m1, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*1], m2 + mova m5, [rsp+gprsize*2+16*0] + mova [rsp+gprsize*2+16*0], m3 + ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a + ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a + psubsw m2, m0, m6 ;t4 + paddsw m0, m6 ;t0 + paddsw m3, m5, m1 ;t1 + psubsw m5, m1 ;t5 + ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a + + mova m7, [rsp+gprsize*2+16*0] + paddsw m1, m3, m4 ;-out7 + psubsw m3, m4 ;t3 + mova [rsp+gprsize*2+16*0], m1 + psubsw m4, m0, m7 ;t2 + paddsw m0, m7 ;out0 + mova m6, [rsp+gprsize*2+16*2] + mova m7, [rsp+gprsize*2+16*1] + paddsw m1, m5, m6 ;-out1 + psubsw m5, m6 ;t6 + paddsw m6, m2, m7 ;out6 + psubsw m2, m7 ;t7 + ret +ALIGN function_align +.main_pass1_end: + mova [rsp+gprsize*2+16*1], m1 + mova [rsp+gprsize*2+16*2], m6 + punpckhwd m1, m4, m3 + punpcklwd m4, m3 + punpckhwd m7, m5, m2 + punpcklwd m5, m2 + mova m2, [o(pw_2896_2896)] + mova m6, [o(pd_2048)] + pmaddwd m3, m2, m7 + pmaddwd m2, m5 + paddd m3, m6 + paddd m2, m6 + psrad m3, 12 + psrad m2, 12 + packssdw m2, m3 ;out2 + mova m3, [o(pw_2896_m2896)] + pmaddwd m7, m3 + pmaddwd m5, m3 + paddd m7, m6 + paddd m5, m6 + psrad m7, 12 + psrad m5, 12 + packssdw m5, m7 ;-out5 + mova m3, [o(pw_2896_2896)] + pmaddwd m7, m3, m1 + pmaddwd m3, m4 + paddd m7, m6 + paddd m3, m6 + psrad m7, 12 + psrad m3, 12 + packssdw m3, m7 ;-out3 + mova m7, [o(pw_2896_m2896)] + pmaddwd m1, m7 + pmaddwd m4, m7 + paddd m1, m6 + paddd m4, m6 + psrad m1, 12 + psrad m4, 12 + packssdw m4, m1 ;-out5 + mova m1, [rsp+gprsize*2+16*1] + mova m6, [rsp+gprsize*2+16*2] + ret +ALIGN function_align +cglobal_label .main_pass2_end + paddsw m7, m4, m3 ;t2 + t3 + psubsw m4, m3 ;t2 - t3 + paddsw m3, m5, m2 ;t6 + t7 + psubsw m5, m2 ;t6 - t7 + mova m2, [o(pw_2896x8)] + pmulhrsw m4, m2 ;out4 + pmulhrsw m5, m2 ;-out5 + pmulhrsw m7, m2 ;-out3 + pmulhrsw m2, m3 ;out2 + mova m3, m7 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + +.pass1: + call m(iadst_8x8_internal_8bpc).main + call m(iadst_8x8_internal_8bpc).main_pass1_end + +.pass1_end: + mova m7, [o(pw_m16384)] + +.pass1_end1: + pmulhrsw m1, m7 + mova [rsp+gprsize+16*1], m1 + mova m1, m6 + mova m6, m2 + pmulhrsw m2, m5, m7 + mova m5, m6 + mova m6, m4 + pmulhrsw m4, m3, m7 + mova m3, m6 + mova m6, m0 + mova m0, m7 + pxor m7, m7 + psubw m7, m0 + pmulhrsw m0, [rsp+gprsize+16*0] + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.pass2_main: + call m(iadst_8x8_internal_8bpc).main + call m(iadst_8x8_internal_8bpc).main_pass2_end + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova [rsp+gprsize+16*2], m2 + mova m2, m0 + pxor m0, m0 + psubw m0, m7 + mova m7, m2 + pmulhrsw m1, m0 + pmulhrsw m2, m5, m0 + mova [rsp+gprsize+16*1], m1 + mova m5, m4 + mova m1, m6 + pmulhrsw m4, m3, m0 + pmulhrsw m0, [rsp+gprsize+16*0] + mova m3, m5 + mova [rsp+gprsize+16*0], m7 + jmp m(idct_8x8_internal_8bpc).end3 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq, 16 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +ALIGN function_align +.pass2: + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + +.end: + pmulhrsw m7, [o(pw_4096)] + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_4096)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + mova [rsp+gprsize+16*2], m5 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).end3 + + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16, 8 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [coeffq], eobd + pmulhrsw m0, [o(pw_16384)] + pmulhrsw m0, m1 + pmulhrsw m0, [o(pw_2048)] +.end: + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + lea dstq, [dstq+strideq*4] + WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 + RET +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst +INV_TXFM_4X16_FN dct, identity + +cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_4x8_internal_8bpc).pass1)] + +.pass1: + mova m0, [coeffq+16*1] + mova m1, [coeffq+16*3] + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*7] + push tx2q + lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)] + jmp r3 + +.pass1_2: + mova [coeffq+16*1], m0 + mova [coeffq+16*3], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*2] + mova m2, [coeffq+16*4] + mova m3, [coeffq+16*6] + lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)] + jmp r3 + +.pass1_end: + pop tx2q + + mova m4, [coeffq+16*1] + mova m5, [coeffq+16*3] + mova m6, [coeffq+16*5] + mova m7, [o(pw_16384)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*7], m7 + jmp tx2q + +.pass2: + call m(idct_16x4_internal_8bpc).main + +.end: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*4], m4 + +.end1: + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + mov r3, coeffq + WRITE_4X8 0, 1, 3, 2 + + mova m0, [r3+16*4] + mova m1, [r3+16*5] + mova m2, [r3+16*6] + mova m3, m7 + lea dstq, [dstq+strideq*4] + WRITE_4X8 0, 1, 3, 2 + +.end2: + pxor m7, m7 + REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)] + jmp m(idct_4x16_internal_8bpc).pass1 + +.pass2: + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass2_end + + punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 + punpckhqdq m4, m5 ;low: out8 high: out10 + punpcklqdq m5, m7, m2 ;low: out4 high: out6 + punpckhqdq m2, m7 ;low: -out9 high: -out11 + mova [coeffq+16*4], m2 + mova [coeffq+16*5], m6 + mova m2, [coeffq+16*6] + mova m6, [coeffq+16*7] + punpckhqdq m1, m6, m0 ;low: -out13 high: -out15 + punpcklqdq m0, m6 ;low: out0 high: out2 + punpckhqdq m6, m3, m2 ;low: out12 high: out14 + punpcklqdq m2, m3 ;low: -out1 high: -out3 + + mova m7, [o(pw_2048)] + +.end1: + REPX {pmulhrsw x, m7}, m0, m5, m4, m6 + pxor m3, m3 + psubw m3, m7 + mova m7, [coeffq+16*4] + REPX {pmulhrsw x, m3}, m2, m7, m1 + pmulhrsw m3, [coeffq+16*5] + mova [coeffq+16*7], m5 + + punpckhqdq m5, m4, m7 ;low: out10 high: out11 + punpcklqdq m4, m7 ;low: out8 high: out9 + punpckhqdq m7, m6, m1 ;low: out14 high: out15 + punpcklqdq m6, m1 ;low: out12 high: out13 + punpckhqdq m1, m0, m2 ;low: out2 high: out3 + punpcklqdq m0, m2 ;low: out0 high: out1 + mova [coeffq+16*4], m4 + mova m4, [coeffq+16*7] + punpcklqdq m2, m4, m3 ;low: out4 high: out5 + punpckhqdq m4, m3 ;low: out6 high: out7 + mova m3, m4 + +.end2: + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + mov r3, coeffq + WRITE_4X8 0, 1, 2, 3 + + mova m0, [r3+16*4] + mova m1, [r3+16*5] + mova m2, [r3+16*6] + mova m3, m7 + lea dstq, [dstq+strideq*4] + WRITE_4X8 0, 1, 2, 3 + +.end3: + pxor m7, m7 + REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + ret + + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)] + jmp m(idct_4x16_internal_8bpc).pass1 + +.pass2: + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass2_end + + punpckhqdq m6, m5, m4 ;low: out5 high: out7 + punpcklqdq m4, m5 ;low: -out8 high: -out10 + punpckhqdq m5, m7, m2 ;low: -out4 high: -out6 + punpcklqdq m2, m7 ;low: out9 high: out11 + mova [coeffq+16*4], m2 + mova [coeffq+16*5], m6 + mova m2, [coeffq+16*6] + mova m6, [coeffq+16*7] + punpcklqdq m1, m6, m0 ;low: out13 high: out15 + punpckhqdq m0, m6 ;low: -out0 high: -out2 + punpcklqdq m6, m3, m2 ;low: -out12 high: -out14 + punpckhqdq m2, m3 ;low: out1 high: out3 + + mova m7, [o(pw_m2048)] + jmp m(iadst_4x16_internal_8bpc).end1 + + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394] + pmulhrsw m%2, m%3, m%1 +%if %0 == 4 ; if downshifting by 1 + pmulhrsw m%2, m%4 +%else + paddsw m%1, m%1 +%endif + paddsw m%1, m%2 +%endmacro + +cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m0, [coeffq+16*1] + mova m6, [o(pw_1697x8)] + mova m1, [coeffq+16*3] + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*7] + pcmpeqw m7, m7 + mov r3, tx2q + lea tx2q, [o(.pass1_2)] +.pass1: + pmulhrsw m4, m6, m0 + pmulhrsw m5, m6, m1 + pavgw m4, m0 + pcmpeqw m0, m7 + pavgw m5, m1 + pcmpeqw m1, m7 + pandn m0, m4 + pmulhrsw m4, m6, m2 + pandn m1, m5 + pmulhrsw m5, m6, m3 + pavgw m4, m2 + pcmpeqw m2, m7 + pavgw m5, m3 + pcmpeqw m3, m7 + pandn m2, m4 + pandn m3, m5 + jmp m(iadst_4x8_internal_8bpc).pass1_end +.pass1_2: + mova [coeffq+16*1], m0 + mova [coeffq+16*3], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*2] + mova m2, [coeffq+16*4] + mova m3, [coeffq+16*6] + lea tx2q, [o(.pass1_end)] + jmp .pass1 +.pass1_end: + mova m4, [coeffq+16*1] + mova m5, [coeffq+16*3] + mova m6, [coeffq+16*5] + jmp r3 +.pass2: + mova m7, [o(pw_1697x16)] + mova [coeffq+16*6], m6 + REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 + mova m6, [coeffq+16*7] + IDTX16 6, 7, 7 + mova [coeffq+16*7], m6 + mova m6, [coeffq+16*6] + pmulhrsw m7, m6, [o(pw_1697x16)] + paddsw m6, m6 + paddsw m6, m7 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*7] + mova [coeffq+16*4], m4 + jmp m(iadst_4x16_internal_8bpc).end2 + + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4, 8 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + mov r2d, 2 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)] +.dconly: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m5, m5 +.dconly_loop: + mova m1, [dstq] + mova m3, [dstq+strideq] + punpckhbw m2, m1, m5 + punpcklbw m1, m5 + punpckhbw m4, m3, m5 + punpcklbw m3, m5 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb m1, m2 + packuswb m3, m4 + mova [dstq], m1 + mova [dstq+strideq], m3 + lea dstq, [dstq+strideq*2] + dec r2d + jg .dconly_loop + jmp tx2q +.end: + RET +%endif +%endmacro + +%macro LOAD_7ROWS 2 ;src, stride + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] + mova m4, [%1+%2*4] + mova m5, [%1+%2*5] + mova m6, [%1+%2*6] +%endmacro + +%macro SAVE_7ROWS 2 ;src, stride + mova [%1+%2*0], m0 + mova [%1+%2*1], m1 + mova [%1+%2*2], m2 + mova [%1+%2*3], m3 + mova [%1+%2*4], m4 + mova [%1+%2*5], m5 + mova [%1+%2*6], m6 +%endmacro + +%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3] + punpckhwd m%5, m%4, m%1 ;packed in13 in3 + punpcklwd m%1, m%4 ;packed in1 in15 + punpcklwd m%4, m%3, m%2 ;packed in9 in7 + punpckhwd m%2, m%3 ;packed in5 in11 + mova m%7, [o(pd_2048)] + ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a + ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a + ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a + ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a + psubsw m%6, m%1, m%4 ;low: t9 high: t14 + paddsw m%1, m%4 ;low: t8 high: t15 + psubsw m%4, m%5, m%2 ;low: t10 high: t13 + paddsw m%5, m%2 ;low: t11 high: t12 + mova m%2, [o(deint_shuf2)] + pshufb m%6, m%2 + pshufb m%4, m%2 + ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a + ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a + psubsw m%3, m%1, m%5 ;low: t11a high: t12a + paddsw m%1, m%5 ;low: t8a high: t15a + psubsw m%5, m%6, m%4 ;low: t10 high: t13 + paddsw m%6, m%4 ;low: t9 high: t14 + pshufb m%3, m%2 + pshufb m%5, m%2 + ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11 + ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a + packssdw m%2, m%4 ;low: t11 high: t10a + packssdw m%3, m%5 ;low: t12 high: t13a + punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14 + punpcklqdq m%1, m%6 ;low: t8a high: t9 +%endmacro + +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity + +cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call .main + +.pass1_end: + punpckhwd m7, m0, m2 ;packed out1, out5 + punpcklwd m0, m2 ;packed out0, out4 + punpcklwd m2, m1, m3 ;packed out3, out7 + punpckhwd m1, m3 ;packed out2, out6 + mova [coeffq+16*6], m7 + mova m7, [coeffq+16*7] + punpckhwd m3, m4, m6 ;packed out9, out13 + punpcklwd m4, m6 ;packed out8, out12 + punpcklwd m6, m5, m7 ;packed out11, out15 + punpckhwd m5, m7 ;packed out10, out14 + +.pass1_end2: + mova m7, [o(pw_16384)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [coeffq+16*6] + mova [coeffq+16*6], m7 + +.pass1_end3: + punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high + punpcklwd m3, m6 ;packed 9, 10, 13, 15 low + punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high + punpcklwd m4, m5 ;packed 8, 10, 12, 14 low + punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1) + punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0) + punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3) + punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2) + mova [coeffq+16*7], m3 + mova m3, [coeffq+16*6] + punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high + punpcklwd m3, m2 ;packed 1, 3, 5, 7 low + punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high + punpcklwd m0, m1 ;packed 0, 2, 4, 6 low + punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1) + punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0) + punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3) + punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2) + jmp tx2q + +.pass2: + lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)] + +.pass2_end: + mova [coeffq+16*4], m4 + mova [coeffq+16*5], m5 + mova [coeffq+16*6], m6 + lea r3, [dstq+8] + call tx2q + + add coeffq, 16*4 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + mov dstq, r3 + jmp tx2q + +ALIGN function_align +cglobal_label .main + punpckhqdq m7, m0, m1 ;low:in1 high:in3 + punpcklqdq m0, m1 + punpcklqdq m1, m2, m3 + punpckhqdq m3, m2 ;low:in7 high:in5 + mova [coeffq+16*4], m7 + mova [coeffq+16*5], m3 + mova m7, [coeffq+16*7] + punpcklqdq m2, m4, m5 + punpckhqdq m4, m5 ;low:in9 high:in11 + punpcklqdq m3, m6, m7 + punpckhqdq m7, m6 ;low:in15 high:in13 + mova [coeffq+16*6], m4 + IDCT8_1D_PACKED + mova m6, [coeffq+16*4] + mova m4, [coeffq+16*5] + mova m5, [coeffq+16*6] + mova [coeffq+16*4], m1 + mova [coeffq+16*5], m2 + mova [coeffq+16*6], m3 + + IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3 + + mova m1, [coeffq+16*4] + psubsw m3, m0, m7 ;low:out15 high:out14 + paddsw m0, m7 ;low:out0 high:out1 + psubsw m7, m1, m5 ;low:out12 high:out13 + paddsw m1, m5 ;low:out3 high:out2 + mova [coeffq+16*7], m3 + mova m2, [coeffq+16*5] + mova m3, [coeffq+16*6] + psubsw m5, m2, m4 ;low:out11 high:out10 + paddsw m2, m4 ;low:out4 high:out5 + psubsw m4, m3, m6 ;low:out8 high:out9 + paddsw m3, m6 ;low:out7 high:out6 + mova m6, m7 + ret + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call .main + call .main_pass1_end + + punpckhwd m6, m7, m0 ;packed -out11, -out15 + punpcklwd m0, m7 ;packed out0, out4 + punpcklwd m7, m3, m4 ;packed -out3, -out7 + punpckhwd m4, m3 ;packed out8, out12 + mova m1, [coeffq+16*6] + punpcklwd m3, m1, m5 ;packed -out1, -out5 + punpckhwd m5, m1 ;packed out10, out14 + mova m1, [coeffq+16*7] + mova [coeffq+16*6], m3 + mova [coeffq+16*7], m7 + punpckhwd m3, m2, m1 ;packed -out9, -out13 + punpcklwd m1, m2 ;packed out2, out6 + + mova m7, [o(pw_16384)] + +.pass1_end: + REPX {pmulhrsw x, m7}, m0, m1, m4, m5 + pxor m2, m2 + psubw m2, m7 + mova m7, [coeffq+16*6] + REPX {pmulhrsw x, m2}, m7, m3, m6 + pmulhrsw m2, [coeffq+16*7] + mova [coeffq+16*6], m7 + jmp m(idct_16x4_internal_8bpc).pass1_end3 + +.pass2: + lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end + +ALIGN function_align +cglobal_label .main + mova [coeffq+16*6], m0 + pshufd m0, m1, q1032 + pshufd m2, m2, q1032 + punpckhwd m1, m6, m0 ;packed in13, in2 + punpcklwd m0, m6 ;packed in3, in12 + punpckhwd m7, m5, m2 ;packed in11, in4 + punpcklwd m2, m5 ;packed in5, in10 + mova m6, [o(pd_2048)] + ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3 + ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5 + ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11 + ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13 + psubsw m5, m1, m2 ;low:t10a high:t11a + paddsw m1, m2 ;low:t2a high:t3a + psubsw m2, m7, m0 ;low:t12a high:t13a + paddsw m7, m0 ;low:t4a high:t5a + punpcklqdq m0, m5 + punpckhwd m0, m5 ;packed t10a, t11a + punpcklqdq m5, m2 + punpckhwd m2, m5 ;packed t13a, t12a + ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11 + ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13 + mova [coeffq+16*4], m1 + mova [coeffq+16*5], m7 + mova m1, [coeffq+16*6] + mova m7, [coeffq+16*7] + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + punpckhwd m5, m7, m1 ;packed in15, in0 + punpcklwd m1, m7 ;packed in1, in14 + punpckhwd m7, m4, m3 ;packed in9, in6 + punpcklwd m3, m4 ;packed in7, in8 + ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1 + ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7 + ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9 + ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15 + psubsw m4, m5, m3 ;low:t8a high:t9a + paddsw m5, m3 ;low:t0a high:t1a + psubsw m3, m7, m1 ;low:t14a high:t15a + paddsw m7, m1 ;low:t6a high:t7a + punpcklqdq m1, m4 + punpckhwd m1, m4 ;packed t8a, t9a + punpcklqdq m4, m3 + punpckhwd m3, m4 ;packed t15a, t14a + ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9 + ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15 + paddsw m4, m1, m2 ;low:t12a high:t13a + psubsw m1, m2 ;low:t8a high:t9a + psubsw m2, m0, m3 ;low:t14a high:t15a + paddsw m0, m3 ;low:t10a high:t11a + punpcklqdq m3, m1 + punpckhwd m3, m1 ;packed t12a, t13a + punpcklqdq m1, m2 + punpckhwd m2, m1 ;packed t15a, t14a + ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13 + ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15 + psubsw m1, m3, m2 ;low:t14a high:t15a + paddsw m3, m2 ;low:out2 high:-out13 + psubsw m2, m4, m0 ;low:t10 high:t11 + paddsw m0, m4 ;low:-out1 high:out14 + mova [coeffq+16*6], m0 + mova [coeffq+16*7], m3 + mova m0, [coeffq+16*4] + mova m3, [coeffq+16*5] + psubsw m4, m5, m3 ;low:t4 high:t5 + paddsw m5, m3 ;low:t0 high:t1 + psubsw m3, m0, m7 ;low:t6 high:t7 + paddsw m0, m7 ;low:t2 high:t3 + punpcklqdq m7, m4 + punpckhwd m7, m4 ;packed t4, t5 + punpcklqdq m4, m3 + punpckhwd m3, m4 ;packed t7, t6 + ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a + ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a + psubsw m4, m5, m0 ;low:t2a high:t3a + paddsw m0, m5 ;low:out0 high:-out15 + psubsw m5, m7, m3 ;low:t6 high:t7 + paddsw m3, m7 ;low:-out3 high:out12 + ret +ALIGN function_align +.main_pass1_end: + mova m7, [o(deint_shuf1)] + mova [coeffq+16*4], m0 + mova [coeffq+16*5], m3 + mova m0, [o(pw_2896_m2896)] + mova m3, [o(pw_2896_2896)] + pshufb m1, m7 ;t14a t15a + pshufb m2, m7 ;t10 t11 + pshufb m4, m7 ;t2a t3a + pshufb m5, m7 ;t6 t7 + pmaddwd m7, m0, m2 + pmaddwd m2, m3 + paddd m7, m6 + paddd m2, m6 + psrad m7, 12 + psrad m2, 12 + packssdw m2, m7 ;low:out6 high:-out9 + pmaddwd m7, m0, m4 + pmaddwd m4, m3 + paddd m7, m6 + paddd m4, m6 + psrad m7, 12 + psrad m4, 12 + packssdw m4, m7 ;low:-out7 high:out8 + pmaddwd m7, m3, m5 + pmaddwd m5, m0 + paddd m7, m6 + paddd m5, m6 + psrad m7, 12 + psrad m5, 12 + packssdw m7, m5 ;low:out4 high:-out11 + pmaddwd m5, m3, m1 + pmaddwd m1, m0 + paddd m5, m6 + paddd m1, m6 + psrad m5, 12 + psrad m1, 12 + packssdw m5, m1 ;low:-out5 high:out10 + mova m0, [coeffq+16*4] + mova m3, [coeffq+16*5] + ret +ALIGN function_align +cglobal_label .main_pass2_end + mova m7, [o(pw_2896x8)] + punpckhqdq m6, m2, m1 ;low:t11 high:t15a + punpcklqdq m2, m1 ;low:t10 high:t14a + psubsw m1, m2, m6 + paddsw m2, m6 + punpckhqdq m6, m4, m5 ;low:t3a high:t7 + punpcklqdq m4, m5 ;low:t2a high:t6 + psubsw m5, m4, m6 + paddsw m4, m6 + pmulhrsw m1, m7 ;low:-out9 high:out10 + pmulhrsw m2, m7 ;low:out6 high:-out5 + pmulhrsw m5, m7 ;low:out8 high:-out11 + pmulhrsw m4, m7 ;low:-out7 high:out4 + punpckhqdq m7, m4, m5 ;low:out4 high:-out11 + punpcklqdq m4, m5 ;low:-out7 high:out8 + punpckhqdq m5, m2, m1 ;low:-out5 high:out10 + punpcklqdq m2, m1 ;low:out6 high:-out9 + ret + + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_7ROWS coeffq, 16 + call m(iadst_16x4_internal_8bpc).main + call m(iadst_16x4_internal_8bpc).main_pass1_end + + punpcklwd m6, m7, m0 ;packed out11, out15 + punpckhwd m0, m7 ;packed -out0, -out4 + punpckhwd m7, m3, m4 ;packed out3, out7 + punpcklwd m4, m3 ;packed -out8, -out12 + mova m1, [coeffq+16*6] + punpckhwd m3, m1, m5 ;packed out1, out5 + punpcklwd m5, m1 ;packed -out10, -out14 + mova m1, [coeffq+16*7] + mova [coeffq+16*6], m3 + mova [coeffq+16*7], m7 + punpcklwd m3, m2, m1 ;packed out9, out13 + punpckhwd m1, m2 ;packed -out2, -out6 + + mova m7, [o(pw_m16384)] + jmp m(iadst_16x4_internal_8bpc).pass1_end + +.pass2: + lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end + + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m1, [coeffq+16*6] + mova m0, [coeffq+16*5] + mova m2, [coeffq+16*7] + mova m6, [o(pw_1697x16)] + mova m7, [o(pw_16384)] + pmulhrsw m4, m6, m1 + pmulhrsw m3, m6, m0 + pmulhrsw m5, m6, m2 + pmulhrsw m4, m7 + pmulhrsw m3, m7 + pmulhrsw m5, m7 + paddsw m1, m4 + paddsw m0, m3 + paddsw m5, m2 + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + mova m4, [coeffq+16*4] + mova [coeffq+16*6], m1 + mova [coeffq+16*5], m0 + mova [coeffq+16*7], m5 + pmulhrsw m0, m6, m2 + pmulhrsw m1, m6, m3 + pmulhrsw m5, m6, m4 + pmulhrsw m0, m7 + pmulhrsw m1, m7 + pmulhrsw m5, m7 + paddsw m2, m0 + paddsw m3, m1 + paddsw m4, m5 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + pmulhrsw m5, m6, m0 + pmulhrsw m6, m1 + pmulhrsw m5, m7 + pmulhrsw m6, m7 + paddsw m0, m5 + paddsw m1, m6 + mova m6, [coeffq+16*6] + mova m5, [coeffq+16*5] + punpckhwd m7, m0, m2 ;packed out1, out5 + punpcklwd m0, m2 ;packed out0, out4 + punpckhwd m2, m1, m3 ;packed out3, out7 + punpcklwd m1, m3 ;packed out2, out6 + mova [coeffq+16*6], m7 + mova m7, [coeffq+16*7] + punpckhwd m3, m4, m6 ;packed out9, out13 + punpcklwd m4, m6 ;packed out8, out12 + punpckhwd m6, m5, m7 ;packed out11, out15 + punpcklwd m5, m7 ;packed out10, out14 + jmp m(idct_16x4_internal_8bpc).pass1_end3 + +.pass2: + lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)] + jmp m(idct_16x4_internal_8bpc).pass2_end + + +%macro SAVE_8ROWS 2 ;src, stride + mova [%1+%2*0], m0 + mova [%1+%2*1], m1 + mova [%1+%2*2], m2 + mova [%1+%2*3], m3 + mova [%1+%2*4], m4 + mova [%1+%2*5], m5 + mova [%1+%2*6], m6 + mova [%1+%2*7], m7 +%endmacro + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16, 8, 16*16 +%ifidn %1_%2, dct_dct + pshuflw m0, [coeffq], q0000 + punpcklwd m0, m0 + mova m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mova m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + pmulhrsw m0, m2 + psrlw m2, 3 ; pw_2048 + pmulhrsw m0, m1 + pmulhrsw m0, m2 + mov r3d, 4 + lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop +.end: + RET +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, identity + +cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(idct_8x8_internal_8bpc).pass1)] + +.pass1: + LOAD_8ROWS coeffq+16*1, 32, 1 + mov [rsp+gprsize+16*11], tx2q + lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)] + jmp r3 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32, 1 + mov tx2q, [rsp+gprsize+16*11] + jmp r3 + +.pass2: + lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] + +.pass2_pre: + mova [coeffq+16*2 ], m1 + mova [coeffq+16*6 ], m3 + mova [coeffq+16*10], m5 + mova [coeffq+16*14], m7 + mova m1, m2 + mova m2, m4 + mova m3, m6 + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*5 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*13] + +.pass2_main: + call m(idct_8x8_internal_8bpc).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [coeffq+16*2 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*10] + mova m3, [coeffq+16*14] + mova m4, [coeffq+16*3 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*15] + call m(idct_16x8_internal_8bpc).main + + mov r3, dstq + lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity + +cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)] + jmp m(idct_8x16_internal_8bpc).pass1 + +.pass2: + lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] + +.pass2_pre: + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + +.pass2_main: + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*3 ] + mova m6, [coeffq+16*13] + mova m7, [coeffq+16*15] + mova [rsp+gprsize+16*3], m4 + mova [rsp+gprsize+16*4], m5 + mova [rsp+gprsize+16*9], m6 + mova [rsp+gprsize+32*5], m7 + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*11] + + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + + mov r3, dstq + lea dstq, [dstq+strideq*8] + jmp m(iadst_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iadst_8x8_internal_8bpc).end + + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)] + jmp m(idct_8x16_internal_8bpc).pass1 + +.pass2: + lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)] + lea r3, [dstq+strideq*8] + +.pass2_pre: + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + mova m0, m2 + mova m1, m3 + mova m2, m4 + mova m3, m5 + +.pass2_main: + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*3 ] + mova m6, [coeffq+16*13] + mova m7, [coeffq+16*15] + mova [rsp+gprsize+16*3], m4 + mova [rsp+gprsize+16*4], m5 + mova [rsp+gprsize+16*9], m6 + mova [rsp+gprsize+32*5], m7 + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*7 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*11] + + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass2_end + jmp m(iflipadst_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iflipadst_8x8_internal_8bpc).end + + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*1, 32, 1 + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32, 1 + mov tx2q, r3 + mova [rsp+gprsize+16*1], m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass2: + lea tx2q, [o(.end1)] + +.end: + mova [rsp+gprsize+16*0], m7 + mova [rsp+gprsize+16*1], m6 + mova m7, [o(pw_1697x16)] + REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5 + mova m6, [rsp+gprsize+16*1] + mova [rsp+gprsize+16*2], m5 + IDTX16 6, 5, 7 + mova m5, [rsp+gprsize+16*0] + IDTX16 5, 7, 7 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize+16*2] + mova [rsp+gprsize+16*0], m5 + mova [rsp+gprsize+16*1], m6 + mova [rsp+gprsize+16*2], m7 + jmp m(idct_8x8_internal_8bpc).end3 + +.end1: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + lea dstq, [dstq+strideq*2] + jmp .end + + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8, 8, 16*16 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r2d, 4 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +.end: + RET +%endif +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, identity + +cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*0, 32, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*1, 32, 1 + call .main + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass2: + lea tx2q, [o(.end)] + lea r3, [dstq+8] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).pass2_main + + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+16*2], m6 + mova [rsp+gprsize*2+32*5], m5 + + mova m6, [o(pd_2048)] + ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a + ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a + psubsw m2, m0, m4 ;t9 + paddsw m0, m4 ;t8 + psubsw m4, m7, m3 ;t14 + paddsw m7, m3 ;t15 + ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a + mova m3, [rsp+gprsize*2+16*1] + mova m5, [rsp+gprsize*2+32*5] + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+32*5], m4 + mova m2, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*2], m7 + ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a + ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a + psubsw m4, m2, m3 ;t10 + paddsw m2, m3 ;t11 + psubsw m3, m1, m5 ;t13 + paddsw m1, m5 ;t12 + ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a + mova m7, [rsp+gprsize*2+32*5] + psubsw m6, m0, m2 ;t11a + paddsw m0, m2 ;t8a + paddsw m2, m7, m3 ;t9 + psubsw m7, m3 ;t10 + mova m5, [rsp+gprsize*2+16*0] + psubsw m3, m5, m0 ;out8 + paddsw m0, m5 ;out7 + mova [rsp+gprsize*2+32*5], m0 + mova m5, [rsp+gprsize*2+16*9] + psubsw m0, m5, m2 ;out9 + paddsw m2, m5 ;out6 + mova [rsp+gprsize*2+16*0], m0 + mova [rsp+gprsize*2+16*9], m2 + mova m0, [rsp+gprsize*2+16*1] + mova m2, [rsp+gprsize*2+16*2] + mova [rsp+gprsize*2+16*1], m3 + psubsw m5, m0, m4 ;t13 + paddsw m0, m4 ;t14 + mova m3, [o(pd_2048)] + psubsw m4, m2, m1 ;t12a + paddsw m1, m2 ;t15a + mova [rsp+gprsize*2+16*2], m1 + ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a + ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12 + mova m3, [rsp+gprsize*2+16*8] + psubsw m2, m3, m5 ;out10 + paddsw m3, m5 ;out5 + mova m5, [rsp+gprsize*2+16*7] + mova [rsp+gprsize*2+16*8], m3 + psubsw m3, m5, m4 ;out11 + paddsw m5, m4 ;out4 + mova m4, [rsp+gprsize*2+16*6] + mova [rsp+gprsize*2+16*7], m5 + paddsw m5, m4, m6 ;out3 + psubsw m4, m6 ;out12 + mova m6, [rsp+gprsize*2+16*5] + mova [rsp+gprsize*2+16*6], m5 + psubsw m5, m6, m7 ;out13 + paddsw m6, m7 ;out2 + mova m7, [rsp+gprsize*2+16*4] + mova [rsp+gprsize*2+16*5], m6 + psubsw m6, m7, m0 ;out14 + paddsw m7, m0 ;out1 + mova m1, [rsp+gprsize*2+16*2] + mova m0, [rsp+gprsize*2+16*3] + mova [rsp+gprsize*2+16*4], m7 + psubsw m7, m0, m1 ;out15 + paddsw m0, m1 ;out0 + mova [rsp+gprsize*2+16*3], m0 + mova m1, [rsp+gprsize*2+16*0] + mova m0, [rsp+gprsize*2+16*1] + mova [rsp+gprsize*2+16*0], m7 + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [coeffq+16*0 ] + pmulhrsw m1, m7, [coeffq+16*1 ] + pmulhrsw m2, m7, [coeffq+16*14] + pmulhrsw m3, m7, [coeffq+16*15] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + pmulhrsw m0, m7, [coeffq+16*6 ] + pmulhrsw m1, m7, [coeffq+16*7 ] + pmulhrsw m2, m7, [coeffq+16*8 ] + pmulhrsw m3, m7, [coeffq+16*9 ] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + pmulhrsw m0, m7, [coeffq+16*2 ] + pmulhrsw m1, m7, [coeffq+16*3 ] + pmulhrsw m2, m7, [coeffq+16*4 ] + pmulhrsw m3, m7, [coeffq+16*5 ] + pmulhrsw m4, m7, [coeffq+16*10] + pmulhrsw m5, m7, [coeffq+16*11] + pmulhrsw m6, m7, [coeffq+16*12] + pmulhrsw m7, [coeffq+16*13] + + call .main + call .main_pass1_end + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + jmp m(iadst_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(iadst_8x8_internal_8bpc).pass1_end + +.pass2: + lea tx2q, [o(.end)] + lea r3, [dstq+8] + jmp m(iadst_8x8_internal_8bpc).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iadst_8x8_internal_8bpc).pass2_main + +ALIGN function_align +cglobal_label .main + mova [rsp+gprsize*2+16*0], m1 + mova [rsp+gprsize*2+16*1], m2 + mova [rsp+gprsize*2+16*2], m6 + + mova m6, [o(pd_2048)] + ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2 + ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10 + psubsw m1, m0, m4 ;t10a + paddsw m0, m4 ;t2a + psubsw m4, m7, m3 ;t11a + paddsw m3, m7 ;t3a + ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10 + mova m2, [rsp+gprsize*2+16*0] ;in3 + mova m7, [rsp+gprsize*2+16*1] ;in4 + mova [rsp+gprsize*2+16*0], m1 ;t11 + mova [rsp+gprsize*2+16*1], m4 ;t10 + mova m1, [rsp+gprsize*2+16*2] ;in12 + mova [rsp+gprsize*2+16*2], m0 ;t2a + ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4 + ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12 + psubsw m0, m7, m1 ;t12a + paddsw m1, m7 ;t4a + psubsw m4, m5, m2 ;t13a + paddsw m5, m2 ;t5a + ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13 + mova m2, [rsp+gprsize*2+16*8] ;in1 + mova m7, [rsp+gprsize*2+16*9] ;in14 + mova [rsp+gprsize*2+16*8], m4 ;t12 + mova [rsp+gprsize*2+16*9], m0 ;t13 + mova m4, [rsp+gprsize*2+16*4] ;in9 + mova m0, [rsp+gprsize*2+16*5] ;in6 + mova [rsp+gprsize*2+16*4], m1 ;t4a + mova [rsp+gprsize*2+16*5], m5 ;t5a + ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14 + ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6 + psubsw m1, m0, m7 ;t14a + paddsw m0, m7 ;t6a + psubsw m5, m4, m2 ;t15a + paddsw m4, m2 ;t7a + ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15 + mova m2, [rsp+gprsize*2+16*2] ;t2a + mova [rsp+gprsize*2+16*2], m5 ;t14 + psubsw m7, m2, m0 ;t6 + paddsw m2, m0 ;t2 + psubsw m0, m3, m4 ;t7 + paddsw m3, m4 ;t3 + ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a + mova m4, [rsp+gprsize*2+16*7] ;in0 + mova m5, [rsp+gprsize*2+32*5] ;in15 + mova [rsp+gprsize*2+16*7], m3 ;t3 + mova [rsp+gprsize*2+32*5], m1 ;t15 + mova m1, [rsp+gprsize*2+16*6] ;in7 + mova m3, [rsp+gprsize*2+16*3] ;in8 + mova [rsp+gprsize*2+16*6], m7 ;t7a + mova [rsp+gprsize*2+16*3], m0 ;t6a + ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0 + ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8 + psubsw m0, m4, m3 ;t8a + paddsw m4, m3 ;t0a + psubsw m3, m5, m1 ;t9a + paddsw m5, m1 ;t1a + ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8 + mova m1, [rsp+gprsize*2+16*4] ;t4a + mova m7, [rsp+gprsize*2+16*5] ;t5a + mova [rsp+gprsize*2+16*4], m3 ;t8 + mova [rsp+gprsize*2+16*5], m0 ;t9 + psubsw m0, m4, m1 ;t4 + paddsw m4, m1 ;t0 + psubsw m3, m5, m7 ;t5 + paddsw m5, m7 ;t1 + ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a + mova m7, [rsp+gprsize*2+16*3] ;t6a + psubsw m1, m4, m2 ;t2a + paddsw m4, m2 ;out0 + mova [rsp+gprsize*2+16*3], m4 ;out0 + mova m4, [rsp+gprsize*2+16*6] ;t7a + psubsw m2, m3, m7 ;t6 + paddsw m3, m7 ;-out3 + mova [rsp+gprsize*2+16*6], m3 ;-out3 + psubsw m3, m0, m4 ;t7 + paddsw m0, m4 ;out12 + mova [rsp+gprsize*2+16*12], m3 + mova m3, [rsp+gprsize*2+16*7] ;t3 + mova [rsp+gprsize*2+16* 7], m2 ;out4 + psubsw m2, m5, m3 ;t3a + paddsw m5, m3 ;-out15 + mova [rsp+gprsize*2+16*11], m2 + mova m2, [rsp+gprsize*2+32*5] ;t15 + mova [rsp+gprsize*2+16*10], m1 ;-out7 + mova m1, [rsp+gprsize*2+16*0] ;t11 + mova [rsp+gprsize*2+16*0 ], m5 ;-out15 + mova m3, [rsp+gprsize*2+16*1] ;t10 + mova [rsp+gprsize*2+16*1 ], m4 ;-out11 + mova m4, [rsp+gprsize*2+16*2] ;t14 + mova [rsp+gprsize*2+16*2 ], m0 ;out12 + psubsw m0, m3, m4 ;t14a + paddsw m3, m4 ;t10a + psubsw m5, m1, m2 ;t15a + paddsw m1, m2 ;t11a + ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15 + mova m2, [rsp+gprsize*2+16*4] ;t8 + mova m4, [rsp+gprsize*2+16*5] ;t9 + mova [rsp+gprsize*2+16*4], m3 ;t10a + mova [rsp+gprsize*2+16*5], m1 ;t11a + mova m3, [rsp+gprsize*2+16*8] ;t12 + mova m1, [rsp+gprsize*2+16*9] ;t13 + mova [rsp+gprsize*2+16*8], m5 ;t14 + mova [rsp+gprsize*2+16*9], m0 ;t15 + psubsw m5, m2, m3 ;t12a + paddsw m2, m3 ;t8a + psubsw m0, m4, m1 ;t13a + paddsw m4, m1 ;t9a + ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12 + mova m6, [rsp+gprsize*2+16*4] ;t10a + mova m1, [rsp+gprsize*2+16*5] ;t11a + psubsw m3, m2, m6 ;t10 + paddsw m2, m6 ;-out1 + paddsw m6, m4, m1 ;out14 + psubsw m4, m1 ;t11 + mova [rsp+gprsize*2+16*14], m4 + mova [rsp+gprsize*2+16* 4], m2 ;-out1 + mova m4, [rsp+gprsize*2+16*8] ;t14 + mova m2, [rsp+gprsize*2+16*9] ;t15 + mova [rsp+gprsize*2+16* 9], m3 ;out6 + psubsw m3, m0, m4 ;t14a + paddsw m0, m4 ;out2 + psubsw m4, m5, m2 ;t15a + paddsw m5, m2 ;-out13 + mova [rsp+gprsize*2+16* 5], m0 ;out2 + ret +ALIGN function_align +.main_pass1_end: + mova m0, [rsp+gprsize*2+16*14] + mova [rsp+gprsize*2+16*14], m5 + mova [rsp+gprsize*2+16*15], m6 + mova m5, [o(pw_2896_2896)] + mova m6, [o(pw_2896_m2896)] + mova m7, [o(pd_2048)] + punpcklwd m2, m3, m4 + punpckhwd m3, m4 + pmaddwd m4, m5, m2 + pmaddwd m2, m6 + pmaddwd m1, m5, m3 + pmaddwd m3, m6 + REPX {paddd x, m7}, m4, m2, m1, m3 + REPX {psrad x, 12}, m4, m1, m2, m3 + packssdw m4, m1 ;-out5 + packssdw m2, m3 ;out10 + mova [rsp+gprsize*2+16* 8], m4 + mova m3, [rsp+gprsize*2+16* 9] + punpcklwd m1, m3, m0 + punpckhwd m3, m0 + pmaddwd m0, m5, m1 + pmaddwd m1, m6 + pmaddwd m4, m5, m3 + pmaddwd m3, m6 + REPX {paddd x, m7}, m0, m1, m4, m3 + REPX {psrad x, 12}, m0, m4, m1, m3 + packssdw m0, m4 ;out6 + packssdw m1, m3 ;-out9 + mova [rsp+gprsize*2+16* 9], m0 + mova m0, [rsp+gprsize*2+16* 7] + mova m4, [rsp+gprsize*2+16*12] + punpcklwd m3, m0, m4 + punpckhwd m0, m4 + pmaddwd m4, m5, m3 + pmaddwd m3, m6 + pmaddwd m5, m0 + pmaddwd m0, m6 + REPX {paddd x, m7}, m4, m3, m5, m0 + REPX {psrad x, 12}, m4, m5, m3, m0 + packssdw m4, m5 ;out4 + packssdw m3, m0 ;-out11 + mova [rsp+gprsize*2+16* 7], m4 + mova m4, [rsp+gprsize*2+16*10] + mova m5, [rsp+gprsize*2+16*11] + punpcklwd m0, m4, m5 + punpckhwd m4, m5 + pmaddwd m5, m0, [o(pw_2896_2896)] + pmaddwd m0, m6 + pmaddwd m6, m4 + pmaddwd m4, [o(pw_2896_2896)] + REPX {paddd x, m7}, m5, m0, m6, m4 + REPX {psrad x, 12}, m0, m6, m5, m4 + packssdw m0, m6 ;out8 + packssdw m5, m4 ;-out7 + mova [rsp+gprsize*2+16*10], m5 + mova m4, [rsp+gprsize*2+16* 2] ;out12 + mova m5, [rsp+gprsize*2+16*14] ;-out13 + mova m6, [rsp+gprsize*2+16*15] ;out14 + ret +ALIGN function_align +cglobal_label .main_pass2_end + mova m7, [o(pw_2896x8)] + mova m1, [rsp+gprsize*2+16* 9] + mova m2, [rsp+gprsize*2+16*14] + paddsw m0, m1, m2 + psubsw m1, m2 + pmulhrsw m0, m7 ;out6 + pmulhrsw m1, m7 ;-out9 + mova [rsp+gprsize*2+16* 9], m0 + psubsw m2, m3, m4 + paddsw m3, m4 + pmulhrsw m2, m7 ;out10 + pmulhrsw m3, m7 ;-out5 + mova [rsp+gprsize*2+16* 8], m3 + mova m3, [rsp+gprsize*2+16* 7] + mova m4, [rsp+gprsize*2+16*12] + paddsw m0, m3, m4 + psubsw m3, m4 + pmulhrsw m0, m7 ;out4 + pmulhrsw m3, m7 ;-out11 + mova [rsp+gprsize*2+16* 7], m0 + mova m0, [rsp+gprsize*2+16*10] + paddsw m4, m0, [rsp+gprsize*2+16*11] + psubsw m0, [rsp+gprsize*2+16*11] + pmulhrsw m4, m7 ;-out7 + pmulhrsw m0, m7 ;out8 + mova [rsp+gprsize*2+16*10], m4 + mova m4, [rsp+gprsize*2+16*2 ] ;out12 + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mova m7, [o(pw_2896x8)] + pmulhrsw m0, m7, [coeffq+16*0 ] + pmulhrsw m1, m7, [coeffq+16*1 ] + pmulhrsw m2, m7, [coeffq+16*14] + pmulhrsw m3, m7, [coeffq+16*15] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + pmulhrsw m0, m7, [coeffq+16*6 ] + pmulhrsw m1, m7, [coeffq+16*7 ] + pmulhrsw m2, m7, [coeffq+16*8 ] + pmulhrsw m3, m7, [coeffq+16*9 ] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + pmulhrsw m0, m7, [coeffq+16*2 ] + pmulhrsw m1, m7, [coeffq+16*3 ] + pmulhrsw m2, m7, [coeffq+16*4 ] + pmulhrsw m3, m7, [coeffq+16*5 ] + pmulhrsw m4, m7, [coeffq+16*10] + pmulhrsw m5, m7, [coeffq+16*11] + pmulhrsw m6, m7, [coeffq+16*12] + pmulhrsw m7, [coeffq+16*13] + + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 32 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + jmp m(iflipadst_8x8_internal_8bpc).pass1_end + +.pass2: + lea tx2q, [o(.end)] + lea r3, [dstq+8] + jmp m(iflipadst_8x8_internal_8bpc).pass2_main + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iflipadst_8x8_internal_8bpc).pass2_main + + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + add coeffq, 16*16 + mova m4, [coeffq-16*7] + mova m5, [coeffq-16*5] + mova m6, [coeffq-16*3] + mova m7, [coeffq-16*1] + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + +.pass1: + mova m0, [o(pw_2896x8)] + mova m2, [o(pw_1697x16)] + mova m3, [o(pw_16384)] + sub coeffq, 8*16 + REPX {pmulhrsw x, m0}, m4, m5, m6, m7 + pmulhrsw m1, m2, m4 + pmulhrsw m1, m3 + paddsw m1, m4 ; 1 + pmulhrsw m4, m2, m5 + pmulhrsw m4, m3 + paddsw m4, m5 ; 3 + pmulhrsw m5, m2, m6 + pmulhrsw m5, m3 + paddsw m5, m6 ; 5 + pmulhrsw m6, m2, m7 + pmulhrsw m6, m3 + paddsw m7, m6 ; 7 + pmulhrsw m6, m0, [coeffq+16*6] + mova [rsp+gprsize+16*0], m4 + pmulhrsw m4, m2, m6 + pmulhrsw m4, m3 + paddsw m6, m4 ; 6 + pmulhrsw m4, m0, [coeffq+16*4] + mova [rsp+gprsize+16*1], m6 + pmulhrsw m6, m2, m4 + pmulhrsw m6, m3 + paddsw m4, m6 ; 4 + pmulhrsw m6, m0, [coeffq+16*2] + pmulhrsw m0, [coeffq+16*0] + pmulhrsw m2, m6 + pmulhrsw m2, m3 + paddsw m2, m6 ; 2 + pmulhrsw m6, m0, [o(pw_1697x16)] + pmulhrsw m6, m3 + mova m3, [rsp+gprsize+16*0] + paddsw m0, m6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass1_end: + mova [coeffq+16*1], m4 + mova [coeffq+16*3], m5 + mova [coeffq+16*5], m6 + mova [coeffq+16*7], m7 + mova m4, [coeffq-16*7] + mova m5, [coeffq-16*5] + mova m6, [coeffq-16*3] + mova m7, [coeffq-16*1] + mova [coeffq-16*7], m0 + mova [coeffq-16*5], m1 + mova [coeffq-16*3], m2 + mova [coeffq-16*1], m3 + mov tx2q, r3 + jmp .pass1 + +.pass2: + lea tx2q, [o(.end)] + lea r3, [dstq+8] + jmp m(iidentity_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + mov dstq, r3 + jmp m(iidentity_8x8_internal_8bpc).end + + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16, 8, 16*16 +%ifidn %1_%2, dct_dct + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r2d, 8 + lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly +.end: + RET +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, identity + +cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*1, 64 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*3, 64 + call m(idct_16x8_internal_8bpc).main + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*17, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(.pass1_end2)] + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_8192)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass2: + lea tx2q, [o(.end)] + jmp m(idct_8x16_internal_8bpc).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.end1)] + mov dstq, r3 + lea r3, [dstq+8] + jmp m(idct_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + mov dstq, r3 + + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*4 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*12] + mova m4, [coeffq+16*1 ] + mova m5, [coeffq+16*5 ] + mova m6, [coeffq+16*9 ] + mova m7, [coeffq+16*13] + lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] + jmp m(idct_8x16_internal_8bpc).pass2_main + + +%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0 + mova m0, [coeffq+16*1 ] + mova m1, [coeffq+16*3 ] + mova m2, [coeffq+16*29] + mova m3, [coeffq+16*31] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + mova m0, [coeffq+16*13] + mova m1, [coeffq+16*15] + mova m2, [coeffq+16*17] + mova m3, [coeffq+16*19] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + mova m0, [coeffq+16*5 ] + mova m1, [coeffq+16*7 ] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*11] + mova m4, [coeffq+16*21] + mova m5, [coeffq+16*23] + mova m6, [coeffq+16*25] + mova m7, [coeffq+16*27] +%endmacro + +%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0 + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*2 ] + mova m2, [coeffq+16*28] + mova m3, [coeffq+16*30] + mova [rsp+gprsize+16*7], m0 + mova [rsp+gprsize+16*8], m1 + mova [rsp+gprsize+16*9], m2 + mova [rsp+gprsize+32*5], m3 + mova m0, [coeffq+16*12] + mova m1, [coeffq+16*14] + mova m2, [coeffq+16*16] + mova m3, [coeffq+16*18] + mova [rsp+gprsize+16*3], m2 + mova [rsp+gprsize+16*4], m3 + mova [rsp+gprsize+16*5], m0 + mova [rsp+gprsize+16*6], m1 + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m4, [coeffq+16*20] + mova m5, [coeffq+16*22] + mova m6, [coeffq+16*24] + mova m7, [coeffq+16*26] +%endmacro + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + ITX_16X16_ADST_LOAD_ODD_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*17, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*1, 32 + ITX_16X16_ADST_LOAD_EVEN_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + lea tx2q, [o(.pass1_end2)] + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_8192)] + jmp m(iadst_8x8_internal_8bpc).pass1_end1 + +.pass2: + lea tx2q, [o(.end)] + jmp m(iadst_8x16_internal_8bpc).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.end1)] + mov dstq, r3 + lea r3, [dstq+8] + jmp m(iadst_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + mov dstq, r3 + + mova m4, [coeffq+16*0 ] + mova m5, [coeffq+16*2 ] + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m6, [coeffq+16*12] + mova m7, [coeffq+16*14] + mova [rsp+gprsize+16*7], m4 + mova [rsp+gprsize+16*8], m5 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] + jmp m(iadst_8x16_internal_8bpc).pass2_main + + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + ITX_16X16_ADST_LOAD_ODD_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+16*1, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+16*17, 32 + ITX_16X16_ADST_LOAD_EVEN_COEFS + call m(iadst_16x8_internal_8bpc).main + call m(iadst_16x8_internal_8bpc).main_pass1_end + + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end2)] + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS coeffq+16* 0, 32 + mova [rsp+gprsize+16*0], m7 + mov tx2q, r3 + mova m7, [o(pw_m8192)] + jmp m(iflipadst_8x8_internal_8bpc).pass1_end1 + +.pass2: + lea tx2q, [o(.end)] + lea r3, [dstq+8] + jmp m(iflipadst_8x16_internal_8bpc).pass2_pre + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.end1)] + lea dstq, [dstq+strideq*2] + jmp m(iflipadst_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + + mova m4, [coeffq+16*0 ] + mova m5, [coeffq+16*2 ] + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*6 ] + mova m2, [coeffq+16*8 ] + mova m3, [coeffq+16*10] + mova m6, [coeffq+16*12] + mova m7, [coeffq+16*14] + mova [rsp+gprsize+16*7], m4 + mova [rsp+gprsize+16*8], m5 + mova [rsp+gprsize+16*5], m6 + mova [rsp+gprsize+16*6], m7 + + lea tx2q, [o(.end2)] + mov dstq, r3 + jmp m(iflipadst_8x16_internal_8bpc).pass2_main + +.end2: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + lea dstq, [dstq+strideq*2] + jmp m(iflipadst_8x8_internal_8bpc).end + + +%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16 + pmulhrsw m%2, m%3, m%1 + psraw m%2, 1 + pavgw m%1, m%2 +%endmacro + +INV_TXFM_16X16_FN identity, dct +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + add coeffq, 16*17 + mov r3, tx2q + lea tx2q, [o(.pass1_end)] + +.pass1: + mova m6, [o(pw_1697x16)] + mova m7, [coeffq+32*6] + mova m0, [coeffq+32*0] + mova m1, [coeffq+32*1] + mova m2, [coeffq+32*2] + mova m3, [coeffq+32*3] + mova m4, [coeffq+32*4] + REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4 + mova m5, [coeffq+32*5] + mova [rsp+gprsize+16*1], m7 + IDTX16B 5, 7, 6 + mova m7, [coeffq+32*7] + IDTX16B 7, 6, 6 + jmp m(idct_8x8_internal_8bpc).pass1_end3 + +.pass1_end: + SAVE_8ROWS coeffq, 32 + sub coeffq, 16 + lea tx2q, [o(.pass1_end1)] + jmp .pass1 + +.pass1_end1: + SAVE_8ROWS coeffq, 32 + sub coeffq, 15*16 + lea tx2q, [o(.pass1_end2)] + jmp .pass1 + +.pass1_end2: + SAVE_8ROWS coeffq, 32 + sub coeffq, 16 + mov tx2q, r3 + jmp .pass1 + +.pass2: + lea r3, [dstq+8] + lea tx2q, [o(.end1)] + +.end: + mova [rsp+gprsize+16*0], m7 + mova [rsp+gprsize+16*1], m4 + mova m7, [o(pw_1697x16)] + REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3 + mova m4, [o(pw_2048)] + pmulhrsw m5, m4 + pmulhrsw m6, m4 + mova [rsp+gprsize+16*2], m5 + mova m5, [rsp+gprsize+16*1] + mova [rsp+gprsize+16*1], m6 + IDTX16 5, 6, 7 + mova m6, [rsp+gprsize+16*0] + IDTX16 6, 7, 7 + REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6 + pmulhrsw m4, m5 + mova [rsp+gprsize+16*0], m6 + jmp m(idct_8x8_internal_8bpc).end3 + +.end1: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(.end2)] + lea dstq, [dstq+strideq*2] + jmp .end + +.end2: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 32*8 + LOAD_8ROWS coeffq, 32 + lea tx2q, [o(.end3)] + mov dstq, r3 + jmp .end + +.end3: + LOAD_8ROWS coeffq+16*1, 32 + lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] + lea dstq, [dstq+strideq*2] + jmp .end + + +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_8x32_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + pmulhrsw m0, m2 + psrlw m2, 2 ;pw_2048 + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + mov r3d, 8 + lea tx2q, [o(.end)] + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop + +.end: + RET + + + +cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + cmp eobd, 106 + jle .fast + + LOAD_8ROWS coeffq+16*3, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1: + mova [rsp+gprsize+16*9 ], m0 ;in24 + mova [rsp+gprsize+16*10], m4 ;in28 + mova [rsp+gprsize+16*17], m2 ;in26 + mova [rsp+gprsize+16*18], m6 ;in30 + mova [rsp+gprsize+16*31], m1 ;in25 + mova [rsp+gprsize+16*30], m3 ;in27 + mova [rsp+gprsize+16*27], m5 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_1: + mova [rsp+gprsize+16*7 ], m0 ;in16 + mova [rsp+gprsize+16*8 ], m4 ;in20 + mova [rsp+gprsize+16*15], m2 ;in18 + mova [rsp+gprsize+16*16], m6 ;in22 + mova [rsp+gprsize+16*33], m1 ;in17 + mova [rsp+gprsize+16*28], m3 ;in19 + mova [rsp+gprsize+16*29], m5 ;in21 + mova [rsp+gprsize+16*32], m7 ;in23 + +.fast: + LOAD_8ROWS coeffq+16*1, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + mova [rsp+gprsize+16*5 ], m0 ;in8 + mova [rsp+gprsize+16*6 ], m4 ;in12 + mova [rsp+gprsize+16*13], m2 ;in10 + mova [rsp+gprsize+16*14], m6 ;in14 + mova [rsp+gprsize+16*21], m1 ;in9 + mova [rsp+gprsize+16*24], m3 ;in11 + mova [rsp+gprsize+16*25], m5 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + mova [rsp+gprsize+16*11], m2 ;in2 + mova [rsp+gprsize+16*12], m6 ;in6 + mova [rsp+gprsize+16*19], m1 ;in1 + mova [rsp+gprsize+16*26], m3 ;in3 + mova [rsp+gprsize+16*23], m5 ;in5 + mova [rsp+gprsize+16*22], m7 ;in7 + mova m1, m4 ;in4 + mova m2, [rsp+gprsize+16*5 ] ;in8 + mova m3, [rsp+gprsize+16*6 ] ;in12 + + cmp eobd, 106 + jg .full + + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + mova m0, [rsp+gprsize+16*11] + mova m1, [rsp+gprsize+16*12] + mova m2, [rsp+gprsize+16*13] + mova m3, [rsp+gprsize+16*14] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call .main_fast + jmp .pass2 + +.full: + mova m4, [rsp+gprsize+16*7 ] ;in16 + mova m5, [rsp+gprsize+16*8 ] ;in20 + mova m6, [rsp+gprsize+16*9 ] ;in24 + mova m7, [rsp+gprsize+16*10] ;in28 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + LOAD_8ROWS rsp+gprsize+16*11, 16 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + call .main + +.pass2: + lea r3, [o(.end6)] + +.end: + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(.end2)] + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, \ + 24, 25, 26, 27, 28, 29, 30, 31 + + jmp tx2q + +.end2: + lea tx2q, [o(.end3)] + jmp m(idct_8x8_internal_8bpc).end + +.end3: + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + lea tx2q, [o(.end4)] + jmp m(idct_8x8_internal_8bpc).end + +.end4: + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + lea tx2q, [o(.end5)] + jmp m(idct_8x8_internal_8bpc).end + +.end5: + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0 ], m7 + lea dstq, [dstq+strideq*2] + mov tx2q, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end6: + ret + +ALIGN function_align +cglobal_label .main_veryfast + mova m0, [rsp+gprsize*2+16*19] ;in1 + pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31 + pmulhrsw m0, [o(pw_201x8)] ;t16,t17 + mova m7, [o(pd_2048)] + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*34], m3 ;t31 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*20], m3 ;t17a + mova [rsp+gprsize*2+16*33], m0 ;t30a + mova m1, [rsp+gprsize*2+16*22] ;in7 + pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29 + pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19 + mova [rsp+gprsize*2+16*22], m1 ;t19 + mova [rsp+gprsize*2+16*31], m2 ;t28 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m2 ;t18a + mova [rsp+gprsize*2+16*32], m1 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27 + pmulhrsw m0, [o(pw_995x8)] ;t20, t21 + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*30], m3 ;t27 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*24], m3 ;t21a + mova [rsp+gprsize*2+16*29], m0 ;t26a + mova m2, [rsp+gprsize*2+16*26] ;in3 + pxor m0, m0 + mova m3, m0 + pmulhrsw m1, m2, [o(pw_4052x8)] + pmulhrsw m2, [o(pw_m601x8)] + jmp .main2 + +ALIGN function_align +cglobal_label .main_fast ;bottom half is zero + mova m0, [rsp+gprsize*2+16*19] ;in1 + mova m1, [rsp+gprsize*2+16*20] ;in15 + pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a + pmulhrsw m0, [o(pw_201x8)] ;t16a + pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a + pmulhrsw m1, [o(pw_m2751x8)] ;t17a + mova m7, [o(pd_2048)] + psubsw m4, m0, m1 ;t17 + paddsw m0, m1 ;t16 + psubsw m5, m3, m2 ;t30 + paddsw m3, m2 ;t31 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*20], m5 ;t17a + mova [rsp+gprsize*2+16*33], m4 ;t30a + mova [rsp+gprsize*2+16*34], m3 ;t31 + mova m0, [rsp+gprsize*2+16*21] ;in9 + mova m1, [rsp+gprsize*2+16*22] ;in7 + pmulhrsw m3, m0, [o(pw_3703x8)] + pmulhrsw m0, [o(pw_1751x8)] + pmulhrsw m2, m1, [o(pw_3857x8)] + pmulhrsw m1, [o(pw_m1380x8)] + psubsw m4, m1, m0 ;t18 + paddsw m0, m1 ;t19 + psubsw m5, m2, m3 ;t29 + paddsw m3, m2 ;t28 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*22], m0 ;t19 + mova [rsp+gprsize*2+16*31], m3 ;t28 + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + mova m1, [rsp+gprsize*2+16*24] ;in11 + pmulhrsw m3, m0, [o(pw_3973x8)] + pmulhrsw m0, [o(pw_995x8)] + pmulhrsw m2, m1, [o(pw_3513x8)] + pmulhrsw m1, [o(pw_m2106x8)] + psubsw m4, m0, m1 ;t21 + paddsw m0, m1 ;t20 + psubsw m5, m3, m2 ;t26 + paddsw m3, m2 ;t27 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m4 ;t26a + mova [rsp+gprsize*2+16*30], m3 ;t27 + mova m0, [rsp+gprsize*2+16*25] ;in13 + mova m2, [rsp+gprsize*2+16*26] ;in3 + pmulhrsw m3, m0, [o(pw_3290x8)] + pmulhrsw m0, [o(pw_2440x8)] + pmulhrsw m1, m2, [o(pw_4052x8)] + pmulhrsw m2, [o(pw_m601x8)] + jmp .main2 + +ALIGN function_align +cglobal_label .main + mova m7, [o(pd_2048)] + mova m0, [rsp+gprsize*2+16*19] ;in1 + mova m1, [rsp+gprsize*2+16*20] ;in15 + mova m2, [rsp+gprsize*2+16*33] ;in17 + mova m3, [rsp+gprsize*2+16*34] ;in31 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a + psubsw m4, m0, m2 ;t17 + paddsw m0, m2 ;t16 + psubsw m5, m3, m1 ;t30 + paddsw m3, m1 ;t31 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a + mova [rsp+gprsize*2+16*19], m0 ;t16 + mova [rsp+gprsize*2+16*20], m5 ;t17a + mova [rsp+gprsize*2+16*33], m4 ;t30a + mova [rsp+gprsize*2+16*34], m3 ;t31 + mova m0, [rsp+gprsize*2+16*21] ;in9 + mova m1, [rsp+gprsize*2+16*22] ;in7 + mova m2, [rsp+gprsize*2+16*31] ;in25 + mova m3, [rsp+gprsize*2+16*32] ;in23 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a + psubsw m4, m2, m0 ;t18 + paddsw m0, m2 ;t19 + psubsw m5, m1, m3 ;t29 + paddsw m3, m1 ;t28 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*22], m0 ;t19 + mova [rsp+gprsize*2+16*31], m3 ;t28 + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova m0, [rsp+gprsize*2+16*23] ;in5 + mova m1, [rsp+gprsize*2+16*24] ;in11 + mova m2, [rsp+gprsize*2+16*29] ;in21 + mova m3, [rsp+gprsize*2+16*30] ;in27 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a + psubsw m4, m0, m2 ;t21 + paddsw m0, m2 ;t20 + psubsw m5, m3, m1 ;t26 + paddsw m3, m1 ;t27 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a + mova [rsp+gprsize*2+16*23], m0 ;t20 + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m4 ;t26a + mova [rsp+gprsize*2+16*30], m3 ;t27 + mova m0, [rsp+gprsize*2+16*25] ;in13 + mova m1, [rsp+gprsize*2+16*26] ;in3 + mova m2, [rsp+gprsize*2+16*27] ;in29 + mova m3, [rsp+gprsize*2+16*28] ;in19 + ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a + ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a + +.main2: + psubsw m4, m2, m0 ;t22 + paddsw m0, m2 ;t23 + psubsw m5, m1, m3 ;t25 + paddsw m3, m1 ;t24 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a + mova m2, [rsp+gprsize*2+16*24] ;t21a + psubsw m1, m5, m2 ;t21 + paddsw m5, m2 ;t22 + mova [rsp+gprsize*2+16*25], m5 ;t22 + mova m2, [rsp+gprsize*2+16*29] ;t26a + psubsw m5, m4, m2 ;t26 + paddsw m4, m2 ;t25 + mova [rsp+gprsize*2+16*28], m4 ;t25 + ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a + mova [rsp+gprsize*2+16*24], m5 ;t21a + mova [rsp+gprsize*2+16*29], m1 ;t26a + + mova m1, [rsp+gprsize*2+16*23] ;t20 + mova m5, [rsp+gprsize*2+16*30] ;t27 + psubsw m2, m0, m1 ;t20a + paddsw m0, m1 ;t23a + psubsw m6, m3, m5 ;t27a + paddsw m3, m5 ;t24a + ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27 + mova [rsp+gprsize*2+16*26], m0 ;t23a + mova [rsp+gprsize*2+16*27], m3 ;t24a + mova [rsp+gprsize*2+16*30], m2 ;t27 + + mova m0, [rsp+gprsize*2+16*20] ;t17a + mova m1, [rsp+gprsize*2+16*21] ;t18a + mova m2, [rsp+gprsize*2+16*32] ;t29a + mova m3, [rsp+gprsize*2+16*33] ;t30a + psubsw m4, m0, m1 ;t18 + paddsw m0, m1 ;t17 + psubsw m5, m3, m2 ;t29 + paddsw m3, m2 ;t30 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a + mova [rsp+gprsize*2+16*20], m0 ;t17 + mova [rsp+gprsize*2+16*21], m5 ;t18a + mova [rsp+gprsize*2+16*32], m4 ;t29a + mova [rsp+gprsize*2+16*33], m3 ;t30 + mova m0, [rsp+gprsize*2+16*19] ;t16 + mova m1, [rsp+gprsize*2+16*22] ;t19 + mova m2, [rsp+gprsize*2+16*31] ;t28 + mova m3, [rsp+gprsize*2+16*34] ;t31 + psubsw m4, m0, m1 ;t19a + paddsw m0, m1 ;t16a + psubsw m5, m3, m2 ;t28a + paddsw m3, m2 ;t31a + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28 + mova m2, [rsp+gprsize*2+16*15] ;tmp12 + psubsw m1, m5, m6 ;t20a + paddsw m5, m6 ;t19a + psubsw m6, m2, m5 ;out19 + paddsw m2, m5 ;out12 + mova m5, [rsp+gprsize*2+16*30] ;t27 + mova [rsp+gprsize*2+16*22], m6 ;out19 + mova [rsp+gprsize*2+16*15], m2 ;out12 + psubsw m6, m4, m5 ;t27a + paddsw m4, m5 ;t28a + ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27 + mova m2, [rsp+gprsize*2+16*6 ] ;tmp3 + psubsw m5, m2, m4 ;out28 + paddsw m2, m4 ;out3 + mova m4, [rsp+gprsize*2+16*14] ;tmp11 + mova [rsp+gprsize*2+16*31], m5 ;out28 + mova [rsp+gprsize*2+16*6 ], m2 ;out3 + psubsw m5, m4, m6 ;out20 + paddsw m4, m6 ;out11 + mova m2, [rsp+gprsize*2+16*7 ] ;tmp4 + mova [rsp+gprsize*2+16*23], m5 ;out20 + mova [rsp+gprsize*2+16*14], m4 ;out11 + psubsw m5, m2, m1 ;out27 + paddsw m2, m1 ;out4 + mova m1, [rsp+gprsize*2+16*26] ;t23a + mova m4, [rsp+gprsize*2+16*27] ;t24a + mova [rsp+gprsize*2+16*30], m5 ;out27 + mova [rsp+gprsize*2+16*7 ], m2 ;out4 + psubsw m5, m0, m1 ;t23 + paddsw m0, m1 ;t16 + psubsw m2, m3, m4 ;t24 + paddsw m3, m4 ;t31 + ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a + mova m6, [rsp+gprsize*2+16*18] ;tmp15 + psubsw m4, m6, m0 ;out16 + paddsw m6, m0 ;out15 + mova m0, [rsp+gprsize*2+16*3 ] ;tmp0 + mova m1, [rsp+gprsize*2+16*11] ;tmp8 + mova [rsp+gprsize*2+16*18], m6 ;out15 + mova [rsp+gprsize*2+16*19], m4 ;out16 + psubsw m6, m0, m3 ;out31 + paddsw m0, m3 ;out0 + psubsw m4, m1, m2 ;out23 + paddsw m1, m2 ;out8 + mova m3, [rsp+gprsize*2+16*10] ;tmp7 + mova [rsp+gprsize*2+16*34], m6 ;out31 + mova [rsp+gprsize*2+16*11], m1 ;out8 + mova [rsp+gprsize*2+16*26], m4 ;out23 + paddsw m6, m3, m5 ;out7 + psubsw m3, m5 ;out24 + mova m1, [rsp+gprsize*2+16*20] ;t17 + mova m5, [rsp+gprsize*2+16*25] ;t22 + mova m2, [rsp+gprsize*2+16*17] ;tmp14 + mova [rsp+gprsize*2+16*27], m3 ;out24 + psubsw m4, m1, m5 ;t22a + paddsw m1, m5 ;t17a + psubsw m3, m2, m1 ;out17 + paddsw m2, m1 ;out14 + mova m5, [rsp+gprsize*2+16*28] ;t25 + mova m1, [rsp+gprsize*2+16*33] ;t30 + mova [rsp+gprsize*2+16*17], m2 ;out14 + mova [rsp+gprsize*2+16*20], m3 ;out17 + psubsw m2, m1, m5 ;t25a + paddsw m1, m5 ;t30a + ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25 + mova m5, [rsp+gprsize*2+16*4 ] ;tmp1 + psubsw m3, m5, m1 ;out30 + paddsw m5, m1 ;out1 + mova m1, [rsp+gprsize*2+16*12] ;tmp9 + mova [rsp+gprsize*2+16*33], m3 ;out30 + mova [rsp+gprsize*2+16*4 ], m5 ;out1 + psubsw m3, m1, m2 ;out22 + paddsw m1, m2 ;out9 + mova m5, [rsp+gprsize*2+16*9 ] ;tmp6 + mova [rsp+gprsize*2+16*25], m3 ;out22 + mova [rsp+gprsize*2+16*12], m1 ;out9 + psubsw m3, m5, m4 ;out25 + paddsw m5, m4 ;out6 + mova m4, [rsp+gprsize*2+16*21] ;t18a + mova m1, [rsp+gprsize*2+16*24] ;t21a + mova m2, [rsp+gprsize*2+16*16] ;tmp13 + mova [rsp+gprsize*2+16*28], m3 ;out25 + mova [rsp+gprsize*2+16*9 ], m5 ;out6 + paddsw m3, m4, m1 ;t18 + psubsw m4, m1 ;t21 + psubsw m5, m2, m3 ;out18 + paddsw m2, m3 ;out13 + mova m1, [rsp+gprsize*2+16*29] ;t26a + mova m3, [rsp+gprsize*2+16*32] ;t29a + mova [rsp+gprsize*2+16*21], m5 ;out18 + mova [rsp+gprsize*2+16*16], m2 ;out13 + psubsw m5, m3, m1 ;t26 + paddsw m3, m1 ;t29 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a + mova m2, [rsp+gprsize*2+16*5 ] ;tmp2 + psubsw m1, m2, m3 ;out29 + paddsw m2, m3 ;out2 + mova m3, [rsp+gprsize*2+16*13] ;tmp10 + mova [rsp+gprsize*2+16*32], m1 ;out29 + psubsw m7, m3, m5 ;out21 + paddsw m3, m5 ;out10 + mova m5, [rsp+gprsize*2+16*8 ] ;tmp5 + mova [rsp+gprsize*2+16*24], m7 ;out21 + mova [rsp+gprsize*2+16*13], m3 ;out10 + psubsw m1, m5, m4 ;out26 + paddsw m5, m4 ;out5 + mova m7, m6 ;out7 + mova m3, [rsp+gprsize*2+16*6 ] ;out3 + mova m4, [rsp+gprsize*2+16*7 ] ;out4 + mova [rsp+gprsize*2+16*29], m1 ;out26 + mova m6, [rsp+gprsize*2+16*9 ] ;out6 + mova m1, [rsp+gprsize*2+16*4 ] ;out1 + ret + + +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_32x8_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 8 + lea tx2q, [o(.end)] + +.body: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m5, m5 + +.loop: + mova m1, [dstq+16*0] + mova m3, [dstq+16*1] + punpckhbw m2, m1, m5 + punpcklbw m1, m5 + punpckhbw m4, m3, m5 + punpcklbw m3, m5 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb m1, m2 + packuswb m3, m4 + mova [dstq+16*0], m1 + mova [dstq+16*1], m3 + add dstq, strideq + dec r3d + jg .loop + jmp tx2q + +.end: + RET + + +cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*0, 64 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*2, 64 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+16*1, 32 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + cmp eobd, 106 + jg .full + call m(idct_8x32_internal_8bpc).main_fast + jmp .pass2 + +.full: + LOAD_8ROWS coeffq+16*17, 32 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + call m(idct_8x32_internal_8bpc).main + +.pass2: + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(.end)] + jmp m(idct_8x32_internal_8bpc).end1 + +.end: + mova m7, [o(pw_8192)] + lea tx2q, [o(.end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.end1: + lea r3, [dstq+8] + lea tx2q, [o(.end2)] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end2: + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.end3: + mov dstq, r3 + add r3, 8 + lea tx2q, [o(.end4)] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end4: + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.end5: + mov dstq, r3 + add r3, 8 + lea tx2q, [o(.end6)] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end6: + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0 ], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.end7: + mov dstq, r3 + lea tx2q, [o(.end8)] + jmp m(idct_8x8_internal_8bpc).pass2_main + +.end8: + ret + + +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + mov r5d, 4 + mov tx2d, 2 + cmp eobd, 107 + cmovns tx2d, r5d + mov r3d, tx2d +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] +.loop: + LOAD_8ROWS coeffq+16*0, 64 + paddsw m6, [o(pw_5)] + mova [rsp+16*1], m6 + mova m6, [o(pw_5)] + REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7 + call m(idct_8x8_internal_8bpc).pass1_end3 + REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 + mova [rsp+16*2], m5 + mova [rsp+16*1], m6 + mova [rsp+16*0], m7 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + pxor m7, m7 + REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + add coeffq, 16 + dec r3d + jg .loop + RET + +cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + mov r5d, 4 + mov tx2d, 2 + cmp eobd, 107 + cmovns tx2d, r5d + mov r3d, tx2d +%if ARCH_X86_32 + LEA r5, $$ +%endif + +.loop: + LOAD_8ROWS coeffq+16*0, 16 + pmulhrsw m6, [o(pw_4096)] + mova [rsp+16*1], m6 + mova m6, [o(pw_4096)] + REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 + lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] + call m(idct_8x8_internal_8bpc).pass1_end3 + + mov [rsp+16*3], dstq + mova [rsp+16*2], m5 + mova [rsp+16*1], m6 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] + call m(idct_8x8_internal_8bpc).end3 + + add coeffq, 16*8 + mov dstq, [rsp+16*3] + lea dstq, [dstq+8] + dec r3d + jg .loop + jnc .loop + RET + + +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_16x32_internal_8bpc) +.end: + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r2d, 16 + lea tx2q, [o(.end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly + + +cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + LOAD_8ROWS coeffq+16*1, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*5, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+16*33, 64 ;in8~in15 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + mova [coeffq+16*1 ], m0 ;in8 + mova [coeffq+16*5 ], m4 ;in12 + mova [rsp+gprsize+16*13], m2 ;in10 + mova [rsp+gprsize+16*14], m6 ;in14 + mova [rsp+gprsize+16*21], m1 ;in9 + mova [rsp+gprsize+16*24], m3 ;in11 + mova [rsp+gprsize+16*25], m5 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + LOAD_8ROWS coeffq+16*0, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*4, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+16*32, 64 ;in0~in7 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + mova [rsp+gprsize+16*11], m2 ;in2 + mova [rsp+gprsize+16*12], m6 ;in6 + mova [rsp+gprsize+16*19], m1 ;in1 + mova [rsp+gprsize+16*26], m3 ;in3 + mova [rsp+gprsize+16*23], m5 ;in5 + mova [rsp+gprsize+16*22], m7 ;in7 + + cmp eobd, 150 + jg .full + + mova m1, m4 ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*5 ] ;in12 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [rsp+gprsize+16*11] ;in2 + mova m1, [rsp+gprsize+16*12] ;in6 + mova m2, [rsp+gprsize+16*13] ;in10 + mova m3, [rsp+gprsize+16*14] ;in14 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + jmp .pass2 + +.full: + mova [coeffq+16*0 ], m0 ;in0 + mova [coeffq+16*4 ], m4 ;in4 + + LOAD_8ROWS coeffq+16*2, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*6, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+16*34, 64 ;in16~in23 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end5: + mova [coeffq+16*2 ], m0 ;in16 + mova [coeffq+16*6 ], m4 ;in20 + mova [rsp+gprsize+16*15], m2 ;in18 + mova [rsp+gprsize+16*16], m6 ;in22 + mova [rsp+gprsize+16*33], m1 ;in17 + mova [rsp+gprsize+16*28], m3 ;in19 + mova [rsp+gprsize+16*29], m5 ;in21 + mova [rsp+gprsize+16*32], m7 ;in23 + + LOAD_8ROWS coeffq+16*3, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+16*7, 128, 1 + call m(idct_16x8_internal_8bpc).main + lea tx2q, [o(.pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end6: + SAVE_8ROWS coeffq+16*35, 64 ;in24~in31 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end7: + mova [rsp+gprsize+16*17], m2 ;in26 + mova [rsp+gprsize+16*18], m6 ;in30 + mova [rsp+gprsize+16*31], m1 ;in25 + mova [rsp+gprsize+16*30], m3 ;in27 + mova [rsp+gprsize+16*27], m5 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + mova m6, m0 ;in24 + mova m7, m4 ;in28 + mova m0, [coeffq+16*0 ] ;in0 + mova m1, [coeffq+16*4 ] ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*5 ] ;in12 + mova m4, [coeffq+16*2 ] ;in16 + mova m5, [coeffq+16*6 ] ;in20 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3 , 16 + LOAD_8ROWS rsp+gprsize+16*11, 16 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main + +.pass2: + mov [rsp+gprsize*1+16*35], eobd + lea r3, [dstq+8] + mov [rsp+gprsize*2+16*35], r3 + lea r3, [o(.end)] + jmp m(idct_8x32_internal_8bpc).end + +.end: + mov dstq, [rsp+gprsize*2+16*35] + mov eobd, [rsp+gprsize*1+16*35] + add coeffq, 16*32 + + mova m0, [coeffq+16*4 ] ;in1 + mova m1, [coeffq+16*12] ;in3 + mova m2, [coeffq+16*20] ;in5 + mova m3, [coeffq+16*28] ;in7 + mova m4, [coeffq+16*5 ] ;in9 + mova m5, [coeffq+16*13] ;in11 + mova m6, [coeffq+16*21] ;in13 + mova m7, [coeffq+16*29] ;in15 + + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mova m0, [coeffq+16*0 ] ;in0 + mova m1, [coeffq+16*16] ;in4 + mova m2, [coeffq+16*1 ] ;in8 + mova m3, [coeffq+16*17] ;in12 + + cmp eobd, 150 + jg .full1 + + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] ;in2 + mova m1, [coeffq+16*24] ;in6 + mova m2, [coeffq+16*9 ] ;in10 + mova m3, [coeffq+16*25] ;in14 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + jmp m(idct_8x32_internal_8bpc).pass2 + +.full1: + mova m4, [coeffq+16*2 ] ;in16 + mova m5, [coeffq+16*18] ;in20 + mova m6, [coeffq+16*3 ] ;in24 + mova m7, [coeffq+16*19] ;in26 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] ;in2 + mova m1, [coeffq+16*24] ;in6 + mova m2, [coeffq+16*9 ] ;in10 + mova m3, [coeffq+16*25] ;in14 + mova m4, [coeffq+16*10] ;in18 + mova m5, [coeffq+16*26] ;in22 + mova m6, [coeffq+16*11] ;in26 + mova m7, [coeffq+16*27] ;in30 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*6 ] ;in17 + mova m1, [coeffq+16*14] ;in19 + mova m2, [coeffq+16*22] ;in21 + mova m3, [coeffq+16*30] ;in23 + mova m4, [coeffq+16*7 ] ;in25 + mova m5, [coeffq+16*15] ;in27 + mova m6, [coeffq+16*23] ;in29 + mova m7, [coeffq+16*31] ;in31 + + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + jmp m(idct_8x32_internal_8bpc).pass2 + + +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_32x16_internal_8bpc) + call m(idct_8x16_internal_8bpc).pass2 + + add coeffq, 16*16 + lea dstq, [r3+8] + LOAD_8ROWS rsp+16*11, 16 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 + + add coeffq, 16*16 + lea dstq, [r3+8] + LOAD_8ROWS rsp+16*19, 16 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 + + add coeffq, 16*16 + lea dstq, [r3+8] + LOAD_8ROWS rsp+16*27, 16 + mova [rsp+16*0], m7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end + call m(idct_8x16_internal_8bpc).pass2 + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r3d, 16 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body + + +cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + add coeffq, 16 + lea r3, [o(.pass1_end1)] +.pass1: + LOAD_8ROWS coeffq+16*0, 128, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+16*4, 128, 1 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+16*2, 64, 1 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + LOAD_8ROWS coeffq+16*34, 64, 1 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + call m(idct_8x32_internal_8bpc).main + +.pass1_end: + mova [rsp+gprsize+16*0 ], m7 + mov tx2q, r3 + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+16*0, 32 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+16*16, 32 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+16*32, 32 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0 ], m7 + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+16*48, 32 + + sub coeffq, 16 + lea r3, [o(.end)] + jmp .pass1 + +.end: + ret + + +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + mov r4d, eobd + cmp eobd, 43 ;if (eob > 43) + sbb r3d, r3d ; iteration_count++ + cmp r4d, 150 ;if (eob > 150) + sbb r3d, 0 ; iteration_count++ + cmp r4d, 278 ;if (eob > 278) + sbb r3d, -4 ; iteration_count++ + +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea r4, [dstq+8] + mov [rsp+16*3], r4 + mov [rsp+gprsize+16*3], r3d + mov [rsp+gprsize*2+16*3], coeffq + +.loop: + LOAD_8ROWS coeffq, 64, 1 + mova [rsp+16*1], m6 + pxor m6, m6 + REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 + mova [rsp+16*0], m2 + mova [rsp+16*1], m3 + mova [rsp+16*2], m4 + mova m3, [o(pw_1697x16)] + mova m4, [o(pw_16384)] + REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1 + mova m2, [o(pw_8192)] + REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1 + mova m2, [rsp+16*0] + mova [rsp+16*0], m7 + IDTX16 2, 7, 3, 4 + mova m7, [rsp+16*2] + mova [rsp+16*2], m5 + IDTX16 7, 5, 3, 4 + mova m5, [rsp+16*1] + mova [rsp+16*1], m6 + pmulhrsw m3, m5 + pmulhrsw m3, m4 + psrlw m4, 1 ; pw_8192 + paddsw m3, m5 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + pmulhrsw m4, m7 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + add coeffq, 16 + dec r3d + jg .loop + mov coeffq, [rsp+gprsize*2+16*3] + add coeffq, 64*8 + mov r3d, [rsp+gprsize+16*3] + xor dstq, dstq + mov [rsp+gprsize+16*3], dstq + mov dstq, [rsp+16*3] + test r3d, r3d + jnz .loop + RET + + +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 + mov r4d, 12 ;0100b + mov r5d, 136 ;1000 1000b + cmp eobd, 44 ;if (eob > 43) + cmovns r4d, r5d ; iteration_count+2 + cmp eobd, 151 ;if (eob > 150) + mov r3d, 34952 ;1000 1000 1000 1000b + cmovs r3d, r4d ; iteration_count += 4 + +%if ARCH_X86_32 + LEA r5, $$ +%endif + lea r4, [dstq+8] + mov [rsp+16*3], r4 + +.loop: + LOAD_8ROWS coeffq, 32, 1 + REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + mova [rsp+16*1], m6 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 + mova [rsp+16*1], m5 + mova [rsp+16*2], m6 + mova m6, [o(pw_1697x16)] + REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4 + pmulhrsw m7, [o(pw_2048)] + mova m5, [rsp+16*1] + mova [rsp+16*0], m7 + IDTX16 5, 7, 6 + mova m7, [rsp+16*2] + IDTX16 7, 6, 6 + mova m6, [o(pw_2048)] + REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 + mova [rsp+16*2], m5 + mova [rsp+16*1], m7 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + pxor m7, m7 + REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + +.loop_end: + add coeffq, 16 + shr r3d, 2 + jz .ret + test r3d, 2 + jnz .loop + mov r4d, r3d + and r4d, 1 + lea coeffq, [coeffq+r4*8+32*7] + mov dstq, [rsp+16*3] + lea r4, [dstq+8] + mov [rsp+16*3], r4 + jmp .loop + +.ret: + RET + + +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_32x32_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 32 + lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body + + +cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*35], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*35], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*1, 64*2 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov tx2d, [rsp+gprsize*1+16*35] + test tx2d, tx2d + jl .fast + +.full: + LOAD_8ROWS coeffq+64*0, 64*4 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*2, 64*4 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*17, 64*2 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + jmp .pass1_end + +.fast: + mova m0, [coeffq+256*0] + mova m1, [coeffq+256*1] + mova m2, [coeffq+256*2] + mova m3, [coeffq+256*3] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + mova m0, [coeffq+128*1] + mova m1, [coeffq+128*3] + mova m2, [coeffq+128*5] + mova m3, [coeffq+128*7] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + +.pass1_end: + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end4: + SAVE_8ROWS coeffq+64*24, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + + +.pass2: + mov coeffq, [rsp+gprsize*2+16*35] + mov r3d, 4 + lea tx2q, [o(.pass2_end)] + +.pass2_loop: + mov [rsp+gprsize*3+16*35], r3d + lea r3, [dstq+8] + mov [rsp+gprsize*2+16*35], r3 + + mova m0, [coeffq+16*4 ] + mova m1, [coeffq+16*12] + mova m2, [coeffq+16*20] + mova m3, [coeffq+16*28] + mova m4, [coeffq+16*5 ] + mova m5, [coeffq+16*13] + mova m6, [coeffq+16*21] + mova m7, [coeffq+16*29] + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov eobd, [rsp+gprsize*1+16*35] + test eobd, eobd + jl .fast1 + +.full1: + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*16] + mova m2, [coeffq+16*1 ] + mova m3, [coeffq+16*17] + mova m4, [coeffq+16*2 ] + mova m5, [coeffq+16*18] + mova m6, [coeffq+16*3 ] + mova m7, [coeffq+16*19] + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova m4, [coeffq+16*10] + mova m5, [coeffq+16*26] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*27] + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*6 ] + mova m1, [coeffq+16*14] + mova m2, [coeffq+16*22] + mova m3, [coeffq+16*30] + mova m4, [coeffq+16*7 ] + mova m5, [coeffq+16*15] + mova m6, [coeffq+16*23] + mova m7, [coeffq+16*31] + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + jmp tx2q + +.fast1: + mova m0, [coeffq+16*0 ] + mova m1, [coeffq+16*16] + mova m2, [coeffq+16*1 ] + mova m3, [coeffq+16*17] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + jmp tx2q + +.pass2_end: + lea r3, [o(.pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end + +.pass2_end1: + lea tx2q, [o(.pass2_end)] + add coeffq, 16*32 + mov dstq, [rsp+gprsize*2+16*35] + mov r3d, [rsp+gprsize*3+16*35] + dec r3d + jg .pass2_loop + + ret + + +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 + mov r4d, 2 + cmp eobd, 136 + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + lea r4, [dstq+8] + mov [rsp+gprsize*0+16*3], r4 + mov [rsp+gprsize*1+16*3], r3d + mov [rsp+gprsize*2+16*3], r3d + mov [rsp+gprsize*3+16*3], coeffq + +.loop: + LOAD_8ROWS coeffq, 64 + mova [rsp+16*1], m6 + lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] + call m(idct_8x8_internal_8bpc).pass1_end3 + pmulhrsw m7, [o(pw_8192)] + mova [rsp+16*0], m7 + mova m7, [o(pw_8192)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + mova [rsp+16*1], m6 + mova [rsp+16*2], m5 + call m(idct_8x8_internal_8bpc).end3 + lea dstq, [dstq+strideq*2] + + pxor m7, m7 + REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + + add coeffq, 16 + dec r3d + jg .loop + + mov r4d, [rsp+gprsize*2+16*3] + dec r4d + jle .ret + + mov dstq, [rsp+gprsize*0+16*3] + mov coeffq, [rsp+gprsize*3+16*3] + mov [rsp+gprsize*2+16*3], r4 + lea r3, [dstq+8] + add coeffq, 64*8 + mov [rsp+gprsize*0+16*3], r3 + mov r3d, [rsp+gprsize*1+16*3] + mov [rsp+gprsize*3+16*3], coeffq + jmp .loop + +.ret: + RET + + +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_16x64_internal_8bpc) +.end: + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r2d, 32 + lea tx2q, [o(.end)] + jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly + + +cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r4d, 2 + sub eobd, 151 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*0, 64*2 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*1, 64*2 + call m(idct_16x8_internal_8bpc).main + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + + mov coeffq, [rsp+gprsize*2+16*67] + mov r3d, 2 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(.end1)] + +.pass2_loop: + mov [rsp+gprsize*3+16*67], r3d + mov eobd, [rsp+gprsize*1+16*67] + + mova m0, [coeffq+16*4 ] ;in1 + mova m1, [coeffq+16*12] ;in3 + mova m2, [coeffq+16*20] ;in5 + mova m3, [coeffq+16*28] ;in7 + mova m4, [coeffq+16*5 ] ;in9 + mova m5, [coeffq+16*13] ;in11 + mova m6, [coeffq+16*21] ;in13 + mova m7, [coeffq+16*29] ;in15 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + pxor m4, m4 + mova m0, [coeffq+16*0] + mova m1, [coeffq+16*1] + + test eobd, eobd + jl .fast + +.full: + mova m2, [coeffq+16*2] + mova m3, [coeffq+16*3] + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + mova m0, [coeffq+16*16] + mova m1, [coeffq+16*17] + mova m2, [coeffq+16*18] + mova m3, [coeffq+16*19] + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova m4, [coeffq+16*10] + mova m5, [coeffq+16*26] + mova m6, [coeffq+16*11] + mova m7, [coeffq+16*27] + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + mova m0, [coeffq+16*6 ] ;in17 + mova m1, [coeffq+16*14] ;in19 + mova m2, [coeffq+16*22] ;in21 + mova m3, [coeffq+16*30] ;in23 + mova m4, [coeffq+16*7 ] ;in25 + mova m5, [coeffq+16*15] ;in27 + mova m6, [coeffq+16*23] ;in29 + mova m7, [coeffq+16*31] ;in31 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call .main + jmp .end + +.fast: + REPX {mova x, m4}, m2, m3, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + mova m0, [coeffq+16*16] + mova m1, [coeffq+16*17] + + REPX {mova x, m4}, m2, m3, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + mova m0, [coeffq+16*8 ] + mova m1, [coeffq+16*24] + mova m2, [coeffq+16*9 ] + mova m3, [coeffq+16*25] + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + + call m(idct_8x32_internal_8bpc).main_veryfast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + call .main_fast + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mov r3, r4 + jmp m(idct_8x32_internal_8bpc).end2 + +.end1: + LOAD_8ROWS rsp+gprsize+16*35, 16 + lea dstq, [dstq+strideq*2] + lea r3, [rsp+16*32+gprsize] + call .write + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, [rsp+gprsize*3+16*67] + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(.end1)] + + dec r3d + jg .pass2_loop + ret +.write: + mova [r3+16*0], m7 + mov r4, -16*32 + pxor m7, m7 + sub coeffq, r4 +.zero_loop: + mova [coeffq+r4+16*0], m7 + mova [coeffq+r4+16*1], m7 + add r4, 16*2 + jl .zero_loop + call .write_main2 + LOAD_8ROWS r3+16*11, 16 + call .write_main + LOAD_8ROWS r3+16*19, 16 + call .write_main + LOAD_8ROWS r3+16*27, 16 +.write_main: + mova [r3+16*0], m7 +.write_main2: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [r3+16*0] + mova [r3+16*2], m5 + mova [r3+16*1], m6 + mova [r3+16*0], m7 + WRITE_8X4 0, 1, 2, 3, 5, 6, 7 + lea dstq, [dstq+strideq*2] + WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7 + lea dstq, [dstq+strideq*2] + ret + + +ALIGN function_align +cglobal_label .main_fast + mova m0, [rsp+gprsize*2+16*35] ;in1 + pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63 + pmulhrsw m0, [o(pw_101x8)] ;t32,t33 + mova m7, [o(pd_2048)] + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*66], m3 ;t63 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a + mova [rsp+gprsize*2+16*36], m3 ;t33a + mova [rsp+gprsize*2+16*65], m0 ;t62a + + mova m1, [rsp+gprsize*2+16*37] ;in15 + pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61 + pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35 + mova [rsp+gprsize*2+16*38], m1 ;t35 + mova [rsp+gprsize*2+16*63], m2 ;t60 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a + mova [rsp+gprsize*2+16*37], m2 ;t34a + mova [rsp+gprsize*2+16*64], m1 ;t61a + + mova m0, [rsp+gprsize*2+16*39] ;in9 + pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59 + pmulhrsw m0, [o(pw_897x8)] ;t36,t37 + mova [rsp+gprsize*2+16*39], m0 ;t36 + mova [rsp+gprsize*2+16*62], m3 ;t59 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a + mova [rsp+gprsize*2+16*40], m3 ;t37a + mova [rsp+gprsize*2+16*61], m0 ;t58a + + mova m1, [rsp+gprsize*2+16*41] ;in7 + pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57 + pmulhrsw m1, [o(pw_m700x8)] ;t38,t39 + mova [rsp+gprsize*2+16*42], m1 ;t39 + mova [rsp+gprsize*2+16*59], m2 ;t56 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a + mova [rsp+gprsize*2+16*41], m2 ;t38a + mova [rsp+gprsize*2+16*60], m1 ;t57a + + mova m0, [rsp+gprsize*2+16*43] ;in5 + pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55 + pmulhrsw m0, [o(pw_501x8)] ;t40,t41 + mova [rsp+gprsize*2+16*43], m0 ;t40 + mova [rsp+gprsize*2+16*58], m3 ;t55 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a + mova [rsp+gprsize*2+16*44], m3 ;t41a + mova [rsp+gprsize*2+16*57], m0 ;t54a + + mova m1, [rsp+gprsize*2+16*45] ;in11 + pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53 + pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43 + mova [rsp+gprsize*2+16*46], m1 ;t43 + mova [rsp+gprsize*2+16*55], m2 ;t52 + ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a + mova [rsp+gprsize*2+16*45], m2 ;t42a + mova [rsp+gprsize*2+16*56], m1 ;t53a + + mova m0, [rsp+gprsize*2+16*47] ;in13 + pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51 + pmulhrsw m0, [o(pw_1285x8)] ;t44,t45 + mova m6, m0 + mova [rsp+gprsize*2+16*54], m3 ;t51 + ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a + mova [rsp+gprsize*2+16*48], m3 ;t45a + mova [rsp+gprsize*2+16*53], m0 ;t50a + + mova m0, [rsp+gprsize*2+16*49] ;in3 + pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49 + pmulhrsw m0, [o(pw_m301x8)] ;t46,t47 + mova m4, m3 + mova m5, m0 + + jmp .main2 + +ALIGN function_align +cglobal_label .main + mova m0, [rsp+gprsize*2+16*35] ;in1 + mova m1, [rsp+gprsize*2+16*65] ;in31 + pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a + pmulhrsw m0, [o(pw_101x8)] ;t32a + pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a + pmulhrsw m1, [o(pw_m2824x8)] ;t33a + mova m7, [o(pd_2048)] + psubsw m4, m0, m1 ;t33 + paddsw m0, m1 ;t32 + psubsw m5, m3, m2 ;t62 + paddsw m3, m2 ;t63 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*36], m5 ;t33a + mova [rsp+gprsize*2+16*65], m4 ;t62a + mova [rsp+gprsize*2+16*66], m3 ;t63 + + mova m0, [rsp+gprsize*2+16*63] ;in17 + mova m1, [rsp+gprsize*2+16*37] ;in15 + pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a + pmulhrsw m0, [o(pw_1660x8)] ;t34a + pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a + pmulhrsw m1, [o(pw_m1474x8)] ;t35a + psubsw m4, m1, m0 ;t34 + paddsw m0, m1 ;t35 + psubsw m5, m2, m3 ;t61 + paddsw m3, m2 ;t60 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a + mova [rsp+gprsize*2+16*37], m5 ;t34a + mova [rsp+gprsize*2+16*38], m0 ;t35 + mova [rsp+gprsize*2+16*63], m3 ;t60 + mova [rsp+gprsize*2+16*64], m4 ;t61a + + mova m0, [rsp+gprsize*2+16*39] ;in9 + mova m1, [rsp+gprsize*2+16*61] ;in23 + pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a + pmulhrsw m0, [o(pw_897x8)] ;t36a + pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a + pmulhrsw m1, [o(pw_m2191x8)] ;t37a + psubsw m4, m0, m1 ;t37 + paddsw m0, m1 ;t36 + psubsw m5, m3, m2 ;t58 + paddsw m3, m2 ;t59 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a + mova [rsp+gprsize*2+16*39], m0 ;t36 + mova [rsp+gprsize*2+16*40], m5 ;t37a + mova [rsp+gprsize*2+16*61], m4 ;t58a + mova [rsp+gprsize*2+16*62], m3 ;t59 + + mova m0, [rsp+gprsize*2+16*59] ;in25 + mova m1, [rsp+gprsize*2+16*41] ;in7 + pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a + pmulhrsw m0, [o(pw_2359x8)] ;t38a + pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a + pmulhrsw m1, [o(pw_m700x8)] ;t39a + psubsw m4, m1, m0 ;t38 + paddsw m0, m1 ;t39 + psubsw m5, m2, m3 ;t57 + paddsw m3, m2 ;t56 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a + mova [rsp+gprsize*2+16*41], m5 ;t38a + mova [rsp+gprsize*2+16*42], m0 ;t39 + mova [rsp+gprsize*2+16*59], m3 ;t56 + mova [rsp+gprsize*2+16*60], m4 ;t57a + + mova m0, [rsp+gprsize*2+16*43] ;in5 + mova m1, [rsp+gprsize*2+16*57] ;in27 + pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a + pmulhrsw m0, [o(pw_501x8)] ;t40a + pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a + pmulhrsw m1, [o(pw_m2520x8)] ;t41a + psubsw m4, m0, m1 ;t41 + paddsw m0, m1 ;t40 + psubsw m5, m3, m2 ;t54 + paddsw m3, m2 ;t55 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a + mova [rsp+gprsize*2+16*43], m0 ;t40 + mova [rsp+gprsize*2+16*44], m5 ;t41a + mova [rsp+gprsize*2+16*57], m4 ;t54a + mova [rsp+gprsize*2+16*58], m3 ;t55 + + mova m0, [rsp+gprsize*2+16*55] ;in21 + mova m1, [rsp+gprsize*2+16*45] ;in11 + pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a + pmulhrsw m0, [o(pw_2019x8)] ;t42a + pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a + pmulhrsw m1, [o(pw_m1092x8)] ;t43a + psubsw m4, m1, m0 ;t42 + paddsw m0, m1 ;t43 + psubsw m5, m2, m3 ;t53 + paddsw m3, m2 ;t52 + ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*46], m0 ;t43 + mova [rsp+gprsize*2+16*55], m3 ;t52 + mova [rsp+gprsize*2+16*56], m4 ;t53a + + mova m0, [rsp+gprsize*2+16*47] ;in13 + mova m1, [rsp+gprsize*2+16*53] ;in19 + pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a + pmulhrsw m0, [o(pw_1285x8)] ;t44a + pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a + pmulhrsw m1, [o(pw_m1842x8)] ;t45a + psubsw m4, m0, m1 ;t45 + paddsw m0, m1 ;t44 + psubsw m5, m3, m2 ;t50 + paddsw m3, m2 ;t51 + ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a + mova m6, m0 + mova [rsp+gprsize*2+16*48], m5 ;t45a + mova [rsp+gprsize*2+16*53], m4 ;t50a + mova [rsp+gprsize*2+16*54], m3 ;t51 + + mova m0, [rsp+gprsize*2+16*51] ;in29 + mova m1, [rsp+gprsize*2+16*49] ;in3 + pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a + pmulhrsw m0, [o(pw_2675x8)] ;t46a + pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a + pmulhrsw m1, [o(pw_m301x8)] ;t47a + psubsw m5, m1, m0 ;t46 + paddsw m0, m1 ;t47 + psubsw m4, m2, m3 ;t49 + paddsw m3, m2 ;t48 + +ALIGN function_align +.main2: + ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a + mova m1, [rsp+gprsize*2+16*54] ;t51 + psubsw m2, m0, m6 ;t44a + paddsw m0, m6 ;t47a + psubsw m6, m3, m1 ;t51a + paddsw m3, m1 ;t48a + mova [rsp+gprsize*2+16*50], m0 ;t47a + mova [rsp+gprsize*2+16*51], m3 ;t48a + ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51 + mova [rsp+gprsize*2+16*47], m6 ;t44 + mova [rsp+gprsize*2+16*54], m2 ;t51 + + mova m0, [rsp+gprsize*2+16*48] ;t45a + mova m3, [rsp+gprsize*2+16*53] ;t50a + psubsw m2, m4, m0 ;t45 + paddsw m4, m0 ;t46 + psubsw m6, m5, m3 ;t50 + paddsw m5, m3 ;t49 + ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a + mova [rsp+gprsize*2+16*48], m6 ;t45a + mova [rsp+gprsize*2+16*49], m4 ;t46 + mova [rsp+gprsize*2+16*52], m5 ;t49 + mova [rsp+gprsize*2+16*53], m2 ;t50a + + mova m0, [rsp+gprsize*2+16*43] ;t40 + mova m2, [rsp+gprsize*2+16*46] ;t43 + mova m3, [rsp+gprsize*2+16*55] ;t52 + mova m1, [rsp+gprsize*2+16*58] ;t55 + psubsw m4, m0, m2 ;t43a + paddsw m0, m2 ;t40a + psubsw m5, m1, m3 ;t52a + paddsw m1, m3 ;t55a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52 + mova [rsp+gprsize*2+16*43], m0 ;t40a + mova [rsp+gprsize*2+16*46], m5 ;t43 + mova [rsp+gprsize*2+16*55], m4 ;t52 + mova [rsp+gprsize*2+16*58], m1 ;t55a + + mova m0, [rsp+gprsize*2+16*44] ;t41a + mova m2, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*57] ;t54a + psubsw m4, m0, m2 ;t42 + paddsw m0, m2 ;t41 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t54 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a + mova [rsp+gprsize*2+16*44], m0 ;t41 + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*56], m4 ;t53a + mova [rsp+gprsize*2+16*57], m1 ;t54 + + mova m0, [rsp+gprsize*2+16*41] ;t38a + mova m2, [rsp+gprsize*2+16*40] ;t37a + mova m3, [rsp+gprsize*2+16*61] ;t58a + mova m1, [rsp+gprsize*2+16*60] ;t57a + psubsw m4, m0, m2 ;t37 + paddsw m0, m2 ;t38 + psubsw m5, m1, m3 ;t58 + paddsw m1, m3 ;t57 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a + mova [rsp+gprsize*2+16*41], m0 ;t38 + mova [rsp+gprsize*2+16*40], m5 ;t37a + mova [rsp+gprsize*2+16*61], m4 ;t58a + mova [rsp+gprsize*2+16*60], m1 ;t57 + + mova m0, [rsp+gprsize*2+16*42] ;t39 + mova m2, [rsp+gprsize*2+16*39] ;t36 + mova m3, [rsp+gprsize*2+16*62] ;t59 + mova m1, [rsp+gprsize*2+16*59] ;t56 + psubsw m4, m0, m2 ;t36a + paddsw m0, m2 ;t39a + psubsw m5, m1, m3 ;t59a + paddsw m1, m3 ;t56a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59 + mova [rsp+gprsize*2+16*42], m0 ;t39a + mova [rsp+gprsize*2+16*39], m5 ;t36 + mova [rsp+gprsize*2+16*62], m4 ;t59 + mova [rsp+gprsize*2+16*59], m1 ;t56a + + mova m0, [rsp+gprsize*2+16*35] ;t32 + mova m2, [rsp+gprsize*2+16*38] ;t35 + mova m3, [rsp+gprsize*2+16*63] ;t60 + mova m1, [rsp+gprsize*2+16*66] ;t63 + psubsw m4, m0, m2 ;t35a + paddsw m0, m2 ;t32a + psubsw m5, m1, m3 ;t60a + paddsw m1, m3 ;t63a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60 + mova [rsp+gprsize*2+16*35], m0 ;t32a + mova [rsp+gprsize*2+16*38], m5 ;t35 + mova [rsp+gprsize*2+16*63], m4 ;t60 + mova [rsp+gprsize*2+16*66], m1 ;t63a + + mova m0, [rsp+gprsize*2+16*36] ;t33a + mova m2, [rsp+gprsize*2+16*37] ;t34a + mova m3, [rsp+gprsize*2+16*64] ;t61a + mova m1, [rsp+gprsize*2+16*65] ;t62a + psubsw m4, m0, m2 ;t34 + paddsw m0, m2 ;t33 + psubsw m5, m1, m3 ;t61 + paddsw m1, m3 ;t62 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a + + mova m2, [rsp+gprsize*2+16*41] ;t38 + mova m3, [rsp+gprsize*2+16*60] ;t57 + psubsw m6, m0, m2 ;t38a + paddsw m0, m2 ;t33a + psubsw m2, m1, m3 ;t57a + paddsw m1, m3 ;t62a + mova [rsp+gprsize*2+16*36], m0 ;t33a + mova [rsp+gprsize*2+16*65], m1 ;t62a + ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57 + mova [rsp+gprsize*2+16*41], m2 ;t38 + mova [rsp+gprsize*2+16*60], m6 ;t57 + + mova m2, [rsp+gprsize*2+16*40] ;t37 + mova m3, [rsp+gprsize*2+16*61] ;t58 + psubsw m0, m5, m2 ;t37 + paddsw m5, m2 ;t34 + psubsw m1, m4, m3 ;t58 + paddsw m4, m3 ;t61 + ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a + mova [rsp+gprsize*2+16*37], m5 ;t34 + mova [rsp+gprsize*2+16*64], m4 ;t61 + mova [rsp+gprsize*2+16*40], m1 ;t37a + mova [rsp+gprsize*2+16*61], m0 ;t58a + + mova m0, [rsp+gprsize*2+16*38] ;t35 + mova m2, [rsp+gprsize*2+16*39] ;t36 + mova m3, [rsp+gprsize*2+16*62] ;t59 + mova m1, [rsp+gprsize*2+16*63] ;t60 + psubsw m4, m0, m2 ;t36a + paddsw m0, m2 ;t35a + psubsw m5, m1, m3 ;t59a + paddsw m1, m3 ;t60a + ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59 + mova [rsp+gprsize*2+16*38], m0 ;t35a + mova [rsp+gprsize*2+16*39], m5 ;t36 + mova [rsp+gprsize*2+16*62], m4 ;t59 + mova [rsp+gprsize*2+16*63], m1 ;t60a + + mova m0, [rsp+gprsize*2+16*35] ;t32a + mova m2, [rsp+gprsize*2+16*42] ;t39a + mova m3, [rsp+gprsize*2+16*59] ;t56a + mova m1, [rsp+gprsize*2+16*66] ;t63a + psubsw m4, m0, m2 ;t39 + paddsw m0, m2 ;t32 + psubsw m5, m1, m3 ;t56 + paddsw m1, m3 ;t63 + ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a + mova [rsp+gprsize*2+16*35], m0 ;t32 + mova [rsp+gprsize*2+16*42], m5 ;t39a + mova [rsp+gprsize*2+16*59], m4 ;t56a + mova [rsp+gprsize*2+16*66], m1 ;t63 + + mova m0, [rsp+gprsize*2+16*50] ;t47a + mova m2, [rsp+gprsize*2+16*43] ;t40a + mova m3, [rsp+gprsize*2+16*58] ;t55a + mova m1, [rsp+gprsize*2+16*51] ;t48a + psubsw m4, m0, m2 ;t40 + paddsw m0, m2 ;t47 + psubsw m5, m1, m3 ;t55 + paddsw m1, m3 ;t48 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a + mova [rsp+gprsize*2+16*50], m0 ;t47 + mova [rsp+gprsize*2+16*43], m5 ;t40a + mova [rsp+gprsize*2+16*58], m4 ;t55a + mova [rsp+gprsize*2+16*51], m1 ;t48 + + mova m0, [rsp+gprsize*2+16*49] ;t46 + mova m2, [rsp+gprsize*2+16*44] ;t41 + mova m3, [rsp+gprsize*2+16*57] ;t54 + mova m1, [rsp+gprsize*2+16*52] ;t49 + psubsw m4, m0, m2 ;t41a + paddsw m0, m2 ;t46a + psubsw m5, m1, m3 ;t54a + paddsw m1, m3 ;t49a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54 + mova [rsp+gprsize*2+16*49], m0 ;t46a + mova [rsp+gprsize*2+16*44], m5 ;t41 + mova [rsp+gprsize*2+16*57], m4 ;t54 + mova [rsp+gprsize*2+16*52], m1 ;t49a + + mova m0, [rsp+gprsize*2+16*48] ;t45a + mova m2, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*53] ;t50a + psubsw m4, m0, m2 ;t42 + paddsw m0, m2 ;t45 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t50 + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a + mova [rsp+gprsize*2+16*48], m0 ;t45 + mova [rsp+gprsize*2+16*45], m5 ;t42a + mova [rsp+gprsize*2+16*56], m4 ;t53a + mova [rsp+gprsize*2+16*53], m1 ;t50 + + mova m0, [rsp+gprsize*2+16*47] ;t44 + mova m2, [rsp+gprsize*2+16*46] ;t43 + mova m3, [rsp+gprsize*2+16*55] ;t52 + mova m1, [rsp+gprsize*2+16*54] ;t51 + psubsw m4, m0, m2 ;t43a + paddsw m0, m2 ;t44a + psubsw m5, m1, m3 ;t52a + paddsw m1, m3 ;t51a + ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52 + + mova m2, [rsp+gprsize*2+16*38] ;t35a + mova m3, [rsp+gprsize*2+16*31] ;tmp[28] + psubsw m6, m2, m0 ;t44 + paddsw m2, m0 ;t35 + psubsw m0, m3, m2 ;out35 + paddsw m2, m3 ;out28 + mova m3, [rsp+gprsize*2+16*63] ;t60a + mova [rsp+gprsize*2+16*38], m0 ;out35 + mova [rsp+gprsize*2+16*31], m2 ;out28 + psubsw m0, m3, m1 ;t51 + paddsw m3, m1 ;t60 + ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a + mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3] + psubsw m1, m2, m3 ;out60 + paddsw m2, m3 ;out3 + mova m3, [rsp+gprsize*2+16*22] ;tmp[19] + mova [rsp+gprsize*2+16*63], m1 ;out60 + mova [rsp+gprsize*2+16*6 ], m2 ;out3 + psubsw m1, m3, m0 ;out44 + paddsw m3, m0 ;out19 + mova m2, [rsp+gprsize*2+16*15] ;tmp[12] + + mova m0, [rsp+gprsize*2+16*39] ;t36 + mova [rsp+gprsize*2+16*47], m1 ;out44 + mova [rsp+gprsize*2+16*22], m3 ;out19 + mova m1, [rsp+gprsize*2+16*62] ;t59 + psubsw m3, m2, m6 ;out51 + paddsw m2, m6 ;out12 + mova [rsp+gprsize*2+16*54], m3 ;out51 + mova [rsp+gprsize*2+16*15], m2 ;out12 + psubsw m2, m0, m5 ;t43a + paddsw m0, m5 ;t36a + mova m5, [rsp+gprsize*2+16*30] ;tmp[27] + psubsw m3, m1, m4 ;t52a + paddsw m1, m4 ;t59a + ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52 + mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ] + psubsw m6, m5, m0 ;out36 + paddsw m5, m0 ;out27 + psubsw m0, m4, m1 ;out59 + paddsw m4, m1 ;out4 + mova [rsp+gprsize*2+16*39], m6 ;out36 + mova [rsp+gprsize*2+16*30], m5 ;out27 + mova [rsp+gprsize*2+16*62], m0 ;out59 + mova [rsp+gprsize*2+16*7 ], m4 ;out4 + mova m0, [rsp+gprsize*2+16*23] ;tmp[20] + mova m5, [rsp+gprsize*2+16*14] ;tmp[11] + psubsw m4, m0, m3 ;out43 + paddsw m0, m3 ;out20 + psubsw m6, m5, m2 ;out52 + paddsw m5, m2 ;out11 + mova [rsp+gprsize*2+16*46], m4 ;out43 + mova [rsp+gprsize*2+16*23], m0 ;out20 + mova [rsp+gprsize*2+16*55], m6 ;out52 + mova [rsp+gprsize*2+16*14], m5 ;out11 + + mova m0, [rsp+gprsize*2+16*40] ;t37a + mova m5, [rsp+gprsize*2+16*45] ;t42a + mova m3, [rsp+gprsize*2+16*56] ;t53a + mova m1, [rsp+gprsize*2+16*61] ;t58a + mova m2, [rsp+gprsize*2+16*29] ;tmp[26] + psubsw m4, m0, m5 ;t42 + paddsw m0, m5 ;t37 + psubsw m5, m1, m3 ;t53 + paddsw m1, m3 ;t58 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t43, t52 + mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ] + psubsw m6, m2, m0 ;out37 + paddsw m2, m0 ;out26 + psubsw m0, m3, m1 ;out58 + paddsw m3, m1 ;out5 + mova [rsp+gprsize*2+16*40], m6 ;out37 + mova [rsp+gprsize*2+16*29], m2 ;out26 + mova [rsp+gprsize*2+16*61], m0 ;out58 + mova [rsp+gprsize*2+16*8 ], m3 ;out5 + mova m0, [rsp+gprsize*2+16*24] ;tmp[21] + mova m1, [rsp+gprsize*2+16*13] ;tmp[10] + psubsw m2, m0, m5 ;out42 + paddsw m0, m5 ;out21 + psubsw m3, m1, m4 ;out53 + paddsw m1, m4 ;out10 + mova [rsp+gprsize*2+16*45], m2 ;out42 + mova [rsp+gprsize*2+16*24], m0 ;out21 + mova [rsp+gprsize*2+16*56], m3 ;out53 + mova [rsp+gprsize*2+16*13], m1 ;out10 + + mova m0, [rsp+gprsize*2+16*41] ;t38 + mova m5, [rsp+gprsize*2+16*44] ;t41 + mova m3, [rsp+gprsize*2+16*57] ;t54 + mova m1, [rsp+gprsize*2+16*60] ;t57 + mova m2, [rsp+gprsize*2+16*28] ;tmp[25] + psubsw m4, m0, m5 ;t41a + paddsw m0, m5 ;t38a + psubsw m5, m1, m3 ;t54a + paddsw m1, m3 ;t57a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a + mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ] + psubsw m6, m2, m0 ;out38 + paddsw m2, m0 ;out25 + psubsw m0, m3, m1 ;out57 + paddsw m3, m1 ;out6 + mova [rsp+gprsize*2+16*41], m6 ;out38 + mova [rsp+gprsize*2+16*28], m2 ;out25 + mova [rsp+gprsize*2+16*60], m0 ;out57 + mova [rsp+gprsize*2+16*9 ], m3 ;out6 + mova m0, [rsp+gprsize*2+16*25] ;tmp[22] + mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ] + psubsw m2, m0, m5 ;out41 + paddsw m0, m5 ;out22 + psubsw m3, m1, m4 ;out54 + paddsw m1, m4 ;out9 + mova [rsp+gprsize*2+16*44], m2 ;out41 + mova [rsp+gprsize*2+16*25], m0 ;out22 + mova [rsp+gprsize*2+16*57], m3 ;out54 + mova [rsp+gprsize*2+16*12], m1 ;out9 + + mova m0, [rsp+gprsize*2+16*42] ;t39a + mova m5, [rsp+gprsize*2+16*43] ;t40a + mova m3, [rsp+gprsize*2+16*58] ;t55a + mova m1, [rsp+gprsize*2+16*59] ;t56a + mova m2, [rsp+gprsize*2+16*27] ;tmp[24] + psubsw m4, m0, m5 ;t40 + paddsw m0, m5 ;t39 + psubsw m5, m1, m3 ;t55 + paddsw m1, m3 ;t56 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a + mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ] + psubsw m6, m2, m0 ;out39 + paddsw m2, m0 ;out24 + psubsw m0, m3, m1 ;out56 + paddsw m3, m1 ;out7 + mova [rsp+gprsize*2+16*42], m6 ;out39 + mova [rsp+gprsize*2+16*27], m2 ;out24 + mova [rsp+gprsize*2+16*59], m0 ;out56 + mova [rsp+gprsize*2+16*10], m3 ;out7 + mova m0, [rsp+gprsize*2+16*26] ;tmp[23] + mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ] + psubsw m2, m0, m5 ;out40 + paddsw m0, m5 ;out23 + psubsw m3, m1, m4 ;out55 + paddsw m1, m4 ;out8 + mova [rsp+gprsize*2+16*43], m2 ;out40 + mova [rsp+gprsize*2+16*26], m0 ;out23 + mova [rsp+gprsize*2+16*58], m3 ;out55 + mova [rsp+gprsize*2+16*11], m1 ;out8 + + mova m0, [rsp+gprsize*2+16*37] ;t34 + mova m5, [rsp+gprsize*2+16*48] ;t45 + mova m3, [rsp+gprsize*2+16*53] ;t50 + mova m1, [rsp+gprsize*2+16*64] ;t61 + mova m2, [rsp+gprsize*2+16*32] ;tmp[29] + psubsw m4, m0, m5 ;t45a + paddsw m0, m5 ;t34a + psubsw m5, m1, m3 ;t50a + paddsw m1, m3 ;t61a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 + mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ] + psubsw m6, m2, m0 ;out34 + paddsw m2, m0 ;out29 + psubsw m0, m3, m1 ;out61 + paddsw m3, m1 ;out2 + mova [rsp+gprsize*2+16*37], m6 ;out34 + mova [rsp+gprsize*2+16*32], m2 ;out29 + mova [rsp+gprsize*2+16*64], m0 ;out61 + mova [rsp+gprsize*2+16*5 ], m3 ;out2 + mova m0, [rsp+gprsize*2+16*21] ;tmp[18] + mova m1, [rsp+gprsize*2+16*16] ;tmp[13] + psubsw m2, m0, m5 ;out45 + paddsw m0, m5 ;out18 + psubsw m3, m1, m4 ;out50 + paddsw m1, m4 ;out13 + mova [rsp+gprsize*2+16*48], m2 ;out45 + mova [rsp+gprsize*2+16*21], m0 ;out18 + mova [rsp+gprsize*2+16*53], m3 ;out50 + mova [rsp+gprsize*2+16*16], m1 ;out13 + + mova m0, [rsp+gprsize*2+16*36] ;t33a + mova m5, [rsp+gprsize*2+16*49] ;t46a + mova m3, [rsp+gprsize*2+16*52] ;t49a + mova m1, [rsp+gprsize*2+16*65] ;t62a + mova m2, [rsp+gprsize*2+16*33] ;tmp[30] + psubsw m4, m0, m5 ;t46 + paddsw m0, m5 ;t33 + psubsw m5, m1, m3 ;t49 + paddsw m1, m3 ;t62 + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50 + mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ] + psubsw m6, m2, m0 ;out33 + paddsw m2, m0 ;out30 + psubsw m0, m3, m1 ;out62 + paddsw m3, m1 ;out1 + mova [rsp+gprsize*2+16*36], m6 ;out33 + mova [rsp+gprsize*2+16*33], m2 ;out30 + mova [rsp+gprsize*2+16*65], m0 ;out62 + mova [rsp+gprsize*2+16*4 ], m3 ;out1 + mova m0, [rsp+gprsize*2+16*20] ;tmp[17] + mova m1, [rsp+gprsize*2+16*17] ;tmp[14] + psubsw m2, m0, m5 ;out46 + paddsw m0, m5 ;out17 + psubsw m3, m1, m4 ;out49 + paddsw m1, m4 ;out14 + mova [rsp+gprsize*2+16*49], m2 ;out46 + mova [rsp+gprsize*2+16*20], m0 ;out17 + mova [rsp+gprsize*2+16*52], m3 ;out49 + mova [rsp+gprsize*2+16*17], m1 ;out14 + + mova m0, [rsp+gprsize*2+16*35] ;t32 + mova m5, [rsp+gprsize*2+16*50] ;t47 + mova m3, [rsp+gprsize*2+16*51] ;t48 + mova m1, [rsp+gprsize*2+16*66] ;t63 + mova m2, [rsp+gprsize*2+16*34] ;tmp[31] + psubsw m4, m0, m5 ;t47a + paddsw m0, m5 ;t32a + psubsw m5, m1, m3 ;t48a + paddsw m1, m3 ;t63a + ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48 + mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ] + psubsw m6, m2, m0 ;out32 + paddsw m2, m0 ;out31 + psubsw m0, m3, m1 ;out63 + paddsw m3, m1 ;out0 + mova [rsp+gprsize*2+16*35], m6 ;out32 + mova [rsp+gprsize*2+16*34], m2 ;out31 + mova [rsp+gprsize*2+16*66], m0 ;out63 + mova [rsp+gprsize*2+16*3 ], m3 ;out0 + mova m0, [rsp+gprsize*2+16*19] ;tmp[16] + mova m1, [rsp+gprsize*2+16*18] ;tmp[15] + psubsw m2, m0, m5 ;out47 + paddsw m0, m5 ;out16 + psubsw m3, m1, m4 ;out48 + paddsw m1, m4 ;out15 + mova [rsp+gprsize*2+16*50], m2 ;out47 + mova [rsp+gprsize*2+16*19], m0 ;out16 + mova [rsp+gprsize*2+16*51], m3 ;out48 + mova [rsp+gprsize*2+16*18], m1 ;out15 + ret + + +cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_64x16_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 16 + lea tx2q, [o(.end)] + +.body: + pmulhrsw m0, m2 + movd m2, [o(pw_2048)] ;intentionally rip-relative + pmulhrsw m0, m1 + pmulhrsw m0, m2 + pshuflw m0, m0, q0000 + punpcklwd m0, m0 + pxor m7, m7 + +.loop: + mova m1, [dstq+16*0] + mova m3, [dstq+16*1] + mova m5, [dstq+16*2] + mova m6, [dstq+16*3] + punpckhbw m2, m1, m7 + punpcklbw m1, m7 + punpckhbw m4, m3, m7 + punpcklbw m3, m7 + paddw m2, m0 + paddw m1, m0 + paddw m4, m0 + paddw m3, m0 + packuswb m1, m2 + packuswb m3, m4 + punpckhbw m2, m5, m7 + punpcklbw m5, m7 + punpckhbw m4, m6, m7 + punpcklbw m6, m7 + paddw m2, m0 + paddw m5, m0 + paddw m4, m0 + paddw m6, m0 + packuswb m5, m2 + packuswb m6, m4 + mova [dstq+16*0], m1 + mova [dstq+16*1], m3 + mova [dstq+16*2], m5 + mova [dstq+16*3], m6 + add dstq, strideq + dec r3d + jg .loop + jmp tx2q + +.end: + RET + + +%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2 + +%if %3 + mova m3, [o(pw_2896x8)] + pmulhrsw m0, m3, [%1+%2*0] + pmulhrsw m1, m3, [%1+%2*1] + pmulhrsw m2, m3, [%1+%2*2] + pmulhrsw m3, [%1+%2*3] +%else + mova m0, [%1+%2*0] + mova m1, [%1+%2*1] + mova m2, [%1+%2*2] + mova m3, [%1+%2*3] +%endif +%endmacro + +%macro LOAD_4ROWS_H 2 ;src, stride + mova m4, [%1+%2*0] + mova m5, [%1+%2*1] + mova m6, [%1+%2*2] + mova m7, [%1+%2*3] +%endmacro + +cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r3d, 2 + mov [rsp+gprsize*2+16*67], dstq + lea dstq, [rsp+gprsize+16*68] + +.pass1_loop: + LOAD_4ROWS coeffq+32*0, 32*8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+32*4, 32*8 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+32*2, 32*4 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+32*1, 32*2 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS coeffq+32*17, 32*2 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal_8bpc).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+32*0, 32 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+32*8, 32 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+32*16, 32 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+32*24, 32 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end4: + SAVE_8ROWS dstq+32*0, 32 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end5: + SAVE_8ROWS dstq+32*8, 32 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end6: + SAVE_8ROWS dstq+32*16, 32 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end7: + SAVE_8ROWS dstq+32*24, 32 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov dstq, [rsp+gprsize*2+16*67] + sub coeffq, 32 + mov r3d, 4 + +.pass2_loop: + mov [rsp+gprsize*1+16*67], r3d + + LOAD_4ROWS coeffq+16*0, 32*2 + LOAD_4ROWS_H coeffq+16*1, 32*2 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+16*2, 32*2 + LOAD_4ROWS_H coeffq+16*3, 32*2 + call m(idct_16x8_internal_8bpc).main + + mov r3, dstq + lea tx2q, [o(.end)] + lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal_8bpc).end + +.end: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.end1)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end1: + pxor m7, m7 + REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + + add coeffq, 16*16 + mov r3d, [rsp+gprsize*1+16*67] + mov dstq, [rsp+gprsize*2+16*67] + add dstq, 8 + mov [rsp+gprsize*2+16*67], dstq + dec r3d + jg .pass2_loop + + mov r3d, 4 + lea coeffq, [rsp+gprsize+16*68] +.pass2_loop2: + mov [rsp+gprsize*1+16*67], r3d + + LOAD_4ROWS coeffq+16*0, 32*2 + LOAD_4ROWS_H coeffq+16*1, 32*2 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+16*2, 32*2 + LOAD_4ROWS_H coeffq+16*3, 32*2 + call m(idct_16x8_internal_8bpc).main + + mov r3, dstq + lea tx2q, [o(.end2)] + lea dstq, [dstq+strideq*8] + jmp m(idct_8x8_internal_8bpc).end + +.end2: + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.end3)] + mov dstq, r3 + jmp m(idct_8x8_internal_8bpc).end + +.end3: + + add coeffq, 16*16 + mov r3d, [rsp+gprsize*1+16*67] + mov dstq, [rsp+gprsize*2+16*67] + add dstq, 8 + mov [rsp+gprsize*2+16*67], dstq + dec r3d + jg .pass2_loop2 + ret + + +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_32x64_internal_8bpc) +.end: + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + mov [coeffq], eobd + pmulhrsw m0, m1 + mov r3d, 64 + lea tx2q, [o(.end)] + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body + + +cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + +.pass1_loop: + LOAD_8ROWS coeffq+64*1, 64*2, 1 + mova [rsp+gprsize+16*19], m0 ;in1 + mova [rsp+gprsize+16*26], m1 ;in3 + mova [rsp+gprsize+16*23], m2 ;in5 + mova [rsp+gprsize+16*22], m3 ;in7 + mova [rsp+gprsize+16*21], m4 ;in9 + mova [rsp+gprsize+16*24], m5 ;in11 + mova [rsp+gprsize+16*25], m6 ;in13 + mova [rsp+gprsize+16*20], m7 ;in15 + + mov tx2d, [rsp+gprsize*1+16*67] + test tx2d, tx2d + jl .fast + +.full: + LOAD_8ROWS coeffq+64*0, 64*4, 1 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_8ROWS coeffq+64*2, 64*4, 1 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*17, 64*2, 1 + mova [rsp+gprsize+16*33], m0 ;in17 + mova [rsp+gprsize+16*28], m1 ;in19 + mova [rsp+gprsize+16*29], m2 ;in21 + mova [rsp+gprsize+16*32], m3 ;in23 + mova [rsp+gprsize+16*31], m4 ;in25 + mova [rsp+gprsize+16*30], m5 ;in27 + mova [rsp+gprsize+16*27], m6 ;in29 + mova [rsp+gprsize+16*34], m7 ;in31 + + call m(idct_8x32_internal_8bpc).main + jmp .pass1_end + +.fast: + LOAD_4ROWS coeffq, 256, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + + SAVE_7ROWS rsp+gprsize+16*3, 16 + LOAD_4ROWS coeffq+128*1, 256, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + call m(idct_8x32_internal_8bpc).main_fast + +.pass1_end: + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS coeffq+64*24, 64 + + add coeffq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov coeffq, [rsp+gprsize*2+16*67] + mov r3d, 4 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + jmp m(idct_16x64_internal_8bpc).pass2_loop + + +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + call m(idct_64x32_internal_8bpc) +.end: + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_16384)] + pmulhrsw m0, m1 + mov [coeffq], eobd + mov r3d, 32 + lea tx2q, [o(.end)] + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body + + +cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r4d, 2 + sub eobd, 136 + mov [rsp+gprsize*1+16*67], eobd + mov r3d, 4 + cmovs r3d, r4d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*2+16*67], coeffq + mov [rsp+gprsize*3+16*67], dstq + lea dstq, [rsp+gprsize+16*69] + mov [rsp+gprsize*4+16*67], dstq + +.pass1_loop: + LOAD_4ROWS coeffq+64*0, 64*8, 1 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+64*4, 64*8, 1 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*2, 64*4, 1 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+64*1, 64*2, 1 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS coeffq+64*17, 64*2, 1 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal_8bpc).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end1: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end2: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end3: + SAVE_8ROWS coeffq+64*24, 64 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end4: + SAVE_8ROWS dstq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end5: + SAVE_8ROWS dstq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end6: + SAVE_8ROWS dstq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + lea tx2q, [o(.pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end + +.pass1_end7: + SAVE_8ROWS dstq+64*24, 64 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov coeffq, [rsp+gprsize*4+16*67] + mov dstq, [rsp+gprsize*3+16*67] + mov eobd, [rsp+gprsize*1+16*67] + lea dstq, [dstq+32] + mov [rsp+gprsize*1+16*35], eobd + lea tx2q, [o(.pass2_end)] + mov r3d, 4 + jmp m(idct_32x32_internal_8bpc).pass2_loop + +.pass2_end: + mova [rsp+gprsize+16*0], m7 + lea r3, [o(.pass2_end1)] + jmp m(idct_8x32_internal_8bpc).end2 + +.pass2_end1: + lea tx2q, [o(.pass2_end)] + add coeffq, 16*32 + mov dstq, [rsp+gprsize*2+16*35] + mov r3d, [rsp+gprsize*3+16*35] + dec r3d + jg m(idct_32x32_internal_8bpc).pass2_loop + +.pass2_end2: + mov dstq, [rsp+gprsize*3+16*67] + mov coeffq, [rsp+gprsize*2+16*67] + lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] + mov r3d, 4 + jmp m(idct_32x32_internal_8bpc).pass2_loop + + +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 +%if ARCH_X86_32 + LEA r5, $$ +%endif + test eobd, eobd + jz .dconly + + call m(idct_64x64_internal_8bpc) + RET + +.dconly: + movd m1, [o(pw_2896x8)] + pmulhrsw m0, m1, [coeffq] + movd m2, [o(pw_8192)] + mov [coeffq], eobd + mov r3d, 64 + lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body + +cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 + mov r5d, 4 + mov r4d, 2 + sub eobd, 136 + cmovns r4d, r5d + +%if ARCH_X86_32 + LEA r5, $$ +%endif + + mov [rsp+gprsize*1+16*67], eobd + mov r3d, r4d + mov [rsp+gprsize*4+16*67], coeffq + mov [rsp+gprsize*3+16*67], dstq + lea dstq, [rsp+gprsize+16*69] + mov [rsp+gprsize*2+16*67], dstq + +.pass1_loop: + LOAD_4ROWS coeffq+64*0, 64*8 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_8bpc).main + SAVE_7ROWS rsp+gprsize+16*3, 16 + + pxor m4, m4 + LOAD_4ROWS coeffq+64*4, 64*8 + + REPX {mova x, m4}, m5, m6, m7 + call m(idct_16x8_internal_8bpc).main + mova m7, [rsp+gprsize+16*0] + SAVE_8ROWS rsp+gprsize+16*11, 16 + + LOAD_8ROWS coeffq+64*2, 64*4 + mova [rsp+gprsize+16*19], m0 + mova [rsp+gprsize+16*26], m1 + mova [rsp+gprsize+16*23], m2 + mova [rsp+gprsize+16*22], m3 + mova [rsp+gprsize+16*21], m4 + mova [rsp+gprsize+16*24], m5 + mova [rsp+gprsize+16*25], m6 + mova [rsp+gprsize+16*20], m7 + + call m(idct_8x32_internal_8bpc).main_fast + SAVE_8ROWS rsp+gprsize+16*3, 16 + + LOAD_8ROWS coeffq+64*1, 64*2 + mova [rsp+gprsize+16*35], m0 ;in1 + mova [rsp+gprsize+16*49], m1 ;in3 + mova [rsp+gprsize+16*43], m2 ;in5 + mova [rsp+gprsize+16*41], m3 ;in7 + mova [rsp+gprsize+16*39], m4 ;in9 + mova [rsp+gprsize+16*45], m5 ;in11 + mova [rsp+gprsize+16*47], m6 ;in13 + mova [rsp+gprsize+16*37], m7 ;in15 + + LOAD_8ROWS coeffq+64*17, 64*2 + mova [rsp+gprsize+16*63], m0 ;in17 + mova [rsp+gprsize+16*53], m1 ;in19 + mova [rsp+gprsize+16*55], m2 ;in21 + mova [rsp+gprsize+16*61], m3 ;in23 + mova [rsp+gprsize+16*59], m4 ;in25 + mova [rsp+gprsize+16*57], m5 ;in27 + mova [rsp+gprsize+16*51], m6 ;in29 + mova [rsp+gprsize+16*65], m7 ;in31 + + call m(idct_16x64_internal_8bpc).main + + LOAD_8ROWS rsp+gprsize+16*3, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end: + SAVE_8ROWS coeffq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*11, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end1)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end1: + SAVE_8ROWS coeffq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*19, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end2)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end2: + SAVE_8ROWS coeffq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*27, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end3)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end3: + SAVE_8ROWS coeffq+64*24, 64 + LOAD_8ROWS rsp+gprsize+16*35, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end4)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end4: + SAVE_8ROWS dstq+64*0, 64 + LOAD_8ROWS rsp+gprsize+16*43, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end5)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end5: + SAVE_8ROWS dstq+64*8, 64 + LOAD_8ROWS rsp+gprsize+16*51, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end6)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end6: + SAVE_8ROWS dstq+64*16, 64 + LOAD_8ROWS rsp+gprsize+16*59, 16 + mova [rsp+gprsize+16*0], m7 + mova m7, [o(pw_8192)] + lea tx2q, [o(.pass1_end7)] + jmp m(idct_8x8_internal_8bpc).pass1_end1 + +.pass1_end7: + SAVE_8ROWS dstq+64*24, 64 + + add coeffq, 16 + add dstq, 16 + dec r3d + jg .pass1_loop + +.pass2: + mov dstq, [rsp+gprsize*3+16*67] + mov coeffq, [rsp+gprsize*2+16*67] + lea dstq, [dstq+32] + mov r3d, 4 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(.pass2_end)] + jmp m(idct_16x64_internal_8bpc).pass2_loop + +.pass2_end: + LOAD_8ROWS rsp+gprsize+16*35, 16 + lea dstq, [dstq+strideq*2] + lea r3, [rsp+16*32+gprsize] + mova [rsp+gprsize+16*0], m7 + call m(idct_16x64_internal_8bpc).write + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, [rsp+gprsize*3+16*67] + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(.pass2_end)] + + dec r3d + jg m(idct_16x64_internal_8bpc).pass2_loop + +.pass2_end2: + mov coeffq, [rsp+gprsize*4+16*67] + mov dstq, [rsp+gprsize*2+16*67] + mov r3d, 4 + sub dstq, 72 + lea r4, [dstq+8] + mov [rsp+gprsize*2+16*67], r4 + lea r4, [o(m(idct_16x64_internal_8bpc).end1)] + jmp m(idct_16x64_internal_8bpc).pass2_loop |