summaryrefslogtreecommitdiffstats
path: root/third_party/dav1d/src/x86/itx_sse.asm
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/dav1d/src/x86/itx_sse.asm')
-rw-r--r--third_party/dav1d/src/x86/itx_sse.asm6533
1 files changed, 6533 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/itx_sse.asm b/third_party/dav1d/src/x86/itx_sse.asm
new file mode 100644
index 0000000000..ec7e3a52f4
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_sse.asm
@@ -0,0 +1,6533 @@
+; Copyright © 2018-2021, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+
+SECTION_RODATA 16
+
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 ; pshufb mask, word order 0,2,4,6,1,3,5,7: even-indexed words to the low half, odd-indexed words to the high half
+;the next two masks interleave the words of the low half with the words of the high half
+deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 ; pshufb mask, word order 0,4,1,5,2,6,3,7
+deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 ; pshufb mask, word order 4,0,5,1,6,2,7,3 (deint_shuf1 with each word pair swapped)
+
+%macro COEF_PAIR 2-3 0 ; args: a, b, flags -- emits word-pair constants; flags: !0 = also emit pw_m(a)_m(b), 2 = do not emit pw_(b)_(a)
+pw_%1_m%2: times 4 dw %1, -%2 ; always emitted: {a, -b} repeated 4 times
+%if %3 != 2
+pw_%2_%1: times 4 dw %2, %1 ; {b, a} repeated 4 times
+%endif
+%if %3
+pw_m%1_m%2: times 4 dw -%1, -%2 ; {-a, -b} repeated 4 times (emitted for flags 1 and 2)
+%endif
+%endmacro
+
+;adst4: {a, b} word pairs (repeated 4 times) used as pmaddwd operands by the .main routines of iadst_4x4 and iadst_8x4
+pw_1321_3803: times 4 dw 1321, 3803
+pw_2482_m1321: times 4 dw 2482, -1321
+pw_3344_2482: times 4 dw 3344, 2482
+pw_3344_m3803: times 4 dw 3344, -3803
+pw_3344_m3344: times 4 dw 3344, -3344
+pw_0_3344: times 4 dw 0, 3344
+pw_m6688_m3803: times 4 dw -6688, -3803
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784
+COEF_PAIR 799, 4017
+COEF_PAIR 3406, 2276
+COEF_PAIR 401, 4076
+COEF_PAIR 1931, 3612
+COEF_PAIR 3166, 2598
+COEF_PAIR 3920, 1189
+COEF_PAIR 3784, 1567, 1
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 3513, 2106
+COEF_PAIR 3857, 1380
+COEF_PAIR 4017, 799, 1
+COEF_PAIR 201, 4091
+COEF_PAIR 2440, 3290
+COEF_PAIR 3035, 2751
+COEF_PAIR 4052, 601
+COEF_PAIR 2276, 3406, 1
+COEF_PAIR 4076, 401, 2
+COEF_PAIR 2598, 3166, 2
+COEF_PAIR 3612, 1931, 2
+COEF_PAIR 1189, 3920, 2
+
+pd_2048: times 4 dd 2048
+pw_2048: times 8 dw 2048
+pw_m2048: times 8 dw -2048
+pw_4096: times 8 dw 4096
+pw_16384: times 8 dw 16384
+pw_m16384: times 8 dw -16384
+pw_1697x16: times 8 dw 1697*16
+pw_1697x8: times 8 dw 1697*8
+pw_2896x8: times 8 dw 2896*8
+pw_3344x8: times 8 dw 3344*8
+pw_8192: times 8 dw 8192
+pw_m8192: times 8 dw -8192
+pw_5: times 8 dw 5
+pw_201x8: times 8 dw 201*8
+pw_4091x8: times 8 dw 4091*8
+pw_m2751x8: times 8 dw -2751*8
+pw_3035x8: times 8 dw 3035*8
+pw_1751x8: times 8 dw 1751*8
+pw_3703x8: times 8 dw 3703*8
+pw_m1380x8: times 8 dw -1380*8
+pw_3857x8: times 8 dw 3857*8
+pw_995x8: times 8 dw 995*8
+pw_3973x8: times 8 dw 3973*8
+pw_m2106x8: times 8 dw -2106*8
+pw_3513x8: times 8 dw 3513*8
+pw_2440x8: times 8 dw 2440*8
+pw_3290x8: times 8 dw 3290*8
+pw_m601x8: times 8 dw -601*8
+pw_4052x8: times 8 dw 4052*8
+
+pw_4095x8: times 8 dw 4095*8
+pw_101x8: times 8 dw 101*8
+pw_2967x8: times 8 dw 2967*8
+pw_m2824x8: times 8 dw -2824*8
+pw_3745x8: times 8 dw 3745*8
+pw_1660x8: times 8 dw 1660*8
+pw_3822x8: times 8 dw 3822*8
+pw_m1474x8: times 8 dw -1474*8
+pw_3996x8: times 8 dw 3996*8
+pw_897x8: times 8 dw 897*8
+pw_3461x8: times 8 dw 3461*8
+pw_m2191x8: times 8 dw -2191*8
+pw_3349x8: times 8 dw 3349*8
+pw_2359x8: times 8 dw 2359*8
+pw_4036x8: times 8 dw 4036*8
+pw_m700x8: times 8 dw -700*8
+pw_4065x8: times 8 dw 4065*8
+pw_501x8: times 8 dw 501*8
+pw_3229x8: times 8 dw 3229*8
+pw_m2520x8: times 8 dw -2520*8
+pw_3564x8: times 8 dw 3564*8
+pw_2019x8: times 8 dw 2019*8
+pw_3948x8: times 8 dw 3948*8
+pw_m1092x8: times 8 dw -1092*8
+pw_3889x8: times 8 dw 3889*8
+pw_1285x8: times 8 dw 1285*8
+pw_3659x8: times 8 dw 3659*8
+pw_m1842x8: times 8 dw -1842*8
+pw_3102x8: times 8 dw 3102*8
+pw_2675x8: times 8 dw 2675*8
+pw_4085x8: times 8 dw 4085*8
+pw_m301x8: times 8 dw -301*8
+
+SECTION .text
+
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) ; symbol name of function x: private_prefix, _, x and the current SUFFIX, passed through mangle()
+;o(x) wraps every reference to a constant or code label used as an address in this file
+%if ARCH_X86_64
+%define o(x) x ; x86-64: the label is used as-is
+%else
+%define o(x) r5-$$+x ; PIC: label expressed relative to r5, which INV_TXFM_FN loads with $$ on ARCH_X86_32
+%endif
+
+%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4] -- adds the 16-bit values in m(src1), m(src2) (two 4-wide rows each) to four 4-pixel dst rows; row[n] in 0..3 is the dst row (in units of stride) that receives the n-th packed row
+ lea r2, [dstq+strideq*2] ;r2 = address of dst row 2 (the previous content of r2 is lost)
+%assign %%i 1
+%rotate 5 ;bring row[1] to %1
+%rep 4
+ %if %1 & 2 ;rows 2 and 3 are addressed from r2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else ;rows 0 and 1 are addressed from dstq
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep ;total rotation is 5 + 4 = 9, so %1..%9 are back in their original positions
+;in the comments below dstN / outN denote the pixels / values belonging to row[N+1]
+ movd m%3, [%%row_adr1] ;dst0
+ movd m%5, [%%row_adr2] ;dst1
+ punpckldq m%3, m%5 ;high: dst1 :low: dst0
+ movd m%4, [%%row_adr3] ;dst2
+ movd m%5, [%%row_adr4] ;dst3
+ punpckldq m%4, m%5 ;high: dst3 :low: dst2
+
+ pxor m%5, m%5
+ punpcklbw m%3, m%5 ;extend byte to word
+ punpcklbw m%4, m%5 ;extend byte to word
+
+ paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0
+ paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2
+
+ packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
+
+ movd [%%row_adr1], m%3 ;store dst0 + out0
+ pshuflw m%4, m%3, q1032 ;bring dword 1 down to dword 0
+ movd [%%row_adr2], m%4 ;store dst1 + out1
+ punpckhqdq m%3, m%3 ;bring the high qword down
+ movd [%%row_adr3], m%3 ;store dst2 + out2
+ psrlq m%3, 32
+ movd [%%row_adr4], m%3 ;store dst3 + out3
+%endmacro
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd -- tail of the 4x4 functions: optionally scale m0/m1, add them to dst, return
+%if %5 ;rnd == 0 skips the scaling (used by the WHT function)
+ mova m2, [o(pw_%5)]
+ pmulhrsw m0, m2 ;with the default pw_2048: (x*2048 + 0x4000) >> 15 = (x + 8) >> 4
+ pmulhrsw m1, m2
+%endif
+;m0 holds the values for row[1] (low) / row[2] (high), m1 those for row[3] / row[4]; m2-m4 are scratch
+ WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4
+ ret
+%endmacro
+
+; Rotates the interleaved word pairs (a, b) held in m(dst/src) by (coef1, coef2), rounding with m(rnd) and shifting right by 12. flags: 1 = swap, 2: coef_regs, 4: no_pack
+%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
+%if %6 & 2 ;coef_regs: coef1/coef2 are register numbers that already hold the word-pair constants
+ pmaddwd m%2, m%4, m%1
+ pmaddwd m%1, m%5
+%elif %6 & 1 ;swap: the two products trade places
+ pmaddwd m%2, m%1, [o(pw_%5_%4)] ;a*coef2 + b*coef1
+ pmaddwd m%1, [o(pw_%4_m%5)] ;a*coef1 - b*coef2
+%else
+ pmaddwd m%2, m%1, [o(pw_%4_m%5)] ;a*coef1 - b*coef2
+ pmaddwd m%1, [o(pw_%5_%4)] ;a*coef2 + b*coef1
+%endif
+ paddd m%2, m%3
+ paddd m%1, m%3
+ psrad m%2, 12
+ psrad m%1, 12
+%if %6 & 4 == 0 ;no_pack not set (NASM binds & tighter than ==, so this is (flags & 4) == 0)
+ packssdw m%1, m%2 ;low half: the m(dst/src) products ;high half: the m(tmp) products
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8 (optional argument, not referenced in the body) -- 4-point inverse DCT; in: m0 = in0 (low) / in1 (high), m1 = in2 / in3; clobbers m2, m3
+ mova m3, [o(pd_2048)]
+ punpckhwd m2, m0, m1 ;unpacked in1 in3
+ punpcklwd m0, m1 ;unpacked in0 in2
+ ITX_MUL2X_PACK 2, 1, 3, 1567, 3784 ;low: in1*3784 + in3*1567 ;high: in1*1567 - in3*3784
+ ITX_MUL2X_PACK 0, 1, 3, 2896, 2896 ;low: (in0 + in2)*2896 ;high: (in0 - in2)*2896
+ psubsw m1, m0, m2 ;high: out2 ;low: out3
+ paddsw m0, m2 ;high: out1 ;low: out0
+%endmacro
+
+%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack -- declares the public entry point inv_txfm_add_(type1)_(type2)_(size)_8bpc and dispatches to the two 1-D passes
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, stride, coeff, eob, tx2
+ %define %%p1 m(i%1_%3_internal_8bpc) ;first-pass function (type1)
+%if ARCH_X86_32
+ LEA r5, $$ ;base register used by o() on x86-32
+%endif
+%if has_epilogue ;the first pass is call'ed so that the RET epilogue below runs afterwards
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jz %%end ;eob == 0: continue with the code that follows this macro invocation
+%endif
+ lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] ;tx2q = second-pass entry point (type2)
+ call %%p1
+ RET
+%%end:
+%else
+ lea tx2q, [o(m(i%2_%3_internal_8bpc).pass2)] ;tx2q = second-pass entry point (type2)
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1 ;eob == 0 falls through to the code that follows this macro invocation
+%else
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1 ;the jmp is emitted only when %%end - %%p1 is negative, i.e. not when %%p1 starts exactly at %%end
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2 -- 4x4 entry point; for dct_dct the code below is the path INV_TXFM_FN leaves for eob == 0
+ INV_TXFM_FN %1, %2, 4x4, 6
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000 ;replicate coefficient 0 over the low four words
+ punpcklqdq m0, m0 ;... and over all eight words
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1 ;(x*2896 + 2048) >> 12
+ mov [coeffq], eobd ;0
+ pmulhrsw m0, m1 ;same scaling a second time
+ mova m1, m0 ;m0 = m1 = the value added to every pixel before ITX4_END's rounding
+ TAIL_CALL m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+; itx16 relies on dct_dct being the first function. If you change the order, adjust `itx8_start` in itx16.
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+;first pass (function entry) and second pass (.pass2) of the 4x4 DCT; the first pass ends with jmp tx2q
+cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0] ;high: in1 ;low: in0
+ mova m1, [coeffq+16*1] ;high: in3 ;low: in2
+
+ IDCT4_1D_PACKED
+;transpose: m0 = out0/out1, m1 = out3/out2 (low/high) become the packed inputs of the second pass
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331 ;dwords 1 and 3 of m0, then dwords 3 and 1 of m1
+ shufps m0, m1, q0220 ;dwords 0 and 2 of m0, then dwords 2 and 0 of m1
+ pshufb m0, m2 ;high: in1 ;low: in0
+ pshufb m1, m3, m2 ;high: in3 ;low: in2
+ jmp tx2q
+
+.pass2:
+ IDCT4_1D_PACKED
+
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2 ;memset(coeff, 0, sizeof(*coeff) * sh * sw);
+;m1 holds out3 (low) / out2 (high), hence the row order 0, 1, 3, 2
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+;first pass (function entry) and second pass (.pass2) of the 4x4 ADST; .end/.end2 are also entered from other 4x4 code in this file
+cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ call .main
+ punpckhwd m2, m0, m1 ;4x4 word transpose of m0 (out0/out1) and m1 (out2/out3)
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2 ;high: in3 ;low: in2
+ punpcklwd m0, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ call .main
+
+.end: ;clear the 16 coefficients, then write out
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+
+.end2: ;m0 = rows 0/1, m1 = rows 2/3 (low/high)
+ ITX4_END 0, 1, 2, 3
+
+ALIGN function_align
+cglobal_label .main ;4-point ADST; in: m0 = in0/in1, m1 = in2/in3 (low/high); out: m0 = out0/out1, m1 = out2/out3; clobbers m2-m5
+ punpcklwd m2, m0, m1 ;unpacked in0 in2
+ punpckhwd m0, m1 ;unpacked in1 in3
+ mova m3, m0
+ pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
+ pmaddwd m0, [o(pw_0_3344)] ;3344 * in3
+ paddd m1, m0 ;t2
+ pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
+ paddd m4, m0 ;t0 + t3
+ pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+ mova m0, [o(pd_2048)]
+ paddd m1, m0 ;t2 + 2048
+ paddd m2, m0
+ paddd m0, m4 ;t0 + t3 + 2048
+ paddd m5, m2 ;t1 + t3 + 2048
+ paddd m2, m4
+ paddd m2, m3 ;t0 + t1 - t3 + 2048
+ REPX {psrad x, 12}, m1, m0, m5, m2
+ packssdw m0, m5 ;high: out1 ;low: out0
+ packssdw m1, m2 ;high: out3 ;low: out2
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+;4x4 flipped ADST: reuses iadst_4x4 .main and reverses the order of its four outputs
+cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0 ;interleave out2 with out0
+ punpckhwd m1, m0 ;interleave out3 with out1
+ punpcklwd m0, m1, m2 ;high: in1 ;low: in0 (each holding out3, out2, out1, out0 of one first-pass transform)
+ punpckhwd m1, m2 ;high: in3 ;low: in2
+ jmp tx2q
+
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+
+.end: ;clear the 16 coefficients, then write out
+ pxor m2, m2
+ mova [coeffq+16*0], m2
+ mova [coeffq+16*1], m2
+
+.end2: ;m0 = out0/out1 go to rows 3/2, m1 = out2/out3 go to rows 1/0
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+;4x4 identity transform: both passes compute x + ((x*1697 + 2048) >> 12) with saturation
+cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m3, [o(pw_1697x8)]
+ pmulhrsw m2, m0, m3 ;(x*1697 + 2048) >> 12
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1 ;4x4 word transpose, same sequence as in iadst_4x4 pass 1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2 ;high: in3 ;low: in2
+ punpcklwd m0, m2 ;high: in1 ;low: in0
+ jmp tx2q
+
+.pass2:
+ mova m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end ;clear the coefficients and write rows 0..3
+
+%macro IWHT4_1D_PACKED 0 ;one packed 1-D pass of the 4-point inverse WHT on m0/m1; results are left in m0 (high), m1 and m2 (low) as noted below; clobbers m3
+ punpckhqdq m3, m0, m1 ;low: in1 high: in3
+ punpcklqdq m0, m1 ;low: in0 high: in2
+ psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3
+ paddw m0, m3 ;low: in0 + in1 high: in2 + in3
+ punpckhqdq m2, m2 ;t2 t2
+ punpcklqdq m0, m0 ;t0 t0
+ psubw m1, m0, m2
+ psraw m1, 1 ;t4 t4
+ psubw m1, m3 ;low: t1/out2 high: t3/out1
+ psubw m0, m1 ;high: out0
+ paddw m2, m1 ;low: out3
+%endmacro
+
+INIT_XMM sse2
+cglobal inv_txfm_add_wht_wht_4x4_8bpc, 3, 3, 4, dst, stride, coeff ;4x4 inverse WHT added to dst; takes no eob argument
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ pxor m2, m2
+ mova [coeffq+16*0], m2 ;clear the coefficients right after loading them
+ mova [coeffq+16*1], m2
+ psraw m0, 2 ;input >> 2
+ psraw m1, 2
+ IWHT4_1D_PACKED
+ punpckhwd m0, m1 ;regroup the outputs of the first pass as input of the second (presumably a 4x4 transpose -- confirm)
+ punpcklwd m3, m1, m2
+ punpckhdq m1, m0, m3
+ punpckldq m0, m3
+ IWHT4_1D_PACKED
+ shufpd m0, m2, 0x01 ;m0 = out0 (low) / out3 (high); m1 already holds out2 / out1
+ ITX4_END 0, 3, 2, 1, 0 ;rnd = 0: no scaling before the add
+
+%macro IDCT8_1D_PACKED 0 ;8-point inverse DCT, two inputs per register; in (low/high): m0 = in0/in1, m1 = in2/in3, m2 = in4/in5, m3 = in6/in7; clobbers m4 and m6 (pd_2048)
+ mova m6, [o(pd_2048)]
+ punpckhwd m4, m0, m3 ;unpacked in1 in7
+ punpcklwd m0, m2 ;unpacked in0 in4
+ punpckhwd m2, m1 ;unpacked in5 in3
+ punpcklwd m1, m3 ;unpacked in2 in6
+ ITX_MUL2X_PACK 4, 3, 6, 799, 4017 ;low: t7a high: t4a
+ ITX_MUL2X_PACK 2, 3, 6, 3406, 2276 ;low: t6a high: t5a
+ ITX_MUL2X_PACK 1, 3, 6, 1567, 3784 ;low: t3 high: t2
+ psubsw m3, m4, m2 ;low: t6a high: t5a
+ paddsw m4, m2 ;low: t7 high: t4
+ pshufb m3, [o(deint_shuf1)] ;interleave the t6a and t5a words for the multiply below
+ ITX_MUL2X_PACK 0, 2, 6, 2896, 2896 ;low: t0 high: t1
+ ITX_MUL2X_PACK 3, 2, 6, 2896, 2896 ;low: t6 high: t5
+ psubsw m2, m0, m1 ;low: tmp3 high: tmp2
+ paddsw m0, m1 ;low: tmp0 high: tmp1
+ punpcklqdq m1, m4, m3 ;low: t7 high: t6
+ punpckhqdq m4, m3 ;low: t4 high: t5
+ psubsw m3, m0, m1 ;low: out7 high: out6
+ paddsw m0, m1 ;low: out0 high: out1
+ paddsw m1, m2, m4 ;low: out3 high: out2
+ psubsw m2, m4 ;low: out4 high: out5
+%endmacro
+
+;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7-8 0 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2_in_tmp1 -- dst1 is returned in m(src1); dst2 in m(src2), or in m(tmp1) when dst2_in_tmp1 is set
+ punpckhwd m%4, m%1, m%2 ;(src1, src2) word pairs of the high half
+ punpcklwd m%1, m%2 ;(src1, src2) word pairs of the low half
+%if %7 < 8 ;coef values below 8 are register numbers holding the word-pair constants
+ pmaddwd m%2, m%7, m%1
+ pmaddwd m%3, m%7, m%4
+%else
+ mova m%2, [o(pw_%7_%6)] ;{coef2, coef1}
+%if %8
+ pmaddwd m%3, m%1, m%2
+ pmaddwd m%2, m%4
+%else
+ pmaddwd m%3, m%4, m%2
+ pmaddwd m%2, m%1
+%endif
+%endif
+ paddd m%3, m%5
+ paddd m%2, m%5
+ psrad m%3, 12
+ psrad m%2, 12
+%if %8
+ packssdw m%3, m%2 ;dst2 kept in tmp1, which leaves m(src2) free as scratch below
+%else
+ packssdw m%2, m%3 ;dst2
+%endif
+%if %7 < 8
+ pmaddwd m%4, m%6
+ pmaddwd m%1, m%6
+%elif %8
+ mova m%2, [o(pw_%6_m%7)] ;{coef1, -coef2}
+ pmaddwd m%4, m%2
+ pmaddwd m%1, m%2
+%else
+ mova m%3, [o(pw_%6_m%7)] ;{coef1, -coef2}
+ pmaddwd m%4, m%3
+ pmaddwd m%1, m%3
+%endif
+ paddd m%4, m%5
+ paddd m%1, m%5
+ psrad m%4, 12
+ psrad m%1, 12
+ packssdw m%1, m%4 ;dst1
+%endmacro
+
+%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048 -- 4-point inverse DCT on full registers: in0..in3 in m(src1..4) are replaced by out0..out3; both tmp registers are clobbered
+ ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784, 1 ;t2, t3
+ ITX_MULSUB_2W %1, %3, %4, %6, %7, 2896, 2896, 1 ;t1, t0
+ psubsw m%3, m%1, m%2 ;out2
+ paddsw m%2, m%1 ;out1
+ paddsw m%1, m%5, m%4 ;out0
+ psubsw m%4, m%5 ;out3
+%endmacro
+
+%macro WRITE_4X8 4 ;row[1-4] -- adds m0-m3 (two 4-wide rows each) to eight dst rows; clobbers m4-m6 and r2, advances dstq by 4 rows
+ WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4
+%endmacro
+
+%macro INV_4X8 0 ;transposes the four 8-word registers m0-m3 into the packed two-inputs-per-register layout used by the 4x8 second pass; clobbers m4
+ punpckhwd m4, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhdq m1, m0, m2 ;low: in2 high: in3
+ punpckldq m0, m2 ;low: in0 high: in1
+ punpckldq m2, m3, m4 ;low: in4 high: in5
+ punpckhdq m3, m4 ;low: in6 high: in7
+%endmacro
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2 -- 4x8 entry point; for dct_dct the code below is the path INV_TXFM_FN leaves for eob == 0
+ INV_TXFM_FN %1, %2, 4x8, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000 ;replicate coefficient 0 over the low four words
+ punpcklqdq m0, m0 ;... and over all eight words
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1 ;(x*2896 + 2048) >> 12
+ mov [coeffq], eobd ;eob is 0 on this path, so this clears the first two coefficients
+ pmulhrsw m0, m1
+ pmulhrsw m0, m1 ;three scalings by 2896/4096 in total
+ pmulhrsw m0, [o(pw_2048)] ;(x + 8) >> 4
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ TAIL_CALL m(iadst_4x8_internal_8bpc).end3 ;m0-m3 all hold the same value; write 8 rows
+%endif
+%endmacro
+
+INIT_XMM ssse3
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
+;4x8 DCT: the first pass is the 4-point DCT (idct_8x4 .main), the second pass (.pass2) the packed 8-point DCT (.main)
+cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0] ;every coefficient is first scaled: (x*2896 + 2048) >> 12
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(idct_8x4_internal_8bpc).main
+ jmp m(iadst_4x8_internal_8bpc).pass1_end ;INV_4X8 transpose, then jmp tx2q
+
+.pass2:
+ call .main
+ shufps m1, m1, q1032 ;swap halves: out3/out2 -> out2/out3
+ shufps m3, m3, q1032 ;swap halves: out7/out6 -> out6/out7
+ mova m4, [o(pw_2048)] ;scale factor consumed at .end2
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+ALIGN function_align
+cglobal_label .main ;packed 8-point inverse DCT, see IDCT8_1D_PACKED for the register layout
+ IDCT8_1D_PACKED
+ ret
+
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+;4x8 ADST: the first pass is the 4-point ADST (iadst_8x4 .main), the second pass (.pass2) the packed 8-point ADST (.main); .pass1_end/.end/.end2/.end3 are shared with the other 4x8 functions
+cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0] ;every coefficient is first scaled: (x*2896 + 2048) >> 12
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(iadst_8x4_internal_8bpc).main
+
+.pass1_end: ;transpose m0-m3 and continue with the second pass selected by tx2q
+ INV_4X8
+ jmp tx2q
+
+.pass2:
+ shufps m0, m0, q1032 ;swap halves: .main expects in0 and in2 in the high halves of m0 and m1
+ shufps m1, m1, q1032
+ call .main
+ mova m4, [o(pw_2048)]
+ pxor m5, m5
+ psubw m5, m4 ;m5 = -2048
+
+.end: ;in: m4/m5 = scale factors for the low/high halves
+ punpcklqdq m4, m5 ;low: +2048 high: -2048, which undoes the negation of the odd outputs of .main
+
+.end2: ;in: m0-m3 = rows 0/1, 2/3, 4/5, 6/7 (low/high), m4 = pmulhrsw scale factor
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ pxor m5, m5
+ mova [coeffq+16*0], m5 ;clear all 32 coefficients
+ mova [coeffq+16*1], m5
+ mova [coeffq+16*2], m5
+ mova [coeffq+16*3], m5
+
+.end3: ;add m0-m3 to dst and return
+ WRITE_4X8 0, 1, 2, 3
+ RET
+
+ALIGN function_align
+cglobal_label .main ;packed 8-point ADST; in: high halves of m0-m3 = in0, in2, in5, in7, low halves = in1, in3, in4, in6; out (low/high): m0 = out0/-out1, m1 = out2/-out3, m2 = out4/-out5, m3 = out6/-out7; clobbers m4-m7
+ mova m6, [o(pd_2048)]
+ punpckhwd m4, m3, m0 ;unpacked in7 in0
+ punpckhwd m5, m2, m1 ;unpacked in5 in2
+ punpcklwd m1, m2 ;unpacked in3 in4
+ punpcklwd m0, m3 ;unpacked in1 in6
+ ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
+ ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
+ ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
+ ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
+
+ psubsw m3, m4, m1 ;low: t4 high: t5
+ paddsw m4, m1 ;low: t0 high: t1
+ psubsw m2, m5, m0 ;low: t6 high: t7
+ paddsw m5, m0 ;low: t2 high: t3
+
+ shufps m1, m3, m2, q1032
+ punpckhwd m2, m1
+ punpcklwd m3, m1
+ ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
+ ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
+
+ psubsw m1, m4, m5 ;low: t2 high: t3
+ paddsw m4, m5 ;low: out0 high: -out7
+ psubsw m5, m3, m2 ;low: t7 high: t6
+ paddsw m3, m2 ;low: out6 high: -out1
+ shufps m0, m4, m3, q3210 ;low: out0 high: -out1
+ shufps m3, m4, q3210 ;low: out6 high: -out7
+
+ mova m2, [o(pw_2896_m2896)]
+ mova m7, [o(pw_2896_2896)]
+ shufps m4, m1, m5, q1032 ;low: t3 high: t7
+ shufps m1, m5, q3210 ;low: t2 high: t6
+ punpcklwd m5, m1, m4
+ punpckhwd m1, m4
+ pmaddwd m4, m2, m1 ;-out5
+ pmaddwd m2, m5 ; out4
+ pmaddwd m1, m7 ; out2
+ pmaddwd m5, m7 ;-out3
+ REPX {paddd x, m6}, m4, m2, m1, m5
+ REPX {psrad x, 12}, m4, m2, m1, m5
+ packssdw m1, m5 ;low: out2 high: -out3
+ packssdw m2, m4 ;low: out4 high: -out5
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+;4x8 flipped ADST: reuses the ADST .main routines and reverses the order of their outputs
+cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0] ;every coefficient is first scaled: (x*2896 + 2048) >> 12
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1:
+ call m(iadst_8x4_internal_8bpc).main
+;transpose as INV_4X8 does, but interleaving out3..out0 in reverse order
+ punpcklwd m4, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m5, m1, m0
+ punpckhwd m1, m0
+ punpckldq m2, m3, m1 ;low: in4 high: in5
+ punpckhdq m3, m1 ;low: in6 high: in7
+ punpckldq m0, m4, m5 ;low: in0 high: in1
+ punpckhdq m1, m4, m5 ;low: in2 high: in3
+ jmp tx2q
+
+.pass2:
+ shufps m0, m0, q1032 ;swap halves, as in iadst_4x8 .pass2
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+;reverse the register order and swap the halves of each register, so that rows come out as out7 .. out0
+ mova m4, m0
+ mova m5, m1
+ pshufd m0, m3, q1032 ;low: -out7 high: out6
+ pshufd m1, m2, q1032 ;low: -out5 high: out4
+ pshufd m2, m5, q1032 ;low: -out3 high: out2
+ pshufd m3, m4, q1032 ;low: -out1 high: out0
+ mova m5, [o(pw_2048)]
+ pxor m4, m4
+ psubw m4, m5 ;m4 = -2048 for the (negated) low halves, m5 = +2048 for the high halves
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+;4x8 identity transform
+cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0] ;every coefficient is first scaled: (x*2896 + 2048) >> 12
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+.pass1: ;x + ((x*1697 + 2048) >> 12), same arithmetic as the 4x4 identity
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+
+.pass2: ;no arithmetic of its own: only the final scale factor differs from the other 4x8 second passes
+ mova m4, [o(pw_4096)] ;pmulhrsw by 4096 is (x + 4) >> 3
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+
+%macro WRITE_8X2 5 ;coefs[1-2], tmp[1-3] -- adds two rows of eight 16-bit values to the 8-pixel rows at dstq and dstq+strideq
+ movq m%3, [dstq ]
+ movq m%4, [dstq+strideq]
+ pxor m%5, m%5
+ punpcklbw m%3, m%5 ;extend byte to word
+ punpcklbw m%4, m%5 ;extend byte to word
+%ifnum %1 ;a numeric argument is a register number ...
+ paddw m%3, m%1
+%else ;... anything else is used verbatim as the operand (e.g. a stack slot)
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4 ;saturate to bytes: low qword = first row, high qword = second row
+ movq [dstq ], m%3
+ punpckhqdq m%3, m%3
+ movq [dstq+strideq], m%3
+%endmacro
+
+%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3] -- adds four 8-wide rows to dst; dstq is left pointing at the third of the four rows
+ WRITE_8X2 %1, %2, %5, %6, %7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X2 %3, %4, %5, %6, %7
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2 -- 8x4 entry point; for dct_dct the code below is the path INV_TXFM_FN leaves for eob == 0
+ INV_TXFM_FN %1, %2, 8x4, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000 ;replicate coefficient 0 over the low four words
+ punpcklqdq m0, m0 ;... and over all eight words
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1 ;(x*2896 + 2048) >> 12
+ pmulhrsw m0, m1
+ mova m2, [o(pw_2048)]
+ pmulhrsw m0, m1 ;three scalings by 2896/4096 in total
+ pmulhrsw m0, m2 ;(x + 8) >> 4
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ TAIL_CALL m(iadst_8x4_internal_8bpc).end2 ;.end2 clears the coefficients and writes the four rows
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+;8x4 DCT: the first pass is the packed 8-point DCT (idct_4x8 .main), the second pass (.pass2) the 4-point DCT (.main)
+cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0] ;every coefficient is first scaled: (x*2896 + 2048) >> 12
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ call m(idct_4x8_internal_8bpc).main
+;rearrange the packed outputs into the four 8-word inputs of the second pass
+ mova m4, [o(deint_shuf1)]
+ mova m5, [o(deint_shuf2)]
+ pshufb m0, m4
+ pshufb m1, m5
+ pshufb m2, m4
+ pshufb m3, m5
+ punpckhdq m4, m0, m1
+ punpckldq m0, m1
+ punpckhdq m5, m2, m3
+ punpckldq m2, m3
+ punpckhqdq m1, m0, m2 ;in1
+ punpcklqdq m0, m2 ;in0
+ punpckhqdq m3, m4, m5 ;in3
+ punpcklqdq m2 ,m4, m5 ;in2
+ jmp tx2q
+
+.pass2:
+ call .main
+ jmp m(iadst_8x4_internal_8bpc).end ;scale, clear the coefficients, write out
+
+ALIGN function_align
+cglobal_label .main ;4-point inverse DCT on full registers: m0-m3 = in0..in3 -> out0..out3; clobbers m4-m6
+ mova m6, [o(pd_2048)]
+ IDCT4_1D 0, 1, 2, 3, 4, 5, 6
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+;8x4 ADST: the first pass is the packed 8-point ADST (iadst_4x8 .main), the second pass (.pass2) the 4-point ADST (.main); .end/.end2/.end3 are shared with the other 8x4 functions
+cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0] ;every coefficient is first scaled: (x*2896 + 2048) >> 12
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ shufps m0, m0, q1032 ;swap halves into the layout iadst_4x8 .main expects
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+;transpose; the odd outputs come back negated from .main and are negated again here
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ pxor m5, m5
+ psubsw m3, m5, m1 ;0 - (-out5, -out7)
+ psubsw m5, m4 ;0 - (-out1, -out3)
+ punpckhdq m4, m5, m3
+ punpckldq m5, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhwd m1, m0, m5 ;in1
+ punpcklwd m0, m5 ;in0
+ punpcklwd m2, m3, m4 ;in2
+ punpckhwd m3, m4 ;in3
+ jmp tx2q
+
+.pass2:
+ call .main
+
+.end: ;in: m0-m3 = the four output rows before the final rounding
+ mova m4, [o(pw_2048)]
+ pmulhrsw m0, m4 ;(x + 8) >> 4
+ pmulhrsw m1, m4
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+
+.end2: ;clear all 32 coefficients
+ pxor m6, m6
+ mova [coeffq+16*0], m6
+ mova [coeffq+16*1], m6
+ mova [coeffq+16*2], m6
+ mova [coeffq+16*3], m6
+.end3: ;add the rows to dst and return
+ WRITE_8X4 0, 1, 2, 3, 4, 5, 6
+ RET
+
+ALIGN function_align
+cglobal_label .main ;4-point ADST on full registers: m0-m3 = in0..in3 -> out0..out3; clobbers m4-m7. The high halves (kept in m6/m7) and low halves are processed one after the other
+ punpckhwd m6, m0, m2 ;unpacked in0 in2
+ punpcklwd m0, m2 ;unpacked in0 in2
+ punpckhwd m7, m1, m3 ;unpacked in1 in3
+ punpcklwd m1, m3 ;unpacked in1 in3
+
+ mova m2, [o(pw_3344_m3344)]
+ mova m4, [o(pw_0_3344)]
+ pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2
+ pmaddwd m5, m4, m7 ;3344 * in3
+ pmaddwd m2, m0
+ pmaddwd m4, m1
+ paddd m3, m5
+ paddd m2, m4
+ mova m4, [o(pd_2048)]
+ paddd m3, m4 ;t2 + 2048
+ paddd m2, m4
+ psrad m3, 12
+ psrad m2, 12
+ packssdw m2, m3 ;out2
+;out0, out1, out3 of the low halves
+ pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
+ paddd m3, m4 ;t0 + t3
+
+ pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+ mova m4, [o(pd_2048)]
+ paddd m0, m4
+ paddd m4, m3 ;t0 + t3 + 2048
+ paddd m5, m0 ;t1 + t3 + 2048
+ paddd m3, m0
+ paddd m3, m1 ;t0 + t1 - t3 + 2048
+
+ psrad m4, 12 ;out0
+ psrad m5, 12 ;out1
+ psrad m3, 12 ;out3
+ packssdw m0, m4, m5 ;low: out0 high: out1
+;out0, out1, out3 of the high halves
+ pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+ pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
+ pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+ pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
+ paddd m1, m4 ;t0 + t3
+ pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
+
+ mova m4, [o(pd_2048)]
+ paddd m6, m4
+ paddd m4, m1 ;t0 + t3 + 2048
+ paddd m5, m6 ;t1 + t3 + 2048
+ paddd m1, m6
+ paddd m1, m7 ;t0 + t1 - t3 + 2048
+
+ psrad m4, 12 ;out0
+ psrad m5, 12 ;out1
+ psrad m1, 12 ;out3
+ packssdw m3, m1 ;out3
+ packssdw m4, m5 ;low: out0 high: out1
+
+ punpckhqdq m1, m0, m4 ;out1
+ punpcklqdq m0, m4 ;out0
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+;8x4 flipped ADST: reuses the ADST .main routines and reverses the order of their outputs
+cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0] ;every coefficient is first scaled: (x*2896 + 2048) >> 12
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+
+ shufps m0, m0, q1032 ;swap halves into the layout iadst_4x8 .main expects
+ shufps m1, m1, q1032
+ call m(iadst_4x8_internal_8bpc).main
+;transpose with the outputs taken in the order out7 .. out0
+ punpckhwd m5, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+
+ pxor m0, m0
+ psubsw m4, m0, m2 ;0 - (-out3, -out1): undo the negation done by .main
+ psubsw m0, m5 ;0 - (-out7, -out5)
+ punpckhdq m2, m0, m4
+ punpckldq m0, m4
+ punpckhdq m4, m3, m1
+ punpckldq m3, m1
+ punpckhwd m1, m0, m3 ;in1
+ punpcklwd m0, m3 ;in0
+ punpckhwd m3, m2, m4 ;in3
+ punpcklwd m2, m4 ;in2
+ jmp tx2q
+
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m4, m0 ;reverse the row order: m0 <-> m3, m1 <-> m2
+ mova m5, m1
+ mova m0, m3
+ mova m1, m2
+ mova m2, m5
+ mova m3, m4
+ jmp m(iadst_8x4_internal_8bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+;8x4 identity transform
+cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mova m3, [o(pw_2896x8)]
+ pmulhrsw m0, m3, [coeffq+16*0] ;every coefficient is first scaled: (x*2896 + 2048) >> 12
+ pmulhrsw m1, m3, [coeffq+16*1]
+ pmulhrsw m2, m3, [coeffq+16*2]
+ pmulhrsw m3, [coeffq+16*3]
+ paddsw m0, m0 ;first pass: x * 2 with saturation
+ paddsw m1, m1
+ paddsw m2, m2
+ paddsw m3, m3
+;transpose into the four 8-word inputs of the second pass
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m5, m4, m1
+ punpckldq m4, m1
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckhwd m1, m0, m4 ;in1
+ punpcklwd m0, m4 ;in0
+ punpcklwd m2, m3, m5 ;in2
+ punpckhwd m3, m5 ;in3
+ jmp tx2q
+
+.pass2: ;x + ((x*1697 + 2048) >> 12), same arithmetic as the 4x4 identity
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(iadst_8x4_internal_8bpc).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2 -- 8x8 entry point (8 xmm registers, 16*4 bytes of stack); for dct_dct the code below is the path INV_TXFM_FN leaves for eob == 0
+ INV_TXFM_FN %1, %2, 8x8, 8, 16*4
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000 ;replicate coefficient 0 over the low four words
+ punpcklwd m0, m0 ;... and over all eight words
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1 ;(x*2896 + 2048) >> 12
+ mova m2, [o(pw_16384)]
+ mov [coeffq], eobd ;eob is 0 on this path, so this clears the first two coefficients
+ pmulhrsw m0, m2 ;(x + 1) >> 1
+ psrlw m2, 3 ;16384 >> 3 = 2048
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2 ;(x + 8) >> 4
+.end: ;in: m0 = value to add to every pixel of the 8x8 block
+ mov r3d, 2 ;two iterations of four rows each
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)] ;continuation taken after the loop
+.loop:
+ WRITE_8X4 0, 0, 0, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*2] ;WRITE_8X4 advanced dstq by two rows, this adds the other two
+ dec r3d
+ jg .loop
+ jmp tx2q
+.end3:
+ RET
+%endif
+%endmacro
+
+%macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 -- loads m0-m7 from [src + stride*0..7]; with is_rect2 every value is scaled by pmulhrsw with pw_2896x8 while loading
+%if %3
+ mova m7, [o(pw_2896x8)]
+ pmulhrsw m0, m7, [%1+%2*0] ;(x*2896 + 2048) >> 12
+ pmulhrsw m1, m7, [%1+%2*1]
+ pmulhrsw m2, m7, [%1+%2*2]
+ pmulhrsw m3, m7, [%1+%2*3]
+ pmulhrsw m4, m7, [%1+%2*4]
+ pmulhrsw m5, m7, [%1+%2*5]
+ pmulhrsw m6, m7, [%1+%2*6]
+ pmulhrsw m7, [%1+%2*7] ;m7 is loaded last because it held the constant
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+ mova m7, [%1+%2*7]
+%endif
+%endmacro
+
+%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048 -- odd half of the 8-point inverse DCT: in1, in3, in5, in7 in m(src1..4) are replaced by t4, t5, t6, t7; both tmp registers are clobbered
+ ITX_MULSUB_2W %1, %4, %5, %6, %7, 799, 4017 ;t4a, t7a
+ ITX_MULSUB_2W %3, %2, %5, %6, %7, 3406, 2276, 1 ;t5a, t6a
+ psubsw m%2, m%4, m%5 ;t6a
+ paddsw m%4, m%5 ;t7
+ psubsw m%5, m%1, m%3 ;t5a
+ paddsw m%1, m%3 ;t4
+ ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
+;8x8 DCT; the .pass1_end*/.end* labels are also entered from iadst_8x8 in this file. Three 16-byte stack slots above the return address are used as scratch
+cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call .main
+
+.pass1_end:
+ mova m7, [o(pw_16384)] ;pmulhrsw by 16384 is (x + 1) >> 1
+
+.pass1_end1:
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+
+.pass1_end2: ;in: m7 = scale factor for the odd rows; row 7 is still in stack slot 0
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize+16*0]
+
+cglobal_label .pass1_end3 ;8x8 word transpose of rows 0-5 in m0-m5, row 6 in stack slot 1 and row 7 in m7
+ punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53
+ punpckhwd m1, m5 ;14 54 15 55 16 56 17 57
+ punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47
+ punpcklwd m0, m4 ;00 40 01 41 02 42 03 43
+ punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77
+ punpcklwd m3, m7 ;30 70 31 71 32 72 33 73
+ punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77
+ punpcklwd m1, m4 ;14 34 54 74 15 35 55 75
+ punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73
+ punpcklwd m6, m3 ;10 30 50 70 11 31 51 71
+ mova [rsp+gprsize+16*2], m6
+ mova m6, [rsp+gprsize+16*1]
+ punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67
+ punpcklwd m2, m6 ;20 60 21 61 22 62 23 63
+ punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67
+ punpcklwd m5, m3 ;04 24 44 64 05 25 45 65
+ punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63
+ punpcklwd m0, m2 ;00 20 40 60 01 21 41 61
+
+ punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77
+ punpcklwd m6, m7 ;06 16 26 36 46 56 66 76
+ mova [rsp+gprsize+16*0], m2
+ punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72
+ punpckhwd m3, m4 ;03 13 23 33 43 53 63 73
+ punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74
+ punpckhwd m5, m1 ;05 15 25 35 45 55 65 75
+ mova m7, [rsp+gprsize+16*2]
+ punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71
+ punpcklwd m0, m7 ;00 10 20 30 40 50 60 70
+ mova m7, [rsp+gprsize+16*0]
+ jmp tx2q
+
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] ;continuation taken after the rows are written
+
+.pass2_main:
+ call .main
+
+.end:
+ mova m7, [o(pw_2048)] ;pmulhrsw by 2048 is (x + 8) >> 4
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+
+.end2: ;in: m7 = scale factor for the odd rows; row 7 is still in stack slot 0
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize+16*0]
+ mova [rsp+gprsize+16*2], m5
+ mova [rsp+gprsize+16*0], m7
+
+.end3: ;rows 0-4 in m0-m4, rows 5, 6, 7 in stack slots 2, 1, 0
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
+ jmp tx2q
+
+.end4: ;clear all 64 coefficients
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ ret
+
+ALIGN function_align
+cglobal_label .main ;8-point inverse DCT: m0-m7 = in0..in7 -> m0-m6 = out0..out6, out7 in stack slot 0. The slots are addressed with gprsize*2 here (presumably because of the extra return address pushed by the call to .main -- confirm)
+ mova [rsp+gprsize*2+16*0], m7
+ mova [rsp+gprsize*2+16*1], m3
+ mova [rsp+gprsize*2+16*2], m1
+ mova m7, [o(pd_2048)]
+ IDCT4_1D 0, 2, 4, 6, 1, 3, 7 ;even half: in0, in2, in4, in6 -> m0, m2, m4, m6
+ mova m3, [rsp+gprsize*2+16*2] ;in1
+ mova [rsp+gprsize*2+16*2], m2
+ mova m2, [rsp+gprsize*2+16*1] ;in3
+ mova [rsp+gprsize*2+16*1], m4
+ mova m4, [rsp+gprsize*2+16*0] ;in7
+ mova [rsp+gprsize*2+16*0], m6
+ IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7 ;m3 = t4, m2 = t5, m5 = t6, m4 = t7
+ mova m6, [rsp+gprsize*2+16*0]
+ psubsw m7, m0, m4 ;out7
+ paddsw m0, m4 ;out0
+ mova [rsp+gprsize*2+16*0], m7
+ mova m1, [rsp+gprsize*2+16*2]
+ psubsw m4, m6, m3 ;out4
+ paddsw m3, m6 ;out3
+ mova m7, [rsp+gprsize*2+16*1]
+ psubsw m6, m1, m5 ;out6
+ paddsw m1, m5 ;out1
+ psubsw m5, m7, m2 ;out5
+ paddsw m2, m7 ;out2
+ ret
+
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+;8x8 ADST; .main leaves the odd outputs negated, which the negated scale factor built at .pass1_end1 / .end compensates for
+cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call .main
+ call .main_pass1_end
+
+.pass1_end:
+ mova m7, [o(pw_16384)] ;pmulhrsw by 16384 is (x + 1) >> 1
+
+.pass1_end1:
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6 ;m7 = -m7: scale factor for the negated odd outputs
+ jmp m(idct_8x8_internal_8bpc).pass1_end2
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] ;continuation taken after the rows are written
+
+.pass2_main:
+ call .main
+ call .main_pass2_end
+
+.end:
+ mova m7, [o(pw_2048)] ;pmulhrsw by 2048 is (x + 8) >> 4
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*1], m6
+ pxor m6, m6
+ psubw m6, m7
+ mova m7, m6 ;m7 = -m7: scale factor for the negated odd outputs
+ jmp m(idct_8x8_internal_8bpc).end2
+
+ALIGN function_align
+cglobal_label .main ;first part of the 8-point ADST: m0-m7 = in0..in7; out: m0 = out0, m1 = -out1, m6 = out6, stack slot 0 = -out7, and t2, t3, t6, t7 in m4, m3, m5, m2 for one of the .main_pass*_end routines
+ mova [rsp+gprsize*2+16*0], m7
+ mova [rsp+gprsize*2+16*1], m3
+ mova [rsp+gprsize*2+16*2], m4
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
+ ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
+ paddsw m3, m2, m6 ;t2
+ psubsw m2, m6 ;t6
+ paddsw m4, m5, m1 ;t3
+ psubsw m5, m1 ;t7
+ ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a
+
+ mova m6, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m5
+ mova m1, [rsp+gprsize*2+16*1]
+ mova [rsp+gprsize*2+16*1], m2
+ mova m5, [rsp+gprsize*2+16*0]
+ mova [rsp+gprsize*2+16*0], m3
+ ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a
+ ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
+ psubsw m2, m0, m6 ;t4
+ paddsw m0, m6 ;t0
+ paddsw m3, m5, m1 ;t1
+ psubsw m5, m1 ;t5
+ ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a
+
+ mova m7, [rsp+gprsize*2+16*0]
+ paddsw m1, m3, m4 ;-out7
+ psubsw m3, m4 ;t3
+ mova [rsp+gprsize*2+16*0], m1
+ psubsw m4, m0, m7 ;t2
+ paddsw m0, m7 ;out0
+ mova m6, [rsp+gprsize*2+16*2]
+ mova m7, [rsp+gprsize*2+16*1]
+ paddsw m1, m5, m6 ;-out1
+ psubsw m5, m6 ;t6
+ paddsw m6, m2, m7 ;out6
+ psubsw m2, m7 ;t7
+ ret
+ALIGN function_align
+.main_pass1_end: ;final ADST stage for pass 1, using 32-bit pmaddwd products; in: t2, t3, t6, t7 in m4, m3, m5, m2; m1 and m6 are saved and restored
+ mova [rsp+gprsize*2+16*1], m1
+ mova [rsp+gprsize*2+16*2], m6
+ punpckhwd m1, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m7, m5, m2
+ punpcklwd m5, m2
+ mova m2, [o(pw_2896_2896)]
+ mova m6, [o(pd_2048)]
+ pmaddwd m3, m2, m7
+ pmaddwd m2, m5
+ paddd m3, m6
+ paddd m2, m6
+ psrad m3, 12
+ psrad m2, 12
+ packssdw m2, m3 ;out2
+ mova m3, [o(pw_2896_m2896)]
+ pmaddwd m7, m3
+ pmaddwd m5, m3
+ paddd m7, m6
+ paddd m5, m6
+ psrad m7, 12
+ psrad m5, 12
+ packssdw m5, m7 ;-out5
+ mova m3, [o(pw_2896_2896)]
+ pmaddwd m7, m3, m1
+ pmaddwd m3, m4
+ paddd m7, m6
+ paddd m3, m6
+ psrad m7, 12
+ psrad m3, 12
+ packssdw m3, m7 ;-out3
+ mova m7, [o(pw_2896_m2896)]
+ pmaddwd m1, m7
+ pmaddwd m4, m7
+ paddd m1, m6
+ paddd m4, m6
+ psrad m1, 12
+ psrad m4, 12
+ packssdw m4, m1 ;out4
+ mova m1, [rsp+gprsize*2+16*1]
+ mova m6, [rsp+gprsize*2+16*2]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end ;final ADST stage for pass 2, using 16-bit saturating adds and pmulhrsw; in: t2, t3, t6, t7 in m4, m3, m5, m2
+ paddsw m7, m4, m3 ;t2 + t3
+ psubsw m4, m3 ;t2 - t3
+ paddsw m3, m5, m2 ;t6 + t7
+ psubsw m5, m2 ;t6 - t7
+ mova m2, [o(pw_2896x8)]
+ pmulhrsw m4, m2 ;out4
+ pmulhrsw m5, m2 ;-out5
+ pmulhrsw m7, m2 ;-out3
+ pmulhrsw m2, m3 ;out2
+ mova m3, m7
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct ; INV_TXFM_8X8_FN is defined outside this view; presumably emits one entry point per type pair
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 8x8 flipped ADST: reuses iadst_8x8 .main, then reverses the register order while scaling
+ LOAD_8ROWS coeffq, 16
+
+.pass1:
+ call m(iadst_8x8_internal_8bpc).main
+ call m(iadst_8x8_internal_8bpc).main_pass1_end
+
+.pass1_end:
+ mova m7, [o(pw_m16384)]
+
+.pass1_end1:
+ pmulhrsw m1, m7 ; m1, m3, m5 and stack slot 16*0 get m7 as loaded; m0, m2, m4, m6 get its negation below
+ mova [rsp+gprsize+16*1], m1
+ mova m1, m6 ; from here on registers are moved into reversed order: old m6 -> m1
+ mova m6, m2
+ pmulhrsw m2, m5, m7 ; old m5 -> m2
+ mova m5, m6 ; old m2 -> m5
+ mova m6, m4
+ pmulhrsw m4, m3, m7 ; old m3 -> m4
+ mova m3, m6 ; old m4 -> m3
+ mova m6, m0
+ mova m0, m7
+ pxor m7, m7
+ psubw m7, m0 ; m7 = negated scale factor
+ pmulhrsw m0, [rsp+gprsize+16*0] ; m0 = scaled value from stack slot 16*0
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, m6 ; old m0 -> m7
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.pass2_main:
+ call m(iadst_8x8_internal_8bpc).main
+ call m(iadst_8x8_internal_8bpc).main_pass2_end
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova [rsp+gprsize+16*2], m2
+ mova m2, m0
+ pxor m0, m0
+ psubw m0, m7 ; m0 = negated scale factor
+ mova m7, m2 ; m7 = scaled old m0
+ pmulhrsw m1, m0
+ pmulhrsw m2, m5, m0
+ mova [rsp+gprsize+16*1], m1
+ mova m5, m4
+ mova m1, m6
+ pmulhrsw m4, m3, m0
+ pmulhrsw m0, [rsp+gprsize+16*0]
+ mova m3, m5
+ mova [rsp+gprsize+16*0], m7
+ jmp m(idct_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct ; INV_TXFM_8X8_FN is defined outside this view; presumably emits one entry point per type pair
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 8x8 identity stage: pass 1 does no arithmetic here, pass 2 only scales by pw_4096
+ LOAD_8ROWS coeffq, 16
+ mova [rsp+gprsize+16*1], m6 ; store m6 to stack slot 16*1 before joining the shared tail at pass1_end3
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+ALIGN function_align
+.pass2:
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
+
+.end:
+ pmulhrsw m7, [o(pw_4096)] ; scale m7 first, because the register is reused for the constant
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_4096)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ mova [rsp+gprsize+16*2], m5
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).end3
+
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2; wraps INV_TXFM_FN for 4x16 and adds an inline DC-only path for dct_dct
+ INV_TXFM_FN %1, %2, 4x16, 8
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000 ; replicate the first coefficient word across the low four words
+ punpcklwd m0, m0 ; ...and across all eight words
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [coeffq], eobd ; NOTE(review): presumably eob is 0 on this path, so this clears the coefficient - confirm in INV_TXFM_FN
+ pmulhrsw m0, [o(pw_16384)]
+ pmulhrsw m0, m1
+ pmulhrsw m0, [o(pw_2048)]
+.end:
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 ; the same value register (m0) feeds all four 4x4 writes
+ lea dstq, [dstq+strideq*4] ; advance four rows
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
+ RET
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct ; instantiate the 4x16 entry points whose first type is dct
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
+
+cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 4x16 DCT stage: pass 1 runs the 4x8 pass-1 routine in r3 twice, pass 2 calls idct_16x4 .main
+ lea r3, [o(m(idct_4x8_internal_8bpc).pass1)]
+
+.pass1: ; shared entry: the adst/flipadst 4x16 functions jump here with their own routine in r3
+ mova m0, [coeffq+16*1] ; first run: the odd 16-byte rows
+ mova m1, [coeffq+16*3]
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*7]
+ push tx2q ; keep the caller's continuation while tx2q is borrowed for the two runs
+ lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_2)]
+ jmp r3
+
+.pass1_2:
+ mova [coeffq+16*1], m0 ; write the first run's results back over its inputs
+ mova [coeffq+16*3], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*0] ; second run: the even 16-byte rows
+ mova m1, [coeffq+16*2]
+ mova m2, [coeffq+16*4]
+ mova m3, [coeffq+16*6]
+ lea tx2q, [o(m(idct_4x16_internal_8bpc).pass1_end)]
+ jmp r3
+
+.pass1_end:
+ pop tx2q ; restore the continuation pushed in .pass1
+
+ mova m4, [coeffq+16*1]
+ mova m5, [coeffq+16*3]
+ mova m6, [coeffq+16*5]
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+
+ pmulhrsw m7, [coeffq+16*7] ; the eighth row is scaled from memory and stays at coeffq+16*7
+ mova [coeffq+16*7], m7
+ jmp tx2q
+
+.pass2:
+ call m(idct_16x4_internal_8bpc).main
+
+.end:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*4], m4
+
+.end1:
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ mov r3, coeffq ; r3 keeps the coefficient base for the reloads and the final clear
+ WRITE_4X8 0, 1, 3, 2
+
+ mova m0, [r3+16*4] ; second group of registers: m4-m6 from memory, m7 directly
+ mova m1, [r3+16*5]
+ mova m2, [r3+16*6]
+ mova m3, m7
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X8 0, 1, 3, 2
+
+.end2:
+ pxor m7, m7
+ REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ; zero all eight 16-byte coefficient rows
+ ret
+
+INV_TXFM_4X16_FN adst, dct ; instantiate the 4x16 entry points whose first type is adst
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 4x16 ADST stage: pass 1 reuses the idct_4x16 driver, pass 2 calls iadst_16x4 .main
+ lea r3, [o(m(iadst_4x8_internal_8bpc).pass1)]
+ jmp m(idct_4x16_internal_8bpc).pass1
+
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass2_end
+
+ punpcklqdq m6, m5, m4 ;low: -out5 high: -out7
+ punpckhqdq m4, m5 ;low: out8 high: out10
+ punpcklqdq m5, m7, m2 ;low: out4 high: out6
+ punpckhqdq m2, m7 ;low: -out9 high: -out11
+ mova [coeffq+16*4], m2 ; park two negated pairs in the coefficient buffer to free registers
+ mova [coeffq+16*5], m6
+ mova m2, [coeffq+16*6] ; rows 6 and 7 were written by iadst_16x4 .main
+ mova m6, [coeffq+16*7]
+ punpckhqdq m1, m6, m0 ;low: -out13 high: -out15
+ punpcklqdq m0, m6 ;low: out0 high: out2
+ punpckhqdq m6, m3, m2 ;low: out12 high: out14
+ punpcklqdq m2, m3 ;low: -out1 high: -out3
+
+ mova m7, [o(pw_2048)]
+
+.end1: ; also entered from iflipadst_4x16 with m7 = pw_m2048
+ REPX {pmulhrsw x, m7}, m0, m5, m4, m6
+ pxor m3, m3
+ psubw m3, m7 ; m3 = negated scale factor for the registers holding negated outputs
+ mova m7, [coeffq+16*4]
+ REPX {pmulhrsw x, m3}, m2, m7, m1
+ pmulhrsw m3, [coeffq+16*5]
+ mova [coeffq+16*7], m5
+
+ punpckhqdq m5, m4, m7 ;low: out10 high: out11
+ punpcklqdq m4, m7 ;low: out8 high: out9
+ punpckhqdq m7, m6, m1 ;low: out14 high: out15
+ punpcklqdq m6, m1 ;low: out12 high: out13
+ punpckhqdq m1, m0, m2 ;low: out2 high: out3
+ punpcklqdq m0, m2 ;low: out0 high: out1
+ mova [coeffq+16*4], m4
+ mova m4, [coeffq+16*7]
+ punpcklqdq m2, m4, m3 ;low: out4 high: out5
+ punpckhqdq m4, m3 ;low: out6 high: out7
+ mova m3, m4
+
+.end2: ; also entered from iidentity_4x16 pass 2
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ mov r3, coeffq ; r3 keeps the coefficient base for the reloads and the final clear
+ WRITE_4X8 0, 1, 2, 3
+
+ mova m0, [r3+16*4]
+ mova m1, [r3+16*5]
+ mova m2, [r3+16*6]
+ mova m3, m7
+ lea dstq, [dstq+strideq*4]
+ WRITE_4X8 0, 1, 2, 3
+
+.end3:
+ pxor m7, m7
+ REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ; zero all eight 16-byte coefficient rows
+ ret
+
+
+INV_TXFM_4X16_FN flipadst, dct ; instantiate the 4x16 entry points whose first type is flipadst
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 4x16 flipped ADST: same kernels as iadst_4x16, with the low/high unpacks swapped and a negated scale
+ lea r3, [o(m(iflipadst_4x8_internal_8bpc).pass1)]
+ jmp m(idct_4x16_internal_8bpc).pass1
+
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass2_end
+
+ punpckhqdq m6, m5, m4 ;low: out5 high: out7
+ punpcklqdq m4, m5 ;low: -out8 high: -out10
+ punpckhqdq m5, m7, m2 ;low: -out4 high: -out6
+ punpcklqdq m2, m7 ;low: out9 high: out11
+ mova [coeffq+16*4], m2
+ mova [coeffq+16*5], m6
+ mova m2, [coeffq+16*6]
+ mova m6, [coeffq+16*7]
+ punpcklqdq m1, m6, m0 ;low: out13 high: out15
+ punpckhqdq m0, m6 ;low: -out0 high: -out2
+ punpcklqdq m6, m3, m2 ;low: -out12 high: -out14
+ punpckhqdq m2, m3 ;low: out1 high: out3
+
+ mova m7, [o(pw_m2048)] ; the shared tail negates m7 for its second register group
+ jmp m(iadst_4x16_internal_8bpc).end1
+
+
+INV_TXFM_4X16_FN identity, dct ; instantiate the 4x16 entry points whose first type is identity
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+ pmulhrsw m%2, m%3, m%1 ; tmp = src scaled by the third argument
+%if %0 == 4 ; if downshifting by 1
+ pmulhrsw m%2, m%4
+%else
+ paddsw m%1, m%1 ; no downshift: double src instead (saturating)
+%endif
+ paddsw m%1, m%2 ; dst = (src or 2*src) + tmp, saturating
+%endmacro
+
+cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 4x16 identity stage: pass 1 scales rows in two groups of four, pass 2 applies IDTX16 and pw_2048
+ mova m0, [coeffq+16*1] ; first group: the odd 16-byte rows
+ mova m6, [o(pw_1697x8)]
+ mova m1, [coeffq+16*3]
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*7]
+ pcmpeqw m7, m7 ; m7 = all ones, i.e. -1 in every word
+ mov r3, tx2q ; keep the caller's continuation while tx2q is borrowed
+ lea tx2q, [o(.pass1_2)]
+.pass1: ; per register: r = pavgw(pmulhrsw(x, pw_1697x8), x), then r is zeroed in lanes where x == -1
+ pmulhrsw m4, m6, m0
+ pmulhrsw m5, m6, m1
+ pavgw m4, m0
+ pcmpeqw m0, m7 ; mask of lanes equal to -1
+ pavgw m5, m1
+ pcmpeqw m1, m7
+ pandn m0, m4 ; keep the result only where the input was not -1
+ pmulhrsw m4, m6, m2
+ pandn m1, m5
+ pmulhrsw m5, m6, m3
+ pavgw m4, m2
+ pcmpeqw m2, m7
+ pavgw m5, m3
+ pcmpeqw m3, m7
+ pandn m2, m4
+ pandn m3, m5
+ jmp m(iadst_4x8_internal_8bpc).pass1_end
+.pass1_2:
+ mova [coeffq+16*1], m0 ; write the first group back over its inputs
+ mova [coeffq+16*3], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*0] ; second group: the even 16-byte rows
+ mova m1, [coeffq+16*2]
+ mova m2, [coeffq+16*4]
+ mova m3, [coeffq+16*6]
+ lea tx2q, [o(.pass1_end)]
+ jmp .pass1
+.pass1_end:
+ mova m4, [coeffq+16*1]
+ mova m5, [coeffq+16*3]
+ mova m6, [coeffq+16*5]
+ jmp r3 ; continue at the address saved from tx2q on entry
+.pass2:
+ mova m7, [o(pw_1697x16)]
+ mova [coeffq+16*6], m6 ; m6 is needed as the IDTX16 temporary, so its data waits in memory
+ REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
+ mova m6, [coeffq+16*7]
+ IDTX16 6, 7, 7 ; tmp == constant register here, so m7 no longer holds pw_1697x16 afterwards
+ mova [coeffq+16*7], m6
+ mova m6, [coeffq+16*6]
+ pmulhrsw m7, m6, [o(pw_1697x16)] ; same computation as IDTX16, with the constant read from memory
+ paddsw m6, m6
+ paddsw m6, m7
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*7]
+ mova [coeffq+16*4], m4
+ jmp m(iadst_4x16_internal_8bpc).end2
+
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2; wraps INV_TXFM_FN for 16x4 and adds a DC-only path for dct_dct
+ INV_TXFM_FN %1, %2, 16x4, 8
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd ; NOTE(review): presumably eob is 0 on this path, so this clears the coefficient - confirm in INV_TXFM_FN
+ mov r2d, 2 ; loop count; each iteration handles two rows
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4_8bpc).end)]
+.dconly: ; shared entry (INV_TXFM_16X8_FN jumps here): expects m0, m1, m2, r2d and tx2q to be set up
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000 ; replicate the DC word across the low four words
+ punpcklwd m0, m0 ; ...and across all eight words
+ pxor m5, m5 ; zero register for the byte-to-word unpacks
+.dconly_loop:
+ mova m1, [dstq]
+ mova m3, [dstq+strideq]
+ punpckhbw m2, m1, m5 ; widen 16 pixels per row to words
+ punpcklbw m1, m5
+ punpckhbw m4, m3, m5
+ punpcklbw m3, m5
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2 ; narrow back to bytes with unsigned saturation
+ packuswb m3, m4
+ mova [dstq], m1
+ mova [dstq+strideq], m3
+ lea dstq, [dstq+strideq*2]
+ dec r2d
+ jg .dconly_loop
+ jmp tx2q ; continue at the caller-provided label (.end of the requesting entry point)
+.end:
+ RET
+%endif
+%endmacro
+
+%macro LOAD_7ROWS 2 ;src, stride; loads m0-m6 from seven consecutive rows, m7 is left untouched
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+ mova m4, [%1+%2*4]
+ mova m5, [%1+%2*5]
+ mova m6, [%1+%2*6]
+%endmacro
+
+%macro SAVE_7ROWS 2 ;dst, stride; stores m0-m6 to seven consecutive rows, m7 is not stored
+ mova [%1+%2*0], m0
+ mova [%1+%2*1], m1
+ mova [%1+%2*2], m2
+ mova [%1+%2*3], m3
+ mova [%1+%2*4], m4
+ mova [%1+%2*5], m5
+ mova [%1+%2*6], m6
+%endmacro
+
+%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]; results end up in %1, %2, %3 and %4 (see the final comments)
+ punpckhwd m%5, m%4, m%1 ;packed in13 in3
+ punpcklwd m%1, m%4 ;packed in1 in15
+ punpcklwd m%4, m%3, m%2 ;packed in9 in7
+ punpckhwd m%2, m%3 ;packed in5 in11
+ mova m%7, [o(pd_2048)] ; %7 holds pd_2048 for every ITX_MUL2X_PACK below
+ ITX_MUL2X_PACK %1, %6, %7, 401, 4076, 1 ;low: t8a high: t15a
+ ITX_MUL2X_PACK %4, %6, %7, 3166, 2598, 1 ;low: t9a high: t14a
+ ITX_MUL2X_PACK %2, %6, %7, 1931, 3612, 1 ;low: t10a high: t13a
+ ITX_MUL2X_PACK %5, %6, %7, 3920, 1189, 1 ;low: t11a high: t12a
+ psubsw m%6, m%1, m%4 ;low: t9 high: t14
+ paddsw m%1, m%4 ;low: t8 high: t15
+ psubsw m%4, m%5, m%2 ;low: t10 high: t13
+ paddsw m%5, m%2 ;low: t11 high: t12
+ mova m%2, [o(deint_shuf2)] ; %2 is free now and is reused for the shuffle mask
+ pshufb m%6, m%2
+ pshufb m%4, m%2
+ ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a
+ ITX_MUL2X_PACK %4, %3, %7, m3784, 1567, 1 ;low: t10a high: t13a
+ psubsw m%3, m%1, m%5 ;low: t11a high: t12a
+ paddsw m%1, m%5 ;low: t8a high: t15a
+ psubsw m%5, m%6, m%4 ;low: t10 high: t13
+ paddsw m%6, m%4 ;low: t9 high: t14
+ pshufb m%3, m%2
+ pshufb m%5, m%2
+ ITX_MUL2X_PACK %3, %2, %7, 2896, 2896, 4 ;t12, t11
+ ITX_MUL2X_PACK %5, %4, %7, 2896, 2896, 4 ;t13a, t10a
+ packssdw m%2, m%4 ;low: t11 high: t10a
+ packssdw m%3, m%5 ;low: t12 high: t13a
+ punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14
+ punpcklqdq m%1, m%6 ;low: t8a high: t9
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct ; instantiate the 16x4 entry points whose first type is dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 16x4 DCT stage: .main is the packed 16-point kernel, pass 2 runs an 8x4 pass-2 routine twice
+ LOAD_7ROWS coeffq, 16 ; the eighth row stays in memory; .main reads it from coeffq+16*7
+ call .main
+
+.pass1_end:
+ punpckhwd m7, m0, m2 ;packed out1, out5
+ punpcklwd m0, m2 ;packed out0, out4
+ punpcklwd m2, m1, m3 ;packed out3, out7
+ punpckhwd m1, m3 ;packed out2, out6
+ mova [coeffq+16*6], m7
+ mova m7, [coeffq+16*7]
+ punpckhwd m3, m4, m6 ;packed out9, out13
+ punpcklwd m4, m6 ;packed out8, out12
+ punpcklwd m6, m5, m7 ;packed out11, out15
+ punpckhwd m5, m7 ;packed out10, out14
+
+.pass1_end2:
+ mova m7, [o(pw_16384)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [coeffq+16*6]
+ mova [coeffq+16*6], m7
+
+.pass1_end3: ; shared transpose tail; the adst, flipadst and identity 16x4 functions jump here
+ punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high
+ punpcklwd m3, m6 ;packed 9, 11, 13, 15 low
+ punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high
+ punpcklwd m4, m5 ;packed 8, 10, 12, 14 low
+ punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1)
+ punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0)
+ punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3)
+ punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2)
+ mova [coeffq+16*7], m3
+ mova m3, [coeffq+16*6]
+ punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high
+ punpcklwd m3, m2 ;packed 1, 3, 5, 7 low
+ punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high
+ punpcklwd m0, m1 ;packed 0, 2, 4, 6 low
+ punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1)
+ punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0)
+ punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3)
+ punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2)
+ jmp tx2q
+
+.pass2:
+ lea tx2q, [o(m(idct_8x4_internal_8bpc).pass2)]
+
+.pass2_end: ; shared by the other 16x4 types, which arrive with their own routine in tx2q
+ mova [coeffq+16*4], m4 ; save m4-m6 for the second run; the fourth register was stored at coeffq+16*7 in pass 1
+ mova [coeffq+16*5], m5
+ mova [coeffq+16*6], m6
+ lea r3, [dstq+8] ; destination for the second run, 8 bytes to the right
+ call tx2q
+
+ add coeffq, 16*4
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+ mov dstq, r3
+ jmp tx2q ; tail jump: the routine's own return leaves this function
+
+ALIGN function_align
+cglobal_label .main ; packed 16-point kernel: in m0-m6 plus [coeffq+16*7]; out in m0-m6 plus [coeffq+16*7]; uses coeffq+16*4..16*6 as scratch
+ punpckhqdq m7, m0, m1 ;low:in1 high:in3
+ punpcklqdq m0, m1
+ punpcklqdq m1, m2, m3
+ punpckhqdq m3, m2 ;low:in7 high:in5
+ mova [coeffq+16*4], m7 ; the odd inputs wait in the coefficient buffer during IDCT8_1D_PACKED
+ mova [coeffq+16*5], m3
+ mova m7, [coeffq+16*7]
+ punpcklqdq m2, m4, m5
+ punpckhqdq m4, m5 ;low:in9 high:in11
+ punpcklqdq m3, m6, m7
+ punpckhqdq m7, m6 ;low:in15 high:in13
+ mova [coeffq+16*6], m4
+ IDCT8_1D_PACKED
+ mova m6, [coeffq+16*4] ; swap: odd inputs back into registers, m1-m3 out to the buffer
+ mova m4, [coeffq+16*5]
+ mova m5, [coeffq+16*6]
+ mova [coeffq+16*4], m1
+ mova [coeffq+16*5], m2
+ mova [coeffq+16*6], m3
+
+ IDCT16_1D_PACKED_ODDHALF 6, 4, 5, 7, 1, 2, 3
+
+ mova m1, [coeffq+16*4]
+ psubsw m3, m0, m7 ;low:out15 high:out14
+ paddsw m0, m7 ;low:out0 high:out1
+ psubsw m7, m1, m5 ;low:out12 high:out13
+ paddsw m1, m5 ;low:out3 high:out2
+ mova [coeffq+16*7], m3 ; out15/out14 are returned in memory
+ mova m2, [coeffq+16*5]
+ mova m3, [coeffq+16*6]
+ psubsw m5, m2, m4 ;low:out11 high:out10
+ paddsw m2, m4 ;low:out4 high:out5
+ psubsw m4, m3, m6 ;low:out8 high:out9
+ paddsw m3, m6 ;low:out7 high:out6
+ mova m6, m7
+ ret
+
+INV_TXFM_16X4_FN adst, dct ; instantiate the 16x4 entry points whose first type is adst
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 16x4 ADST stage: .main is the packed 16-point kernel, finished by one of the two *_end helpers
+ LOAD_7ROWS coeffq, 16 ; the eighth row stays in memory; .main reads it from coeffq+16*7
+ call .main
+ call .main_pass1_end
+
+ punpckhwd m6, m7, m0 ;packed -out11, -out15
+ punpcklwd m0, m7 ;packed out0, out4
+ punpcklwd m7, m3, m4 ;packed -out3, -out7
+ punpckhwd m4, m3 ;packed out8, out12
+ mova m1, [coeffq+16*6]
+ punpcklwd m3, m1, m5 ;packed -out1, -out5
+ punpckhwd m5, m1 ;packed out10, out14
+ mova m1, [coeffq+16*7]
+ mova [coeffq+16*6], m3
+ mova [coeffq+16*7], m7
+ punpckhwd m3, m2, m1 ;packed -out9, -out13
+ punpcklwd m1, m2 ;packed out2, out6
+
+ mova m7, [o(pw_16384)]
+
+.pass1_end: ; also entered from iflipadst_16x4 with m7 = pw_m16384
+ REPX {pmulhrsw x, m7}, m0, m1, m4, m5
+ pxor m2, m2
+ psubw m2, m7 ; m2 = negated scale factor for the registers holding negated outputs
+ mova m7, [coeffq+16*6]
+ REPX {pmulhrsw x, m2}, m7, m3, m6
+ pmulhrsw m2, [coeffq+16*7]
+ mova [coeffq+16*6], m7
+ jmp m(idct_16x4_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+ALIGN function_align
+cglobal_label .main ; in m0-m6 plus [coeffq+16*7]; returns m0-m5 as commented, plus packed results in [coeffq+16*6] and [coeffq+16*7]
+ mova [coeffq+16*6], m0 ; m0 is needed as a work register; its input is reloaded further down
+ pshufd m0, m1, q1032
+ pshufd m2, m2, q1032
+ punpckhwd m1, m6, m0 ;packed in13, in2
+ punpcklwd m0, m6 ;packed in3, in12
+ punpckhwd m7, m5, m2 ;packed in11, in4
+ punpcklwd m2, m5 ;packed in5, in10
+ mova m6, [o(pd_2048)] ; m6 holds pd_2048 for the rest of .main and for .main_pass1_end
+ ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3
+ ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5
+ ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11
+ ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13
+ psubsw m5, m1, m2 ;low:t10a high:t11a
+ paddsw m1, m2 ;low:t2a high:t3a
+ psubsw m2, m7, m0 ;low:t12a high:t13a
+ paddsw m7, m0 ;low:t4a high:t5a
+ punpcklqdq m0, m5 ; copy m5's low qword into m0's high qword so punpckhwd can pair m5's two halves
+ punpckhwd m0, m5 ;packed t10a, t11a
+ punpcklqdq m5, m2 ; same trick with the operands the other way round, giving the swapped pair order
+ punpckhwd m2, m5 ;packed t13a, t12a
+ ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11
+ ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13
+ mova [coeffq+16*4], m1 ; park t2a/t3a and t4a/t5a while the other inputs are processed
+ mova [coeffq+16*5], m7
+ mova m1, [coeffq+16*6]
+ mova m7, [coeffq+16*7]
+ pshufd m1, m1, q1032
+ pshufd m3, m3, q1032
+ punpckhwd m5, m7, m1 ;packed in15, in0
+ punpcklwd m1, m7 ;packed in1, in14
+ punpckhwd m7, m4, m3 ;packed in9, in6
+ punpcklwd m3, m4 ;packed in7, in8
+ ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1
+ ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7
+ ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9
+ ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15
+ psubsw m4, m5, m3 ;low:t8a high:t9a
+ paddsw m5, m3 ;low:t0a high:t1a
+ psubsw m3, m7, m1 ;low:t14a high:t15a
+ paddsw m7, m1 ;low:t6a high:t7a
+ punpcklqdq m1, m4
+ punpckhwd m1, m4 ;packed t8a, t9a
+ punpcklqdq m4, m3
+ punpckhwd m3, m4 ;packed t15a, t14a
+ ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9
+ ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15
+ paddsw m4, m1, m2 ;low:t12a high:t13a
+ psubsw m1, m2 ;low:t8a high:t9a
+ psubsw m2, m0, m3 ;low:t14a high:t15a
+ paddsw m0, m3 ;low:t10a high:t11a
+ punpcklqdq m3, m1
+ punpckhwd m3, m1 ;packed t12a, t13a
+ punpcklqdq m1, m2
+ punpckhwd m2, m1 ;packed t15a, t14a
+ ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13
+ ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15
+ psubsw m1, m3, m2 ;low:t14a high:t15a
+ paddsw m3, m2 ;low:out2 high:-out13
+ psubsw m2, m4, m0 ;low:t10 high:t11
+ paddsw m0, m4 ;low:-out1 high:out14
+ mova [coeffq+16*6], m0 ; these two finished pairs are returned in memory
+ mova [coeffq+16*7], m3
+ mova m0, [coeffq+16*4]
+ mova m3, [coeffq+16*5]
+ psubsw m4, m5, m3 ;low:t4 high:t5
+ paddsw m5, m3 ;low:t0 high:t1
+ psubsw m3, m0, m7 ;low:t6 high:t7
+ paddsw m0, m7 ;low:t2 high:t3
+ punpcklqdq m7, m4
+ punpckhwd m7, m4 ;packed t4, t5
+ punpcklqdq m4, m3
+ punpckhwd m3, m4 ;packed t7, t6
+ ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a
+ ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a
+ psubsw m4, m5, m0 ;low:t2a high:t3a
+ paddsw m0, m5 ;low:out0 high:-out15
+ psubsw m5, m7, m3 ;low:t6 high:t7
+ paddsw m3, m7 ;low:-out3 high:out12
+ ret
+ALIGN function_align
+.main_pass1_end: ; final rotation by 2896 via pmaddwd, +m6, >>12; relies on m6 still holding pd_2048 from .main; m0 and m3 are preserved
+ mova m7, [o(deint_shuf1)]
+ mova [coeffq+16*4], m0
+ mova [coeffq+16*5], m3
+ mova m0, [o(pw_2896_m2896)]
+ mova m3, [o(pw_2896_2896)]
+ pshufb m1, m7 ;t14a t15a
+ pshufb m2, m7 ;t10 t11
+ pshufb m4, m7 ;t2a t3a
+ pshufb m5, m7 ;t6 t7
+ pmaddwd m7, m0, m2
+ pmaddwd m2, m3
+ paddd m7, m6
+ paddd m2, m6
+ psrad m7, 12
+ psrad m2, 12
+ packssdw m2, m7 ;low:out6 high:-out9
+ pmaddwd m7, m0, m4
+ pmaddwd m4, m3
+ paddd m7, m6
+ paddd m4, m6
+ psrad m7, 12
+ psrad m4, 12
+ packssdw m4, m7 ;low:-out7 high:out8
+ pmaddwd m7, m3, m5
+ pmaddwd m5, m0
+ paddd m7, m6
+ paddd m5, m6
+ psrad m7, 12
+ psrad m5, 12
+ packssdw m7, m5 ;low:out4 high:-out11
+ pmaddwd m5, m3, m1
+ pmaddwd m1, m0
+ paddd m5, m6
+ paddd m1, m6
+ psrad m5, 12
+ psrad m1, 12
+ packssdw m5, m1 ;low:-out5 high:out10
+ mova m0, [coeffq+16*4]
+ mova m3, [coeffq+16*5]
+ ret
+ALIGN function_align
+cglobal_label .main_pass2_end ; same final rotation, done with saturating add/sub and pmulhrsw by pw_2896x8; output layout matches .main_pass1_end
+ mova m7, [o(pw_2896x8)]
+ punpckhqdq m6, m2, m1 ;low:t11 high:t15a
+ punpcklqdq m2, m1 ;low:t10 high:t14a
+ psubsw m1, m2, m6
+ paddsw m2, m6
+ punpckhqdq m6, m4, m5 ;low:t3a high:t7
+ punpcklqdq m4, m5 ;low:t2a high:t6
+ psubsw m5, m4, m6
+ paddsw m4, m6
+ pmulhrsw m1, m7 ;low:-out9 high:out10
+ pmulhrsw m2, m7 ;low:out6 high:-out5
+ pmulhrsw m5, m7 ;low:out8 high:-out11
+ pmulhrsw m4, m7 ;low:-out7 high:out4
+ punpckhqdq m7, m4, m5 ;low:out4 high:-out11
+ punpcklqdq m4, m5 ;low:-out7 high:out8
+ punpckhqdq m5, m2, m1 ;low:-out5 high:out10
+ punpcklqdq m2, m1 ;low:out6 high:-out9
+ ret
+
+
+INV_TXFM_16X4_FN flipadst, dct ; instantiate the 16x4 entry points whose first type is flipadst
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 16x4 flipped ADST: same kernels as iadst_16x4, with the low/high unpacks swapped and a negated scale
+ LOAD_7ROWS coeffq, 16
+ call m(iadst_16x4_internal_8bpc).main
+ call m(iadst_16x4_internal_8bpc).main_pass1_end
+
+ punpcklwd m6, m7, m0 ;packed out11, out15
+ punpckhwd m0, m7 ;packed -out0, -out4
+ punpckhwd m7, m3, m4 ;packed out3, out7
+ punpcklwd m4, m3 ;packed -out8, -out12
+ mova m1, [coeffq+16*6]
+ punpckhwd m3, m1, m5 ;packed out1, out5
+ punpcklwd m5, m1 ;packed -out10, -out14
+ mova m1, [coeffq+16*7]
+ mova [coeffq+16*6], m3
+ mova [coeffq+16*7], m7
+ punpcklwd m3, m2, m1 ;packed out9, out13
+ punpckhwd m1, m2 ;packed -out2, -out6
+
+ mova m7, [o(pw_m16384)] ; the shared tail negates m7 for its second register group
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+
+INV_TXFM_16X4_FN identity, dct ; instantiate the 16x4 entry points whose first type is identity
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 16x4 identity stage: each row r becomes r + pmulhrsw(pmulhrsw(r, pw_1697x16), pw_16384)
+ mova m1, [coeffq+16*6] ; rows 5-7 first; their results wait in memory because all registers are in use
+ mova m0, [coeffq+16*5]
+ mova m2, [coeffq+16*7]
+ mova m6, [o(pw_1697x16)]
+ mova m7, [o(pw_16384)]
+ pmulhrsw m4, m6, m1
+ pmulhrsw m3, m6, m0
+ pmulhrsw m5, m6, m2
+ pmulhrsw m4, m7
+ pmulhrsw m3, m7
+ pmulhrsw m5, m7
+ paddsw m1, m4
+ paddsw m0, m3
+ paddsw m5, m2
+ mova m2, [coeffq+16*2] ; rows 2-4
+ mova m3, [coeffq+16*3]
+ mova m4, [coeffq+16*4]
+ mova [coeffq+16*6], m1
+ mova [coeffq+16*5], m0
+ mova [coeffq+16*7], m5
+ pmulhrsw m0, m6, m2
+ pmulhrsw m1, m6, m3
+ pmulhrsw m5, m6, m4
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+ pmulhrsw m5, m7
+ paddsw m2, m0
+ paddsw m3, m1
+ paddsw m4, m5
+ mova m0, [coeffq+16*0] ; rows 0-1
+ mova m1, [coeffq+16*1]
+ pmulhrsw m5, m6, m0
+ pmulhrsw m6, m1 ; the constant register is consumed here; it is not needed afterwards
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ paddsw m0, m5
+ paddsw m1, m6
+ mova m6, [coeffq+16*6]
+ mova m5, [coeffq+16*5]
+ punpckhwd m7, m0, m2 ;packed out1, out5
+ punpcklwd m0, m2 ;packed out0, out4
+ punpckhwd m2, m1, m3 ;packed out3, out7
+ punpcklwd m1, m3 ;packed out2, out6
+ mova [coeffq+16*6], m7
+ mova m7, [coeffq+16*7]
+ punpckhwd m3, m4, m6 ;packed out9, out13
+ punpcklwd m4, m6 ;packed out8, out12
+ punpckhwd m6, m5, m7 ;packed out11, out15
+ punpcklwd m5, m7 ;packed out10, out14
+ jmp m(idct_16x4_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)]
+ jmp m(idct_16x4_internal_8bpc).pass2_end
+
+
+%macro SAVE_8ROWS 2 ;dst, stride; stores m0-m7 to eight consecutive rows
+ mova [%1+%2*0], m0
+ mova [%1+%2*1], m1
+ mova [%1+%2*2], m2
+ mova [%1+%2*3], m3
+ mova [%1+%2*4], m4
+ mova [%1+%2*5], m5
+ mova [%1+%2*6], m6
+ mova [%1+%2*7], m7
+%endmacro
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2; wraps INV_TXFM_FN for 8x16 and adds a DC-only path for dct_dct
+ INV_TXFM_FN %1, %2, 8x16, 8, 16*16
+%ifidn %1_%2, dct_dct
+ pshuflw m0, [coeffq], q0000 ; replicate the first coefficient word across the low four words
+ punpcklwd m0, m0 ; ...and across all eight words
+ mova m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mova m2, [o(pw_16384)]
+ mov [coeffq], eobd ; NOTE(review): presumably eob is 0 on this path, so this clears the coefficient - confirm in INV_TXFM_FN
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ psrlw m2, 3 ; pw_2048
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ mov r3d, 4 ; NOTE(review): presumably the iteration count for the 8x8 loop joined below - confirm at its definition
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct ; instantiate the 8x16 entry points whose first type is dct
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
+
+cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 8x16 DCT stage: pass 1 runs the 8x8 pass-1 routine in r3 twice, pass 2 combines idct_8x8 .main and idct_16x8 .main
+ lea r3, [o(m(idct_8x8_internal_8bpc).pass1)]
+
+.pass1: ; shared entry: the adst/flipadst 8x16 functions jump here with their own routine in r3
+ LOAD_8ROWS coeffq+16*1, 32, 1 ; first run: the odd 16-byte rows (stride 32 from offset 16)
+ mov [rsp+gprsize+16*11], tx2q ; keep the caller's continuation while tx2q is borrowed
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)]
+ jmp r3
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32, 1 ; second run: the even 16-byte rows
+ mov tx2q, [rsp+gprsize+16*11]
+ jmp r3
+
+.pass2:
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end)]
+
+.pass2_pre:
+ mova [coeffq+16*2 ], m1 ; m1, m3, m5, m7 are stored for the idct_16x8 .main call further down
+ mova [coeffq+16*6 ], m3
+ mova [coeffq+16*10], m5
+ mova [coeffq+16*14], m7
+ mova m1, m2
+ mova m2, m4
+ mova m3, m6
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*5 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*13]
+
+.pass2_main:
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16 ; m0-m6 go to stack slots 3-9, where idct_16x8 .main reads them
+ mova m0, [coeffq+16*2 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*10]
+ mova m3, [coeffq+16*14]
+ mova m4, [coeffq+16*3 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*15]
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq ; remember the original dst for the second group, handled at .end
+ lea dstq, [dstq+strideq*8] ; first handle the values left in registers, eight rows down
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16 ; second group: the eight values left on the stack by idct_16x8 .main
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1: ; shared final step: several sibling functions set tx2q to this label
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ; zero all sixteen 16-byte coefficient rows
+ ret
+
+INV_TXFM_8X16_FN adst, dct ; instantiate the 8x16 entry points whose first type is adst
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 8x16 ADST stage: pass 1 reuses the idct_8x16 driver, pass 2 calls iadst_16x8 .main
+ lea r3, [o(m(iadst_8x8_internal_8bpc).pass1)]
+ jmp m(idct_8x16_internal_8bpc).pass1
+
+.pass2:
+ lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
+
+.pass2_pre:
+ mova [rsp+gprsize+16*7], m0 ; part of the input goes to stack slots, the rest stays in registers
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+
+.pass2_main:
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*3 ]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*3], m4
+ mova [rsp+gprsize+16*4], m5
+ mova [rsp+gprsize+16*9], m6
+ mova [rsp+gprsize+32*5], m7 ; 32*5 == 16*10, the slot after 16*9
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*11]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+
+ mov r3, dstq ; remember the original dst for the second group, handled at .end
+ lea dstq, [dstq+strideq*8]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16 ; second group: eight values from stack slots 3-10
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iadst_8x8_internal_8bpc).end
+
+
+INV_TXFM_8X16_FN flipadst, dct ; instantiate the 8x16 entry points whose first type is flipadst
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 8x16 flipped ADST: same kernel as iadst_8x16, but the two output groups swap destinations
+ lea r3, [o(m(iflipadst_8x8_internal_8bpc).pass1)]
+ jmp m(idct_8x16_internal_8bpc).pass1
+
+.pass2:
+ lea tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)]
+ lea r3, [dstq+strideq*8] ; unlike iadst_8x16, the register group uses the current dst and the stack group goes eight rows down
+
+.pass2_pre:
+ mova [rsp+gprsize+16*7], m0
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ mova m0, m2
+ mova m1, m3
+ mova m2, m4
+ mova m3, m5
+
+.pass2_main:
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*3 ]
+ mova m6, [coeffq+16*13]
+ mova m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*3], m4
+ mova [rsp+gprsize+16*4], m5
+ mova [rsp+gprsize+16*9], m6
+ mova [rsp+gprsize+32*5], m7 ; 32*5 == 16*10, the slot after 16*9
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*7 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*11]
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass2_end
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16 ; second group: eight values from stack slots 3-10
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+
+INV_TXFM_8X16_FN identity, dct ; instantiate the 8x16 entry points whose first type is identity
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 8x16 identity stage: pass 1 only routes both row groups through the 8x8 tail, pass 2 applies IDTX16 and pw_2048
+ LOAD_8ROWS coeffq+16*1, 32, 1 ; first run: the odd 16-byte rows
+ mov r3, tx2q ; keep the caller's continuation while tx2q is borrowed
+ lea tx2q, [o(.pass1_end)]
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32, 1 ; second run: the even 16-byte rows
+ mov tx2q, r3
+ mova [rsp+gprsize+16*1], m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass2:
+ lea tx2q, [o(.end1)]
+
+.end: ; runs twice: once on the registers from pass 1, once on the rows reloaded at .end1
+ mova [rsp+gprsize+16*0], m7 ; free m7 and m6 for the constant and the IDTX16 temporary
+ mova [rsp+gprsize+16*1], m6
+ mova m7, [o(pw_1697x16)]
+ REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
+ mova m6, [rsp+gprsize+16*1]
+ mova [rsp+gprsize+16*2], m5 ; finished m5 moves to the stack so m5 can act as temporary, then hold the spilled m7
+ IDTX16 6, 5, 7
+ mova m5, [rsp+gprsize+16*0]
+ IDTX16 5, 7, 7 ; tmp == constant register here, so m7 no longer holds pw_1697x16 afterwards
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [rsp+gprsize+16*2]
+ mova [rsp+gprsize+16*0], m5
+ mova [rsp+gprsize+16*1], m6
+ mova [rsp+gprsize+16*2], m7
+ jmp m(idct_8x8_internal_8bpc).end3
+
+.end1:
+ LOAD_8ROWS coeffq+16*1, 32 ; second group: the rows saved at .pass1_end
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ lea dstq, [dstq+strideq*2] ; NOTE(review): presumably completes the 8-row advance begun by the end3 tail - confirm there
+ jmp .end
+
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2; wraps INV_TXFM_FN for 16x8 and reuses the 16x4 DC-only loop for dct_dct
+ INV_TXFM_FN %1, %2, 16x8, 8, 16*16
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd ; NOTE(review): presumably eob is 0 on this path, so this clears the coefficient - confirm in INV_TXFM_FN
+ pmulhrsw m0, m1 ; one more pw_2896x8 scaling than the 16x4 variant performs before .dconly
+ mov r2d, 4 ; four iterations of two rows each
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct ; instantiate the 16x8 entry points whose first type is dct
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
+
+cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 16x8 DCT stage: pass 1 = idct_8x8 .main on even rows plus .main on odd rows; pass 2 = two 8x8 column passes
+ LOAD_8ROWS coeffq+16*0, 32, 1 ; the even 16-byte rows
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16 ; out0-out6 go to stack slots 3-9; out7 was already left in slot 0 by that routine
+
+ LOAD_8ROWS coeffq+16*1, 32, 1 ; the odd 16-byte rows
+ call .main
+ mov r3, tx2q ; keep the caller's continuation while tx2q is borrowed
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16 ; second group: out0-out7 left on the stack by .main
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass2:
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8] ; destination for the second run, 8 bytes to the right
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end:
+ LOAD_8ROWS coeffq+16*1, 32 ; second run: the rows saved at .pass1_end
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+
+ALIGN function_align
+cglobal_label .main ; odd half of the 16-point pass: in m0-m7, with the 8-point results expected in stack slots 3-9 (out0-out6) and 0 (out7)
+ mova [rsp+gprsize*2+16*1], m2 ; spill m2, m6, m5; gprsize*2 because this code is reached via call (extra return address)
+ mova [rsp+gprsize*2+16*2], m6
+ mova [rsp+gprsize*2+32*5], m5 ; 32*5 == 16*10, the slot after 16*9
+
+ mova m6, [o(pd_2048)]
+ ITX_MULSUB_2W 0, 7, 2, 5, 6, 401, 4076 ;t8a, t15a
+ ITX_MULSUB_2W 4, 3, 2, 5, 6, 3166, 2598 ;t9a, t14a
+ psubsw m2, m0, m4 ;t9
+ paddsw m0, m4 ;t8
+ psubsw m4, m7, m3 ;t14
+ paddsw m7, m3 ;t15
+ ITX_MULSUB_2W 4, 2, 3, 5, 6, 1567, 3784 ;t9a, t14a
+ mova m3, [rsp+gprsize*2+16*1] ; swap the spilled inputs back in and park t14a, t9a, t15 in their slots
+ mova m5, [rsp+gprsize*2+32*5]
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+32*5], m4
+ mova m2, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*2], m7
+ ITX_MULSUB_2W 3, 5, 7, 4, 6, 1931, 3612 ;t10a, t13a
+ ITX_MULSUB_2W 2, 1, 7, 4, 6, 3920, 1189 ;t11a, t12a
+ psubsw m4, m2, m3 ;t10
+ paddsw m2, m3 ;t11
+ psubsw m3, m1, m5 ;t13
+ paddsw m1, m5 ;t12
+ ITX_MULSUB_2W 3, 4, 7, 5, 6, m3784, 1567 ;t10a, t13a
+ mova m7, [rsp+gprsize*2+32*5]
+ psubsw m6, m0, m2 ;t11a
+ paddsw m0, m2 ;t8a
+ paddsw m2, m7, m3 ;t9
+ psubsw m7, m3 ;t10
+ mova m5, [rsp+gprsize*2+16*0] ; from here on, each 8-point result is combined with its odd-half partner
+ psubsw m3, m5, m0 ;out8
+ paddsw m0, m5 ;out7
+ mova [rsp+gprsize*2+32*5], m0
+ mova m5, [rsp+gprsize*2+16*9]
+ psubsw m0, m5, m2 ;out9
+ paddsw m2, m5 ;out6
+ mova [rsp+gprsize*2+16*0], m0
+ mova [rsp+gprsize*2+16*9], m2
+ mova m0, [rsp+gprsize*2+16*1]
+ mova m2, [rsp+gprsize*2+16*2]
+ mova [rsp+gprsize*2+16*1], m3
+ psubsw m5, m0, m4 ;t13
+ paddsw m0, m4 ;t14
+ mova m3, [o(pd_2048)] ; m6 is occupied by t11a, so the constant is reloaded into m3
+ psubsw m4, m2, m1 ;t12a
+ paddsw m1, m2 ;t15a
+ mova [rsp+gprsize*2+16*2], m1
+ ITX_MULSUB_2W 5, 7, 1, 2, 3, 2896, 2896 ;t10a, t13a
+ ITX_MULSUB_2W 4, 6, 1, 2, 3, 2896, 2896 ;t11, t12
+ mova m3, [rsp+gprsize*2+16*8]
+ psubsw m2, m3, m5 ;out10
+ paddsw m3, m5 ;out5
+ mova m5, [rsp+gprsize*2+16*7]
+ mova [rsp+gprsize*2+16*8], m3
+ psubsw m3, m5, m4 ;out11
+ paddsw m5, m4 ;out4
+ mova m4, [rsp+gprsize*2+16*6]
+ mova [rsp+gprsize*2+16*7], m5
+ paddsw m5, m4, m6 ;out3
+ psubsw m4, m6 ;out12
+ mova m6, [rsp+gprsize*2+16*5]
+ mova [rsp+gprsize*2+16*6], m5
+ psubsw m5, m6, m7 ;out13
+ paddsw m6, m7 ;out2
+ mova m7, [rsp+gprsize*2+16*4]
+ mova [rsp+gprsize*2+16*5], m6
+ psubsw m6, m7, m0 ;out14
+ paddsw m7, m0 ;out1
+ mova m1, [rsp+gprsize*2+16*2]
+ mova m0, [rsp+gprsize*2+16*3]
+ mova [rsp+gprsize*2+16*4], m7
+ psubsw m7, m0, m1 ;out15
+ paddsw m0, m1 ;out0
+ mova [rsp+gprsize*2+16*3], m0
+ mova m1, [rsp+gprsize*2+16*0] ; m1 = out9
+ mova m0, [rsp+gprsize*2+16*1] ; m0 = out8
+ mova [rsp+gprsize*2+16*0], m7 ; on return: out0-out7 in stack slots 3-10, out8-out14 in m0-m6, out15 in slot 0 and m7
+ ret
+
+INV_TXFM_16X8_FN adst, dct ; instantiate the 16x8 entry points whose first argument is adst (presumably type1, type2 like the 16x16 variant of this macro)
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; ADST worker for 16x8 blocks: first pass starts here, second pass at .pass2
+ mova m7, [o(pw_2896x8)] ; every coefficient row is pmulhrsw-scaled by this constant while loading
+ pmulhrsw m0, m7, [coeffq+16*0 ]
+ pmulhrsw m1, m7, [coeffq+16*1 ]
+ pmulhrsw m2, m7, [coeffq+16*14]
+ pmulhrsw m3, m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*7], m0 ; rows 0, 1, 14, 15 -> stack slots 7, 8, 9, 10 (.main reads them as in0, in1, in14, in15)
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3 ; 32*5 is the same offset as 16*10
+ pmulhrsw m0, m7, [coeffq+16*6 ]
+ pmulhrsw m1, m7, [coeffq+16*7 ]
+ pmulhrsw m2, m7, [coeffq+16*8 ]
+ pmulhrsw m3, m7, [coeffq+16*9 ]
+ mova [rsp+gprsize+16*3], m2 ; rows 8, 9, 6, 7 -> stack slots 3, 4, 5, 6
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ pmulhrsw m0, m7, [coeffq+16*2 ] ; rows 2-5 and 10-13 stay in m0-m7 for .main
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*4 ]
+ pmulhrsw m3, m7, [coeffq+16*5 ]
+ pmulhrsw m4, m7, [coeffq+16*10]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*12]
+ pmulhrsw m7, [coeffq+16*13] ; last use of the constant: m7 is overwritten with the scaled row
+
+ call .main
+ call .main_pass1_end
+ mov r3, tx2q ; keep the incoming continuation while tx2q is borrowed
+ lea tx2q, [o(.pass1_end)] ; presumably where the shared 8x8 pass1_end code jumps when done
+ jmp m(iadst_8x8_internal_8bpc).pass1_end
+
+.pass1_end: ; first trip through the shared code is finished; now handle the rows left on the stack
+ SAVE_8ROWS coeffq+16*1, 32 ; results of the first trip go to coeffq+16, +48, ...
+ LOAD_8ROWS rsp+gprsize+16*3, 16 ; m0-m7 = stack slots 3..10
+ mova [rsp+gprsize+16*0], m7 ; eighth row is handed over in stack slot 0
+ mov tx2q, r3 ; restore the original continuation for the second trip
+ jmp m(iadst_8x8_internal_8bpc).pass1_end
+
+.pass2: ; second pass, run once per 8-byte-wide half of the destination
+ lea tx2q, [o(.end)] ; come back to .end after the first half
+ lea r3, [dstq+8] ; remember where the second half starts (8 bytes to the right)
+ jmp m(iadst_8x8_internal_8bpc).pass2_main
+
+.end: ; second half
+ LOAD_8ROWS coeffq+16*1, 32 ; rows stored by .pass1_end above
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] ; final continuation lives in the 8x16 dct function (not shown here)
+ mov dstq, r3
+ jmp m(iadst_8x8_internal_8bpc).pass2_main
+
+ALIGN function_align
+cglobal_label .main ; 16-point inverse ADST butterflies on m0-m7 plus stack slots 3..10; leaves some outputs finished and the rest as intermediates for .main_pass1_end / .main_pass2_end (names follow the tags below)
+ mova [rsp+gprsize*2+16*0], m1 ; spill to free scratch registers; offsets use gprsize*2 where callers use gprsize, presumably for the return address pushed by call
+ mova [rsp+gprsize*2+16*1], m2
+ mova [rsp+gprsize*2+16*2], m6
+
+ mova m6, [o(pd_2048)] ; passed as the fifth ITX_MULSUB_2W argument throughout (presumably the rounding term)
+ ITX_MULSUB_2W 7, 0, 1, 2, 6, 995, 3973 ;t3, t2
+ ITX_MULSUB_2W 3, 4, 1, 2, 6, 3513, 2106 ;t11, t10
+ psubsw m1, m0, m4 ;t10a
+ paddsw m0, m4 ;t2a
+ psubsw m4, m7, m3 ;t11a
+ paddsw m3, m7 ;t3a
+ ITX_MULSUB_2W 1, 4, 7, 2, 6, 3406, 2276 ;t11, t10
+ mova m2, [rsp+gprsize*2+16*0] ;in3
+ mova m7, [rsp+gprsize*2+16*1] ;in4
+ mova [rsp+gprsize*2+16*0], m1 ;t11
+ mova [rsp+gprsize*2+16*1], m4 ;t10
+ mova m1, [rsp+gprsize*2+16*2] ;in12
+ mova [rsp+gprsize*2+16*2], m0 ;t2a
+ ITX_MULSUB_2W 5, 7, 0, 4, 6, 1751, 3703 ;t5, t4
+ ITX_MULSUB_2W 2, 1, 0, 4, 6, 3857, 1380 ;t13, t12
+ psubsw m0, m7, m1 ;t12a
+ paddsw m1, m7 ;t4a
+ psubsw m4, m5, m2 ;t13a
+ paddsw m5, m2 ;t5a
+ ITX_MULSUB_2W 4, 0, 7, 2, 6, 4017, 799 ;t12, t13
+ mova m2, [rsp+gprsize*2+16*8] ;in1
+ mova m7, [rsp+gprsize*2+16*9] ;in14
+ mova [rsp+gprsize*2+16*8], m4 ;t12
+ mova [rsp+gprsize*2+16*9], m0 ;t13
+ mova m4, [rsp+gprsize*2+16*4] ;in9
+ mova m0, [rsp+gprsize*2+16*5] ;in6
+ mova [rsp+gprsize*2+16*4], m1 ;t4a
+ mova [rsp+gprsize*2+16*5], m5 ;t5a
+ ITX_MULSUB_2W 2, 7, 1, 5, 6, 4052, 601 ;t15, t14
+ ITX_MULSUB_2W 4, 0, 1, 5, 6, 2440, 3290 ;t7, t6
+ psubsw m1, m0, m7 ;t14a
+ paddsw m0, m7 ;t6a
+ psubsw m5, m4, m2 ;t15a
+ paddsw m4, m2 ;t7a
+ ITX_MULSUB_2W 5, 1, 7, 2, 6, 2276, 3406 ;t14, t15
+ mova m2, [rsp+gprsize*2+16*2] ;t2a
+ mova [rsp+gprsize*2+16*2], m5 ;t14
+ psubsw m7, m2, m0 ;t6
+ paddsw m2, m0 ;t2
+ psubsw m0, m3, m4 ;t7
+ paddsw m3, m4 ;t3
+ ITX_MULSUB_2W 0, 7, 4, 5, 6, 3784, 1567 ;t6a, t7a
+ mova m4, [rsp+gprsize*2+16*7] ;in0
+ mova m5, [rsp+gprsize*2+32*5] ;in15
+ mova [rsp+gprsize*2+16*7], m3 ;t3
+ mova [rsp+gprsize*2+32*5], m1 ;t15
+ mova m1, [rsp+gprsize*2+16*6] ;in7
+ mova m3, [rsp+gprsize*2+16*3] ;in8
+ mova [rsp+gprsize*2+16*6], m7 ;t7a
+ mova [rsp+gprsize*2+16*3], m0 ;t6a
+ ITX_MULSUB_2W 5, 4, 0, 7, 6, 201, 4091 ;t1, t0
+ ITX_MULSUB_2W 1, 3, 0, 7, 6, 3035, 2751 ;t9, t8
+ psubsw m0, m4, m3 ;t8a
+ paddsw m4, m3 ;t0a
+ psubsw m3, m5, m1 ;t9a
+ paddsw m5, m1 ;t1a
+ ITX_MULSUB_2W 0, 3, 1, 7, 6, 799, 4017 ;t9, t8
+ mova m1, [rsp+gprsize*2+16*4] ;t4a
+ mova m7, [rsp+gprsize*2+16*5] ;t5a
+ mova [rsp+gprsize*2+16*4], m3 ;t8
+ mova [rsp+gprsize*2+16*5], m0 ;t9
+ psubsw m0, m4, m1 ;t4
+ paddsw m4, m1 ;t0
+ psubsw m3, m5, m7 ;t5
+ paddsw m5, m7 ;t1
+ ITX_MULSUB_2W 0, 3, 1, 7, 6, 1567, 3784 ;t5a, t4a
+ mova m7, [rsp+gprsize*2+16*3] ;t6a
+ psubsw m1, m4, m2 ;t2a
+ paddsw m4, m2 ;out0
+ mova [rsp+gprsize*2+16*3], m4 ;out0
+ mova m4, [rsp+gprsize*2+16*6] ;t7a
+ psubsw m2, m3, m7 ;t6
+ paddsw m3, m7 ;-out3
+ mova [rsp+gprsize*2+16*6], m3 ;-out3
+ psubsw m3, m0, m4 ;t7
+ paddsw m0, m4 ;out12
+ mova [rsp+gprsize*2+16*12], m3 ; t7 -> slot 12, paired with slot 7 by the *_end routines
+ mova m3, [rsp+gprsize*2+16*7] ;t3
+ mova [rsp+gprsize*2+16* 7], m2 ;out4
+ psubsw m2, m5, m3 ;t3a
+ paddsw m5, m3 ;-out15
+ mova [rsp+gprsize*2+16*11], m2 ; t3a -> slot 11, paired with slot 10 by the *_end routines
+ mova m2, [rsp+gprsize*2+32*5] ;t15
+ mova [rsp+gprsize*2+16*10], m1 ;-out7
+ mova m1, [rsp+gprsize*2+16*0] ;t11
+ mova [rsp+gprsize*2+16*0 ], m5 ;-out15
+ mova m3, [rsp+gprsize*2+16*1] ;t10
+ mova [rsp+gprsize*2+16*1 ], m4 ;-out11
+ mova m4, [rsp+gprsize*2+16*2] ;t14
+ mova [rsp+gprsize*2+16*2 ], m0 ;out12
+ psubsw m0, m3, m4 ;t14a
+ paddsw m3, m4 ;t10a
+ psubsw m5, m1, m2 ;t15a
+ paddsw m1, m2 ;t11a
+ ITX_MULSUB_2W 5, 0, 2, 4, 6, 3784, 1567 ;t14, t15
+ mova m2, [rsp+gprsize*2+16*4] ;t8
+ mova m4, [rsp+gprsize*2+16*5] ;t9
+ mova [rsp+gprsize*2+16*4], m3 ;t10a
+ mova [rsp+gprsize*2+16*5], m1 ;t11a
+ mova m3, [rsp+gprsize*2+16*8] ;t12
+ mova m1, [rsp+gprsize*2+16*9] ;t13
+ mova [rsp+gprsize*2+16*8], m5 ;t14
+ mova [rsp+gprsize*2+16*9], m0 ;t15
+ psubsw m5, m2, m3 ;t12a
+ paddsw m2, m3 ;t8a
+ psubsw m0, m4, m1 ;t13a
+ paddsw m4, m1 ;t9a
+ ITX_MULSUB_2W 5, 0, 1, 3, 6, 1567, 3784 ;t13, t12
+ mova m6, [rsp+gprsize*2+16*4] ;t10a
+ mova m1, [rsp+gprsize*2+16*5] ;t11a
+ psubsw m3, m2, m6 ;t10
+ paddsw m2, m6 ;-out1
+ paddsw m6, m4, m1 ;out14
+ psubsw m4, m1 ;t11
+ mova [rsp+gprsize*2+16*14], m4 ; t11 -> slot 14, paired with slot 9 by the *_end routines
+ mova [rsp+gprsize*2+16* 4], m2 ;-out1
+ mova m4, [rsp+gprsize*2+16*8] ;t14
+ mova m2, [rsp+gprsize*2+16*9] ;t15
+ mova [rsp+gprsize*2+16* 9], m3 ;out6
+ psubsw m3, m0, m4 ;t14a
+ paddsw m0, m4 ;out2
+ psubsw m4, m5, m2 ;t15a
+ paddsw m5, m2 ;-out13
+ mova [rsp+gprsize*2+16* 5], m0 ;out2
+ ret ; m3/m4 (t14a/t15a), m5, m6 and slots 7, 9, 10, 11, 12, 14 are consumed by .main_pass1_end or .main_pass2_end
+ALIGN function_align
+.main_pass1_end: ; finish the eight outputs .main left as pairs, using 32-bit pmaddwd with +pd_2048 and >>12; the sibling .main_pass2_end does the same pairs with 16-bit adds and pmulhrsw
+ mova m0, [rsp+gprsize*2+16*14] ; value .main stored in slot 14 (t11)
+ mova [rsp+gprsize*2+16*14], m5 ; park m5 (-out13) and m6 (out14) so both registers can hold constants
+ mova [rsp+gprsize*2+16*15], m6
+ mova m5, [o(pw_2896_2896)] ; by its name, word pairs (2896, 2896): pmaddwd yields 2896*(a+b)
+ mova m6, [o(pw_2896_m2896)] ; by its name, word pairs (2896, -2896): pmaddwd yields 2896*(a-b)
+ mova m7, [o(pd_2048)] ; added before the >>12 below
+ punpcklwd m2, m3, m4 ; interleave the m3/m4 words left by .main (t14a/t15a) for pmaddwd
+ punpckhwd m3, m4
+ pmaddwd m4, m5, m2
+ pmaddwd m2, m6
+ pmaddwd m1, m5, m3
+ pmaddwd m3, m6
+ REPX {paddd x, m7}, m4, m2, m1, m3
+ REPX {psrad x, 12}, m4, m1, m2, m3
+ packssdw m4, m1 ;-out5
+ packssdw m2, m3 ;out10
+ mova [rsp+gprsize*2+16* 8], m4
+ mova m3, [rsp+gprsize*2+16* 9] ; pair: slot 9 with the old slot-14 value now in m0
+ punpcklwd m1, m3, m0
+ punpckhwd m3, m0
+ pmaddwd m0, m5, m1
+ pmaddwd m1, m6
+ pmaddwd m4, m5, m3
+ pmaddwd m3, m6
+ REPX {paddd x, m7}, m0, m1, m4, m3
+ REPX {psrad x, 12}, m0, m4, m1, m3
+ packssdw m0, m4 ;out6
+ packssdw m1, m3 ;-out9
+ mova [rsp+gprsize*2+16* 9], m0
+ mova m0, [rsp+gprsize*2+16* 7] ; pair: slot 7 with slot 12
+ mova m4, [rsp+gprsize*2+16*12]
+ punpcklwd m3, m0, m4
+ punpckhwd m0, m4
+ pmaddwd m4, m5, m3
+ pmaddwd m3, m6
+ pmaddwd m5, m0 ; m5 (the sum constant) is overwritten here, hence the memory operands further down
+ pmaddwd m0, m6
+ REPX {paddd x, m7}, m4, m3, m5, m0
+ REPX {psrad x, 12}, m4, m5, m3, m0
+ packssdw m4, m5 ;out4
+ packssdw m3, m0 ;-out11
+ mova [rsp+gprsize*2+16* 7], m4
+ mova m4, [rsp+gprsize*2+16*10] ; pair: slot 10 with slot 11
+ mova m5, [rsp+gprsize*2+16*11]
+ punpcklwd m0, m4, m5
+ punpckhwd m4, m5
+ pmaddwd m5, m0, [o(pw_2896_2896)]
+ pmaddwd m0, m6
+ pmaddwd m6, m4 ; m6 (the difference constant) is consumed here
+ pmaddwd m4, [o(pw_2896_2896)]
+ REPX {paddd x, m7}, m5, m0, m6, m4
+ REPX {psrad x, 12}, m0, m6, m5, m4
+ packssdw m0, m6 ;out8
+ packssdw m5, m4 ;-out7
+ mova [rsp+gprsize*2+16*10], m5
+ mova m4, [rsp+gprsize*2+16* 2] ;out12
+ mova m5, [rsp+gprsize*2+16*14] ;-out13
+ mova m6, [rsp+gprsize*2+16*15] ;out14
+ ret ; exit state per the tags: m0-m6 = out8, -out9, out10, -out11, out12, -out13, out14; slots 7..10 = out4, -out5, out6, -out7
+ALIGN function_align
+cglobal_label .main_pass2_end ; 16-bit counterpart of .main_pass1_end: saturating add/sub of each pair followed by pmulhrsw with pw_2896x8
+ mova m7, [o(pw_2896x8)]
+ mova m1, [rsp+gprsize*2+16* 9] ; pair: slot 9 with slot 14 (stored by .main)
+ mova m2, [rsp+gprsize*2+16*14]
+ paddsw m0, m1, m2
+ psubsw m1, m2
+ pmulhrsw m0, m7 ;out6
+ pmulhrsw m1, m7 ;-out9
+ mova [rsp+gprsize*2+16* 9], m0
+ psubsw m2, m3, m4 ; pair: the m3/m4 values left by .main (t14a/t15a)
+ paddsw m3, m4
+ pmulhrsw m2, m7 ;out10
+ pmulhrsw m3, m7 ;-out5
+ mova [rsp+gprsize*2+16* 8], m3
+ mova m3, [rsp+gprsize*2+16* 7] ; pair: slot 7 with slot 12
+ mova m4, [rsp+gprsize*2+16*12]
+ paddsw m0, m3, m4
+ psubsw m3, m4
+ pmulhrsw m0, m7 ;out4
+ pmulhrsw m3, m7 ;-out11
+ mova [rsp+gprsize*2+16* 7], m0
+ mova m0, [rsp+gprsize*2+16*10] ; pair: slot 10 with slot 11
+ paddsw m4, m0, [rsp+gprsize*2+16*11]
+ psubsw m0, [rsp+gprsize*2+16*11]
+ pmulhrsw m4, m7 ;-out7
+ pmulhrsw m0, m7 ;out8
+ mova [rsp+gprsize*2+16*10], m4
+ mova m4, [rsp+gprsize*2+16*2 ] ;out12
+ ret ; m5 and m6 are not touched here, so they still hold what .main left (-out13, out14)
+
+INV_TXFM_16X8_FN flipadst, dct ; instantiate the 16x8 entry points whose first argument is flipadst (presumably type1, type2 like the 16x16 variant of this macro)
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; flipped-ADST worker for 16x8 blocks: same loads and .main as the adst worker, but the two halves go through the flipadst 8x8 code in the opposite order
+ mova m7, [o(pw_2896x8)] ; every coefficient row is pmulhrsw-scaled by this constant while loading
+ pmulhrsw m0, m7, [coeffq+16*0 ]
+ pmulhrsw m1, m7, [coeffq+16*1 ]
+ pmulhrsw m2, m7, [coeffq+16*14]
+ pmulhrsw m3, m7, [coeffq+16*15]
+ mova [rsp+gprsize+16*7], m0 ; rows 0, 1, 14, 15 -> stack slots 7, 8, 9, 10
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3 ; 32*5 is the same offset as 16*10
+ pmulhrsw m0, m7, [coeffq+16*6 ]
+ pmulhrsw m1, m7, [coeffq+16*7 ]
+ pmulhrsw m2, m7, [coeffq+16*8 ]
+ pmulhrsw m3, m7, [coeffq+16*9 ]
+ mova [rsp+gprsize+16*3], m2 ; rows 8, 9, 6, 7 -> stack slots 3, 4, 5, 6
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ pmulhrsw m0, m7, [coeffq+16*2 ] ; rows 2-5 and 10-13 stay in m0-m7
+ pmulhrsw m1, m7, [coeffq+16*3 ]
+ pmulhrsw m2, m7, [coeffq+16*4 ]
+ pmulhrsw m3, m7, [coeffq+16*5 ]
+ pmulhrsw m4, m7, [coeffq+16*10]
+ pmulhrsw m5, m7, [coeffq+16*11]
+ pmulhrsw m6, m7, [coeffq+16*12]
+ pmulhrsw m7, [coeffq+16*13] ; last use of the constant: m7 is overwritten with the scaled row
+
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mova m7, [rsp+gprsize+16*0] ; fetch the row .main parked in slot 0 (-out15) so all eight can be saved
+ SAVE_8ROWS coeffq+16*0, 32 ; set the register half aside in the coefficient buffer
+ LOAD_8ROWS rsp+gprsize+16*3, 16 ; process the stack half (slots 3..10) first
+ mova [rsp+gprsize+16*0], m7 ; eighth row is handed over in stack slot 0
+ mov r3, tx2q ; keep the incoming continuation while tx2q is borrowed
+ lea tx2q, [o(.pass1_end)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end
+
+.pass1_end: ; first trip done; now the half that was set aside
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 32 ; rows saved before the first trip
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3 ; restore the original continuation for the second trip
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end
+
+.pass2: ; second pass, run once per 8-byte-wide half of the destination
+ lea tx2q, [o(.end)] ; come back to .end after the first half
+ lea r3, [dstq+8] ; remember where the second half starts (8 bytes to the right)
+ jmp m(iflipadst_8x8_internal_8bpc).pass2_main
+
+.end: ; second half
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] ; final continuation lives in the 8x16 dct function (not shown here)
+ mov dstq, r3
+ jmp m(iflipadst_8x8_internal_8bpc).pass2_main
+
+
+INV_TXFM_16X8_FN identity, dct ; instantiate the 16x8 entry points whose first argument is identity (presumably type1, type2 like the 16x16 variant of this macro)
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; identity worker for 16x8 blocks: .pass1 runs twice, first on 16-byte rows 8-15 and then on rows 0-7
+ add coeffq, 16*16 ; point past the 16 rows so negative offsets reach the odd upper rows
+ mova m4, [coeffq-16*7] ; rows 9, 11, 13, 15 of the original buffer
+ mova m5, [coeffq-16*5]
+ mova m6, [coeffq-16*3]
+ mova m7, [coeffq-16*1]
+ mov r3, tx2q ; keep the incoming continuation while tx2q is borrowed
+ lea tx2q, [o(.pass1_end)]
+
+.pass1: ; expects the four odd rows of the current group in m4-m7; loads the even ones itself
+ mova m0, [o(pw_2896x8)] ; first scale applied to every row
+ mova m2, [o(pw_1697x16)] ; second scale: each row becomes x + pmulhrsw(pmulhrsw(x, m2), m3)
+ mova m3, [o(pw_16384)]
+ sub coeffq, 8*16 ; step down 8 rows: coeffq+16*0..6 now address the even rows of this group
+ REPX {pmulhrsw x, m0}, m4, m5, m6, m7
+ pmulhrsw m1, m2, m4
+ pmulhrsw m1, m3
+ paddsw m1, m4 ; 1
+ pmulhrsw m4, m2, m5
+ pmulhrsw m4, m3
+ paddsw m4, m5 ; 3
+ pmulhrsw m5, m2, m6
+ pmulhrsw m5, m3
+ paddsw m5, m6 ; 5
+ pmulhrsw m6, m2, m7
+ pmulhrsw m6, m3
+ paddsw m7, m6 ; 7
+ pmulhrsw m6, m0, [coeffq+16*6]
+ mova [rsp+gprsize+16*0], m4 ; row 3 waits in slot 0 while m4 is used as scratch
+ pmulhrsw m4, m2, m6
+ pmulhrsw m4, m3
+ paddsw m6, m4 ; 6
+ pmulhrsw m4, m0, [coeffq+16*4]
+ mova [rsp+gprsize+16*1], m6 ; row 6 goes to slot 1; presumably picked up by the shared pass1_end3 code
+ pmulhrsw m6, m2, m4
+ pmulhrsw m6, m3
+ paddsw m4, m6 ; 4
+ pmulhrsw m6, m0, [coeffq+16*2]
+ pmulhrsw m0, [coeffq+16*0] ; m0 stops being the constant here and becomes row 0
+ pmulhrsw m2, m6 ; likewise m2 stops being pw_1697x16, which is why it is a memory operand below
+ pmulhrsw m2, m3
+ paddsw m2, m6 ; 2
+ pmulhrsw m6, m0, [o(pw_1697x16)]
+ pmulhrsw m6, m3
+ mova m3, [rsp+gprsize+16*0] ; row 3 back from slot 0
+ paddsw m0, m6
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end: ; upper group done; coeffq is at original+8*16 here
+ mova [coeffq+16*1], m4 ; results m4-m7 -> rows 9, 11, 13, 15 of the original buffer
+ mova [coeffq+16*3], m5
+ mova [coeffq+16*5], m6
+ mova [coeffq+16*7], m7
+ mova m4, [coeffq-16*7] ; fetch rows 1, 3, 5, 7 for the second run before overwriting them
+ mova m5, [coeffq-16*5]
+ mova m6, [coeffq-16*3]
+ mova m7, [coeffq-16*1]
+ mova [coeffq-16*7], m0 ; results m0-m3 -> rows 1, 3, 5, 7
+ mova [coeffq-16*5], m1
+ mova [coeffq-16*3], m2
+ mova [coeffq-16*1], m3
+ mov tx2q, r3 ; second run continues at the original continuation
+ jmp .pass1
+
+.pass2: ; second pass, run once per 8-byte-wide half of the destination
+ lea tx2q, [o(.end)] ; come back to .end after the first half
+ lea r3, [dstq+8] ; remember where the second half starts (8 bytes to the right)
+ jmp m(iidentity_8x8_internal_8bpc).end
+
+.end: ; second half
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] ; final continuation lives in the 8x16 dct function (not shown here)
+ mov dstq, r3
+ jmp m(iidentity_8x8_internal_8bpc).end
+
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16, 8, 16*16 ; shared entry-point generator, defined elsewhere in the file
+%ifidn %1_%2, dct_dct
+ movd m1, [o(pw_2896x8)] ; dct_dct only: shortcut that works on the first coefficient alone and hands off to the 16x4 .dconly code
+ pmulhrsw m0, m1, [coeffq]
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd ; presumably eob is 0 on this path, so this clears the coefficient - confirm against INV_TXFM_FN
+ mov r2d, 8 ; parameter for the shared .dconly code (4 in the 16x8 variant above)
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+.end:
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct ; instantiate the 16x16 entry points with type1 = dct; only dct_dct also gets the macro's DC-only shortcut
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
+
+cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; DCT worker for 16x16 blocks: the first pass runs the 16-point DCT twice (16-byte units 1,3,5,... then 0,2,4,...), each producing two 8-row groups
+ LOAD_8ROWS coeffq+16*1, 64 ; units 1, 5, 9, ... 29
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16 ; park seven result rows in stack slots 3..9 for the 16x8 .main
+ LOAD_8ROWS coeffq+16*3, 64 ; units 3, 7, 11, ... 31
+ call m(idct_16x8_internal_8bpc).main ; returns m0-m6 = out8-out14, slot 0 = out15, slots 3..10 = out0-out7
+ mov r3, tx2q ; keep the incoming continuation while tx2q is borrowed
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_8192)] ; constant handed to the shared pass1_end1 code in m7
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end: ; out8-out15 of the first run are done; now out0-out7 from the stack
+ SAVE_8ROWS coeffq+16*17, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7 ; eighth row is handed over in stack slot 0
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1: ; second run, on the even units
+ SAVE_8ROWS coeffq+16*1, 32
+ LOAD_8ROWS coeffq+16*0, 64 ; units 0, 4, 8, ... 28
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*2, 64 ; units 2, 6, 10, ... 30
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2: ; last trip of the first pass; afterwards control goes to the original continuation
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_8192)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass2: ; second pass, built on the 8x16 dct second pass (defined elsewhere)
+ lea tx2q, [o(.end)]
+ jmp m(idct_8x16_internal_8bpc).pass2_pre
+
+.end: ; write the group kept in stack slots 3..10
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3 ; r3 is presumably set up by the 8x16 code reached from .pass2
+ lea r3, [dstq+8] ; and is re-pointed 8 bytes to the right for use in .end1
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1: ; first 16 units are consumed: zero them, then do the same for the next 16
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8 ; advance to units 16..31
+ mov dstq, r3
+
+ mova m0, [coeffq+16*0 ] ; register layout expected by the 8x16 pass2_main (defined elsewhere)
+ mova m1, [coeffq+16*4 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*12]
+ mova m4, [coeffq+16*1 ]
+ mova m5, [coeffq+16*5 ]
+ mova m6, [coeffq+16*9 ]
+ mova m7, [coeffq+16*13]
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end)] ; finish in the 8x16 function's own tail
+ jmp m(idct_8x16_internal_8bpc).pass2_main
+
+
+%macro ITX_16X16_ADST_LOAD_ODD_COEFS 0
+ mova m0, [coeffq+16*1 ] ; loads the 16 odd-numbered 16-byte units of coeffq into the layout iadst_16x8 .main expects: 8 in stack slots 3..10, 8 in m0-m7
+ mova m1, [coeffq+16*3 ]
+ mova m2, [coeffq+16*29]
+ mova m3, [coeffq+16*31]
+ mova [rsp+gprsize+16*7], m0 ; units 1, 3, 29, 31 -> slots 7, 8, 9, 10
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3 ; 32*5 is the same offset as 16*10
+ mova m0, [coeffq+16*13]
+ mova m1, [coeffq+16*15]
+ mova m2, [coeffq+16*17]
+ mova m3, [coeffq+16*19]
+ mova [rsp+gprsize+16*3], m2 ; units 17, 19, 13, 15 -> slots 3, 4, 5, 6
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ mova m0, [coeffq+16*5 ] ; units 5-11 and 21-27 (odd) stay in m0-m7
+ mova m1, [coeffq+16*7 ]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*11]
+ mova m4, [coeffq+16*21]
+ mova m5, [coeffq+16*23]
+ mova m6, [coeffq+16*25]
+ mova m7, [coeffq+16*27]
+%endmacro
+
+%macro ITX_16X16_ADST_LOAD_EVEN_COEFS 0
+ mova m0, [coeffq+16*0 ] ; loads the 16 even-numbered 16-byte units of coeffq into the layout iadst_16x8 .main expects: 8 in stack slots 3..10, 8 in m0-m7
+ mova m1, [coeffq+16*2 ]
+ mova m2, [coeffq+16*28]
+ mova m3, [coeffq+16*30]
+ mova [rsp+gprsize+16*7], m0 ; units 0, 2, 28, 30 -> slots 7, 8, 9, 10
+ mova [rsp+gprsize+16*8], m1
+ mova [rsp+gprsize+16*9], m2
+ mova [rsp+gprsize+32*5], m3 ; 32*5 is the same offset as 16*10
+ mova m0, [coeffq+16*12]
+ mova m1, [coeffq+16*14]
+ mova m2, [coeffq+16*16]
+ mova m3, [coeffq+16*18]
+ mova [rsp+gprsize+16*3], m2 ; units 16, 18, 12, 14 -> slots 3, 4, 5, 6
+ mova [rsp+gprsize+16*4], m3
+ mova [rsp+gprsize+16*5], m0
+ mova [rsp+gprsize+16*6], m1
+ mova m0, [coeffq+16*4 ] ; units 4-10 and 20-26 (even) stay in m0-m7
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m4, [coeffq+16*20]
+ mova m5, [coeffq+16*22]
+ mova m6, [coeffq+16*24]
+ mova m7, [coeffq+16*26]
+%endmacro
+
+INV_TXFM_16X16_FN adst, dct ; instantiate the 16x16 entry points with type1 = adst (no identity second type is instantiated here)
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; ADST worker for 16x16 blocks: the first pass runs the 16-point ADST on the odd 16-byte units and then on the even ones
+ ITX_16X16_ADST_LOAD_ODD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mov r3, tx2q ; keep the incoming continuation while tx2q is borrowed
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_8192)] ; constant handed to the shared pass1_end1 code in m7
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end: ; register half of the odd run is done; now the half left in stack slots 3..10
+ SAVE_8ROWS coeffq+16*17, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7 ; eighth row is handed over in stack slot 0
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1: ; second run, on the even units
+ SAVE_8ROWS coeffq+16*1, 32
+ ITX_16X16_ADST_LOAD_EVEN_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2: ; last trip of the first pass; afterwards control goes to the original continuation
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3
+ mova m7, [o(pw_8192)]
+ jmp m(iadst_8x8_internal_8bpc).pass1_end1
+
+.pass2: ; second pass, built on the 8x16 adst second pass (defined elsewhere)
+ lea tx2q, [o(.end)]
+ jmp m(iadst_8x16_internal_8bpc).pass2_pre
+
+.end: ; write the group kept in stack slots 3..10
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3 ; r3 is presumably set up by the 8x16 code reached from .pass2
+ lea r3, [dstq+8] ; and is re-pointed 8 bytes to the right for use in .end1
+ jmp m(iadst_8x8_internal_8bpc).end
+
+.end1: ; first 16 units are consumed: zero them, then do the same for the next 16
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8 ; advance to units 16..31
+ mov dstq, r3
+
+ mova m4, [coeffq+16*0 ] ; m4-m7 are staging only: they are spilled to slots 7, 8, 5, 6 just below
+ mova m5, [coeffq+16*2 ]
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m6, [coeffq+16*12]
+ mova m7, [coeffq+16*14]
+ mova [rsp+gprsize+16*7], m4
+ mova [rsp+gprsize+16*8], m5
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+ lea tx2q, [o(m(iadst_8x16_internal_8bpc).end)] ; finish in the 8x16 function's own tail
+ jmp m(iadst_8x16_internal_8bpc).pass2_main
+
+
+INV_TXFM_16X16_FN flipadst, dct ; instantiate the 16x16 entry points with type1 = flipadst (no identity second type is instantiated here)
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; flipped-ADST worker for 16x16 blocks: reuses the adst 16-point code; differs from the adst worker in the m7 constant (pw_m8192), the flipadst 8x8 tails and the save locations
+ ITX_16X16_ADST_LOAD_ODD_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mov r3, tx2q ; keep the incoming continuation while tx2q is borrowed
+ lea tx2q, [o(.pass1_end)]
+ mova m7, [o(pw_m8192)] ; constant handed to the shared flipadst pass1_end1 code in m7
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end: ; register half of the odd run is done; now the half left in stack slots 3..10
+ SAVE_8ROWS coeffq+16*1, 32 ; note: the adst worker stores this group at 16*17 instead
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7 ; eighth row is handed over in stack slot 0
+ lea tx2q, [o(.pass1_end1)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1: ; second run, on the even units
+ SAVE_8ROWS coeffq+16*17, 32
+ ITX_16X16_ADST_LOAD_EVEN_COEFS
+ call m(iadst_16x8_internal_8bpc).main
+ call m(iadst_16x8_internal_8bpc).main_pass1_end
+
+ mova m7, [rsp+gprsize+16*0] ; fetch the row .main parked in slot 0 so all eight can be saved
+ SAVE_8ROWS coeffq+16*0, 32 ; set the register half aside; the stack half goes first
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2: ; last trip of the first pass, on the half that was set aside
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS coeffq+16* 0, 32
+ mova [rsp+gprsize+16*0], m7
+ mov tx2q, r3 ; afterwards control goes to the original continuation
+ mova m7, [o(pw_m8192)]
+ jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
+
+.pass2: ; second pass, built on the 8x16 flipadst second pass (defined elsewhere)
+ lea tx2q, [o(.end)]
+ lea r3, [dstq+8] ; remember where the second 8-byte-wide half starts
+ jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
+
+.end: ; write the group kept in stack slots 3..10
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ lea dstq, [dstq+strideq*2] ; step dstq down by two more strides before the shared .end code
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+.end1: ; first 16 units are consumed: zero them, then do the same for the next 16
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8 ; advance to units 16..31
+
+ mova m4, [coeffq+16*0 ] ; m4-m7 are staging only: they are spilled to slots 7, 8, 5, 6 just below
+ mova m5, [coeffq+16*2 ]
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*6 ]
+ mova m2, [coeffq+16*8 ]
+ mova m3, [coeffq+16*10]
+ mova m6, [coeffq+16*12]
+ mova m7, [coeffq+16*14]
+ mova [rsp+gprsize+16*7], m4
+ mova [rsp+gprsize+16*8], m5
+ mova [rsp+gprsize+16*5], m6
+ mova [rsp+gprsize+16*6], m7
+
+ lea tx2q, [o(.end2)]
+ mov dstq, r3 ; switch to the half saved in .pass2
+ jmp m(iflipadst_8x16_internal_8bpc).pass2_main
+
+.end2: ; last group, again from stack slots 3..10
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] ; final continuation lives in the 8x16 dct function (not shown here)
+ lea dstq, [dstq+strideq*2]
+ jmp m(iflipadst_8x8_internal_8bpc).end
+
+
+%macro IDTX16B 3 ; src/dst, tmp, pw_1697x16
+ pmulhrsw m%2, m%3, m%1 ; tmp = src scaled by the constant (rounded high multiply)
+ psraw m%2, 1 ; tmp >>= 1 (arithmetic)
+ pavgw m%1, m%2 ; dst = rounded average of src and tmp; the constant survives unless it shares a register with tmp
+%endmacro
+
+INV_TXFM_16X16_FN identity, dct ; instantiate the 16x16 entry points with type1 = identity (only dct and identity second types are instantiated here)
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; identity worker for 16x16 blocks: .pass1 runs four times, on the 16-byte units starting at 17, 16, 1 and 0 (stride 32 each)
+ add coeffq, 16*17 ; first group: units 17, 19, ... 31
+ mov r3, tx2q ; keep the incoming continuation while tx2q is borrowed
+ lea tx2q, [o(.pass1_end)]
+
+.pass1: ; scale eight rows at coeffq+32*0..7 with IDTX16B, then hand off to the shared pass1_end3 code
+ mova m6, [o(pw_1697x16)]
+ mova m7, [coeffq+32*6] ; row 6 travels in m7 until it is parked in slot 1 below
+ mova m0, [coeffq+32*0]
+ mova m1, [coeffq+32*1]
+ mova m2, [coeffq+32*2]
+ mova m3, [coeffq+32*3]
+ mova m4, [coeffq+32*4]
+ REPX {IDTX16B x, 5, 6}, 7, 0, 1, 2, 3, 4
+ mova m5, [coeffq+32*5]
+ mova [rsp+gprsize+16*1], m7 ; scaled row 6 -> slot 1; presumably picked up by pass1_end3
+ IDTX16B 5, 7, 6
+ mova m7, [coeffq+32*7]
+ IDTX16B 7, 6, 6 ; tmp and constant share m6 here: fine for the last use, m6 is clobbered
+ jmp m(idct_8x8_internal_8bpc).pass1_end3
+
+.pass1_end:
+ SAVE_8ROWS coeffq, 32 ; write the group back in place
+ sub coeffq, 16 ; next group: units 16, 18, ... 30
+ lea tx2q, [o(.pass1_end1)]
+ jmp .pass1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 15*16 ; next group: units 1, 3, ... 15
+ lea tx2q, [o(.pass1_end2)]
+ jmp .pass1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq, 32
+ sub coeffq, 16 ; last group: units 0, 2, ... 14; coeffq is back at its entry value
+ mov tx2q, r3 ; last run continues at the original continuation
+ jmp .pass1
+
+.pass2: ; second pass: four trips through .end, two per 8-byte-wide half
+ lea r3, [dstq+8] ; remember where the second half starts
+ lea tx2q, [o(.end1)]
+
+.end: ; scale m0-m7 with IDTX16 and pw_2048, then hand off to the shared end3 code; tx2q selects what follows
+ mova [rsp+gprsize+16*0], m7 ; free m7 and m4 for the constant and scratch
+ mova [rsp+gprsize+16*1], m4
+ mova m7, [o(pw_1697x16)]
+ REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3 ; IDTX16 (not IDTX16B) is defined elsewhere in the file
+ mova m4, [o(pw_2048)]
+ pmulhrsw m5, m4
+ pmulhrsw m6, m4
+ mova [rsp+gprsize+16*2], m5 ; finished rows 5 and 6 go to slots 2 and 1
+ mova m5, [rsp+gprsize+16*1] ; row 4, spilled at the top
+ mova [rsp+gprsize+16*1], m6
+ IDTX16 5, 6, 7
+ mova m6, [rsp+gprsize+16*0] ; row 7, spilled at the top
+ IDTX16 6, 7, 7
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6
+ pmulhrsw m4, m5 ; row 4 ends up in m4, replacing the constant
+ mova [rsp+gprsize+16*0], m6 ; finished row 7 -> slot 0
+ jmp m(idct_8x8_internal_8bpc).end3
+
+.end1: ; second group of the first half
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(.end2)]
+ lea dstq, [dstq+strideq*2] ; step dstq down by two more strides
+ jmp .end
+
+.end2: ; first 16 units are consumed: zero them and move on to the second half
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
+ add coeffq, 32*8 ; advance to units 16..31
+ LOAD_8ROWS coeffq, 32
+ lea tx2q, [o(.end3)]
+ mov dstq, r3 ; switch to the half saved in .pass2
+ jmp .end
+
+.end3: ; last group
+ LOAD_8ROWS coeffq+16*1, 32
+ lea tx2q, [o(m(idct_8x16_internal_8bpc).end1)] ; final continuation lives in the 8x16 dct function (not shown here)
+ lea dstq, [dstq+strideq*2]
+ jmp .end
+
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 ; 8x32 dct_dct entry point; 16*36 is the stack-size argument of cglobal
+%if ARCH_X86_32
+ LEA r5, $$ ; 32-bit builds only: r5 = section start, presumably the base used by the o() address macro
+%endif
+ test eobd, eobd
+ jz .dconly ; eob == 0: take the shortcut that uses only the first coefficient
+ call m(idct_8x32_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq] ; only the low lanes matter: the result is broadcast from word 0 below
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd ; eobd is 0 on this path, so this clears the first coefficient
+ pmulhrsw m0, m2
+ psrlw m2, 2 ;pw_2048
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000 ; replicate word 0 across the low four words
+ punpcklwd m0, m0 ; ... and across the whole register
+ mov r3d, 8 ; parameter for the shared 8x8 .loop code (presumably its iteration count)
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
+
+.end:
+ RET
+
+
+
+cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; 8x32 DCT worker: up to four first-pass 8x8 trips fill stack slots with in0..in31, then a 32-point second pass and four 8-row writes
+ cmp eobd, 106
+ jle .fast ; eob <= 106: skip the two trips that would produce in16..in31
+
+ LOAD_8ROWS coeffq+16*3, 64 ; 16-byte units 3, 7, ... 31
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)] ; constant handed to the shared pass1_end1 code in m7
+ lea tx2q, [o(.pass1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1:
+ mova [rsp+gprsize+16*9 ], m0 ;in24
+ mova [rsp+gprsize+16*10], m4 ;in28
+ mova [rsp+gprsize+16*17], m2 ;in26
+ mova [rsp+gprsize+16*18], m6 ;in30
+ mova [rsp+gprsize+16*31], m1 ;in25
+ mova [rsp+gprsize+16*30], m3 ;in27
+ mova [rsp+gprsize+16*27], m5 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ LOAD_8ROWS coeffq+16*2, 64 ; units 2, 6, ... 30
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_1:
+ mova [rsp+gprsize+16*7 ], m0 ;in16
+ mova [rsp+gprsize+16*8 ], m4 ;in20
+ mova [rsp+gprsize+16*15], m2 ;in18
+ mova [rsp+gprsize+16*16], m6 ;in22
+ mova [rsp+gprsize+16*33], m1 ;in17
+ mova [rsp+gprsize+16*28], m3 ;in19
+ mova [rsp+gprsize+16*29], m5 ;in21
+ mova [rsp+gprsize+16*32], m7 ;in23
+
+.fast: ; both paths continue here with the trips that produce in0..in15
+ LOAD_8ROWS coeffq+16*1, 64 ; units 1, 5, ... 29
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ mova [rsp+gprsize+16*5 ], m0 ;in8
+ mova [rsp+gprsize+16*6 ], m4 ;in12
+ mova [rsp+gprsize+16*13], m2 ;in10
+ mova [rsp+gprsize+16*14], m6 ;in14
+ mova [rsp+gprsize+16*21], m1 ;in9
+ mova [rsp+gprsize+16*24], m3 ;in11
+ mova [rsp+gprsize+16*25], m5 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+ LOAD_8ROWS coeffq+16*0, 64 ; units 0, 4, ... 28
+ call m(idct_8x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1: ; m0 (in0) stays in its register for the 8x8 .main call below
+ mova [rsp+gprsize+16*11], m2 ;in2
+ mova [rsp+gprsize+16*12], m6 ;in6
+ mova [rsp+gprsize+16*19], m1 ;in1
+ mova [rsp+gprsize+16*26], m3 ;in3
+ mova [rsp+gprsize+16*23], m5 ;in5
+ mova [rsp+gprsize+16*22], m7 ;in7
+ mova m1, m4 ;in4
+ mova m2, [rsp+gprsize+16*5 ] ;in8
+ mova m3, [rsp+gprsize+16*6 ] ;in12
+
+ cmp eobd, 106
+ jg .full ; same threshold as at entry
+
+ pxor m4, m4 ; reduced path: in16..in28 were never produced, feed zeros instead
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ mova m0, [rsp+gprsize+16*11] ; in2, in6, in10, in14 from slots 11..14
+ mova m1, [rsp+gprsize+16*12]
+ mova m2, [rsp+gprsize+16*13]
+ mova m3, [rsp+gprsize+16*14]
+ pxor m4, m4 ; and zeros in place of in18..in30
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0] ; 16x8 .main leaves its eighth register row in slot 0
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call .main_fast
+ jmp .pass2
+
+.full:
+ mova m4, [rsp+gprsize+16*7 ] ;in16
+ mova m5, [rsp+gprsize+16*8 ] ;in20
+ mova m6, [rsp+gprsize+16*9 ] ;in24
+ mova m7, [rsp+gprsize+16*10] ;in28
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ LOAD_8ROWS rsp+gprsize+16*11, 16 ; slots 11..18 hold in2, in6, ... in30 (see the stores above)
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+ call .main
+
+.pass2:
+ lea r3, [o(.end6)] ; r3 = where to go after the last 8-row group
+
+.end:
+ mova [rsp+gprsize+16*0 ], m7 ; eighth row is handed over in stack slot 0
+ lea tx2q, [o(.end2)]
+
+.end1: ; clear 32 16-byte units of the coefficient buffer, then continue at tx2q
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+
+ jmp tx2q
+
+.end2: ; first group: rows still in m0-m6 plus slot 0
+ lea tx2q, [o(.end3)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end3: ; second group, from slots 11..18
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2] ; step dstq down by two more strides before each further group
+ lea tx2q, [o(.end4)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end4: ; third group, from slots 19..26
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ lea tx2q, [o(.end5)]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end5: ; fourth group, from slots 27..34
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea dstq, [dstq+strideq*2]
+ mov tx2q, r3 ; after this group continue at the address kept in r3 (.end6 when entered via .pass2)
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end6:
+ ret
+
+ALIGN function_align
+cglobal_label .main_veryfast ; reduced front end for the t16..t31 stage: reads only in1, in7, in5, in3 (slots 19, 22, 23, 26) and joins the common code at .main2
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ pmulhrsw m3, m0, [o(pw_4091x8)] ;t30,t31
+ pmulhrsw m0, [o(pw_201x8)] ;t16,t17
+ mova m7, [o(pd_2048)] ; passed as the fifth ITX_MULSUB_2W argument (presumably the rounding term); still in m7 at .main2
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*20], m3 ;t17a
+ mova [rsp+gprsize*2+16*33], m0 ;t30a
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ pmulhrsw m2, m1, [o(pw_3857x8)] ;t28,t29
+ pmulhrsw m1, [o(pw_m1380x8)] ;t18,t19
+ mova [rsp+gprsize*2+16*22], m1 ;t19
+ mova [rsp+gprsize*2+16*31], m2 ;t28
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m2 ;t18a
+ mova [rsp+gprsize*2+16*32], m1 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ pmulhrsw m3, m0, [o(pw_3973x8)] ;t26, t27
+ pmulhrsw m0, [o(pw_995x8)] ;t20, t21
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*24], m3 ;t21a
+ mova [rsp+gprsize*2+16*29], m0 ;t26a
+ mova m2, [rsp+gprsize*2+16*26] ;in3
+ pxor m0, m0 ; m0 and m3 are zero here, where .main_fast has the two in13 products
+ mova m3, m0
+ pmulhrsw m1, m2, [o(pw_4052x8)]
+ pmulhrsw m2, [o(pw_m601x8)]
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main_fast ;bottom half is zero
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ mova m1, [rsp+gprsize*2+16*20] ;in15
+ pmulhrsw m3, m0, [o(pw_4091x8)] ;t31a
+ pmulhrsw m0, [o(pw_201x8)] ;t16a
+ pmulhrsw m2, m1, [o(pw_3035x8)] ;t30a
+ pmulhrsw m1, [o(pw_m2751x8)] ;t17a
+ mova m7, [o(pd_2048)] ; passed as the fifth ITX_MULSUB_2W argument (presumably the rounding term); still in m7 at .main2
+ psubsw m4, m0, m1 ;t17
+ paddsw m0, m1 ;t16
+ psubsw m5, m3, m2 ;t30
+ paddsw m3, m2 ;t31
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*20], m5 ;t17a
+ mova [rsp+gprsize*2+16*33], m4 ;t30a
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ mova m0, [rsp+gprsize*2+16*21] ;in9
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ pmulhrsw m3, m0, [o(pw_3703x8)] ; single-input products replace the ITX_MULSUB_2W pairs of the full .main, whose second inputs (in25, in23) are zero here
+ pmulhrsw m0, [o(pw_1751x8)]
+ pmulhrsw m2, m1, [o(pw_3857x8)]
+ pmulhrsw m1, [o(pw_m1380x8)]
+ psubsw m4, m1, m0 ;t18
+ paddsw m0, m1 ;t19
+ psubsw m5, m2, m3 ;t29
+ paddsw m3, m2 ;t28
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*22], m0 ;t19
+ mova [rsp+gprsize*2+16*31], m3 ;t28
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ mova m1, [rsp+gprsize*2+16*24] ;in11
+ pmulhrsw m3, m0, [o(pw_3973x8)]
+ pmulhrsw m0, [o(pw_995x8)]
+ pmulhrsw m2, m1, [o(pw_3513x8)]
+ pmulhrsw m1, [o(pw_m2106x8)]
+ psubsw m4, m0, m1 ;t21
+ paddsw m0, m1 ;t20
+ psubsw m5, m3, m2 ;t26
+ paddsw m3, m2 ;t27
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m4 ;t26a
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ mova m0, [rsp+gprsize*2+16*25] ;in13
+ mova m2, [rsp+gprsize*2+16*26] ;in3
+ pmulhrsw m3, m0, [o(pw_3290x8)] ; m0-m3 are left in the roles .main2 expects (the full .main arrives there with t22a, t24a, t23a, t25a)
+ pmulhrsw m0, [o(pw_2440x8)]
+ pmulhrsw m1, m2, [o(pw_4052x8)]
+ pmulhrsw m2, [o(pw_m601x8)]
+ jmp .main2
+
+ALIGN function_align
+cglobal_label .main
+ mova m7, [o(pd_2048)]
+ mova m0, [rsp+gprsize*2+16*19] ;in1
+ mova m1, [rsp+gprsize*2+16*20] ;in15
+ mova m2, [rsp+gprsize*2+16*33] ;in17
+ mova m3, [rsp+gprsize*2+16*34] ;in31
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 201, 4091 ;t16a, t31a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3035, 2751 ;t17a, t30a
+ psubsw m4, m0, m2 ;t17
+ paddsw m0, m2 ;t16
+ psubsw m5, m3, m1 ;t30
+ paddsw m3, m1 ;t31
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 799, 4017 ;t17a, t30a
+ mova [rsp+gprsize*2+16*19], m0 ;t16
+ mova [rsp+gprsize*2+16*20], m5 ;t17a
+ mova [rsp+gprsize*2+16*33], m4 ;t30a
+ mova [rsp+gprsize*2+16*34], m3 ;t31
+ mova m0, [rsp+gprsize*2+16*21] ;in9
+ mova m1, [rsp+gprsize*2+16*22] ;in7
+ mova m2, [rsp+gprsize*2+16*31] ;in25
+ mova m3, [rsp+gprsize*2+16*32] ;in23
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 1751, 3703 ;t18a, t29a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3857, 1380 ;t19a, t28a
+ psubsw m4, m2, m0 ;t18
+ paddsw m0, m2 ;t19
+ psubsw m5, m1, m3 ;t29
+ paddsw m3, m1 ;t28
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4017, 799 ;t18a, t29a
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*22], m0 ;t19
+ mova [rsp+gprsize*2+16*31], m3 ;t28
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova m0, [rsp+gprsize*2+16*23] ;in5
+ mova m1, [rsp+gprsize*2+16*24] ;in11
+ mova m2, [rsp+gprsize*2+16*29] ;in21
+ mova m3, [rsp+gprsize*2+16*30] ;in27
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 995, 3973 ;t20a, t27a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 3513, 2106 ;t21a, t26a
+ psubsw m4, m0, m2 ;t21
+ paddsw m0, m2 ;t20
+ psubsw m5, m3, m1 ;t26
+ paddsw m3, m1 ;t27
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3406, 2276 ;t21a, t26a
+ mova [rsp+gprsize*2+16*23], m0 ;t20
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m4 ;t26a
+ mova [rsp+gprsize*2+16*30], m3 ;t27
+ mova m0, [rsp+gprsize*2+16*25] ;in13
+ mova m1, [rsp+gprsize*2+16*26] ;in3
+ mova m2, [rsp+gprsize*2+16*27] ;in29
+ mova m3, [rsp+gprsize*2+16*28] ;in19
+ ITX_MULSUB_2W 0, 3, 4, 5, 7, 2440, 3290 ;t22a, t25a
+ ITX_MULSUB_2W 2, 1, 4, 5, 7, 4052, 601 ;t23a, t24a
+
+.main2:
+ psubsw m4, m2, m0 ;t22
+ paddsw m0, m2 ;t23
+ psubsw m5, m1, m3 ;t25
+ paddsw m3, m1 ;t24
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m2276, 3406 ;t22a, t25a
+ mova m2, [rsp+gprsize*2+16*24] ;t21a
+ psubsw m1, m5, m2 ;t21
+ paddsw m5, m2 ;t22
+ mova [rsp+gprsize*2+16*25], m5 ;t22
+ mova m2, [rsp+gprsize*2+16*29] ;t26a
+ psubsw m5, m4, m2 ;t26
+ paddsw m4, m2 ;t25
+ mova [rsp+gprsize*2+16*28], m4 ;t25
+ ITX_MULSUB_2W 5, 1, 2, 4, 7, m3784, 1567 ;t21a, t26a
+ mova [rsp+gprsize*2+16*24], m5 ;t21a
+ mova [rsp+gprsize*2+16*29], m1 ;t26a
+
+ mova m1, [rsp+gprsize*2+16*23] ;t20
+ mova m5, [rsp+gprsize*2+16*30] ;t27
+ psubsw m2, m0, m1 ;t20a
+ paddsw m0, m1 ;t23a
+ psubsw m6, m3, m5 ;t27a
+ paddsw m3, m5 ;t24a
+ ITX_MULSUB_2W 6, 2, 1, 5, 7, m3784, 1567 ;t20, t27
+ mova [rsp+gprsize*2+16*26], m0 ;t23a
+ mova [rsp+gprsize*2+16*27], m3 ;t24a
+ mova [rsp+gprsize*2+16*30], m2 ;t27
+
+ mova m0, [rsp+gprsize*2+16*20] ;t17a
+ mova m1, [rsp+gprsize*2+16*21] ;t18a
+ mova m2, [rsp+gprsize*2+16*32] ;t29a
+ mova m3, [rsp+gprsize*2+16*33] ;t30a
+ psubsw m4, m0, m1 ;t18
+ paddsw m0, m1 ;t17
+ psubsw m5, m3, m2 ;t29
+ paddsw m3, m2 ;t30
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t18a, t29a
+ mova [rsp+gprsize*2+16*20], m0 ;t17
+ mova [rsp+gprsize*2+16*21], m5 ;t18a
+ mova [rsp+gprsize*2+16*32], m4 ;t29a
+ mova [rsp+gprsize*2+16*33], m3 ;t30
+ mova m0, [rsp+gprsize*2+16*19] ;t16
+ mova m1, [rsp+gprsize*2+16*22] ;t19
+ mova m2, [rsp+gprsize*2+16*31] ;t28
+ mova m3, [rsp+gprsize*2+16*34] ;t31
+ psubsw m4, m0, m1 ;t19a
+ paddsw m0, m1 ;t16a
+ psubsw m5, m3, m2 ;t28a
+ paddsw m3, m2 ;t31a
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1567, 3784 ;t19, t28
+ mova m2, [rsp+gprsize*2+16*15] ;tmp12
+ psubsw m1, m5, m6 ;t20a
+ paddsw m5, m6 ;t19a
+ psubsw m6, m2, m5 ;out19
+ paddsw m2, m5 ;out12
+ mova m5, [rsp+gprsize*2+16*30] ;t27
+ mova [rsp+gprsize*2+16*22], m6 ;out19
+ mova [rsp+gprsize*2+16*15], m2 ;out12
+ psubsw m6, m4, m5 ;t27a
+ paddsw m4, m5 ;t28a
+ ITX_MULSUB_2W 6, 1, 2, 5, 7, 2896, 2896 ;t20, t27
+ mova m2, [rsp+gprsize*2+16*6 ] ;tmp3
+ psubsw m5, m2, m4 ;out28
+ paddsw m2, m4 ;out3
+ mova m4, [rsp+gprsize*2+16*14] ;tmp11
+ mova [rsp+gprsize*2+16*31], m5 ;out28
+ mova [rsp+gprsize*2+16*6 ], m2 ;out3
+ psubsw m5, m4, m6 ;out20
+ paddsw m4, m6 ;out11
+ mova m2, [rsp+gprsize*2+16*7 ] ;tmp4
+ mova [rsp+gprsize*2+16*23], m5 ;out20
+ mova [rsp+gprsize*2+16*14], m4 ;out11
+ psubsw m5, m2, m1 ;out27
+ paddsw m2, m1 ;out4
+ mova m1, [rsp+gprsize*2+16*26] ;t23a
+ mova m4, [rsp+gprsize*2+16*27] ;t24a
+ mova [rsp+gprsize*2+16*30], m5 ;out27
+ mova [rsp+gprsize*2+16*7 ], m2 ;out4
+ psubsw m5, m0, m1 ;t23
+ paddsw m0, m1 ;t16
+ psubsw m2, m3, m4 ;t24
+ paddsw m3, m4 ;t31
+ ITX_MULSUB_2W 2, 5, 4, 6, 7, 2896, 2896 ;t23a, t24a
+ mova m6, [rsp+gprsize*2+16*18] ;tmp15
+ psubsw m4, m6, m0 ;out16
+ paddsw m6, m0 ;out15
+ mova m0, [rsp+gprsize*2+16*3 ] ;tmp0
+ mova m1, [rsp+gprsize*2+16*11] ;tmp8
+ mova [rsp+gprsize*2+16*18], m6 ;out15
+ mova [rsp+gprsize*2+16*19], m4 ;out16
+ psubsw m6, m0, m3 ;out31
+ paddsw m0, m3 ;out0
+ psubsw m4, m1, m2 ;out23
+ paddsw m1, m2 ;out8
+ mova m3, [rsp+gprsize*2+16*10] ;tmp7
+ mova [rsp+gprsize*2+16*34], m6 ;out31
+ mova [rsp+gprsize*2+16*11], m1 ;out8
+ mova [rsp+gprsize*2+16*26], m4 ;out23
+ paddsw m6, m3, m5 ;out7
+ psubsw m3, m5 ;out24
+ mova m1, [rsp+gprsize*2+16*20] ;t17
+ mova m5, [rsp+gprsize*2+16*25] ;t22
+ mova m2, [rsp+gprsize*2+16*17] ;tmp14
+ mova [rsp+gprsize*2+16*27], m3 ;out24
+ psubsw m4, m1, m5 ;t22a
+ paddsw m1, m5 ;t17a
+ psubsw m3, m2, m1 ;out17
+ paddsw m2, m1 ;out14
+ mova m5, [rsp+gprsize*2+16*28] ;t25
+ mova m1, [rsp+gprsize*2+16*33] ;t30
+ mova [rsp+gprsize*2+16*17], m2 ;out14
+ mova [rsp+gprsize*2+16*20], m3 ;out17
+ psubsw m2, m1, m5 ;t25a
+ paddsw m1, m5 ;t30a
+ ITX_MULSUB_2W 2, 4, 3, 5, 7, 2896, 2896 ;t22, t25
+ mova m5, [rsp+gprsize*2+16*4 ] ;tmp1
+ psubsw m3, m5, m1 ;out30
+ paddsw m5, m1 ;out1
+ mova m1, [rsp+gprsize*2+16*12] ;tmp9
+ mova [rsp+gprsize*2+16*33], m3 ;out30
+ mova [rsp+gprsize*2+16*4 ], m5 ;out1
+ psubsw m3, m1, m2 ;out22
+ paddsw m1, m2 ;out9
+ mova m5, [rsp+gprsize*2+16*9 ] ;tmp6
+ mova [rsp+gprsize*2+16*25], m3 ;out22
+ mova [rsp+gprsize*2+16*12], m1 ;out9
+ psubsw m3, m5, m4 ;out25
+ paddsw m5, m4 ;out6
+ mova m4, [rsp+gprsize*2+16*21] ;t18a
+ mova m1, [rsp+gprsize*2+16*24] ;t21a
+ mova m2, [rsp+gprsize*2+16*16] ;tmp13
+ mova [rsp+gprsize*2+16*28], m3 ;out25
+ mova [rsp+gprsize*2+16*9 ], m5 ;out6
+ paddsw m3, m4, m1 ;t18
+ psubsw m4, m1 ;t21
+ psubsw m5, m2, m3 ;out18
+ paddsw m2, m3 ;out13
+ mova m1, [rsp+gprsize*2+16*29] ;t26a
+ mova m3, [rsp+gprsize*2+16*32] ;t29a
+ mova [rsp+gprsize*2+16*21], m5 ;out18
+ mova [rsp+gprsize*2+16*16], m2 ;out13
+ psubsw m5, m3, m1 ;t26
+ paddsw m3, m1 ;t29
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 2896, 2896 ;t21a, t26a
+ mova m2, [rsp+gprsize*2+16*5 ] ;tmp2
+ psubsw m1, m2, m3 ;out29
+ paddsw m2, m3 ;out2
+ mova m3, [rsp+gprsize*2+16*13] ;tmp10
+ mova [rsp+gprsize*2+16*32], m1 ;out29
+ psubsw m7, m3, m5 ;out21
+ paddsw m3, m5 ;out10
+ mova m5, [rsp+gprsize*2+16*8 ] ;tmp5
+ mova [rsp+gprsize*2+16*24], m7 ;out21
+ mova [rsp+gprsize*2+16*13], m3 ;out10
+ psubsw m1, m5, m4 ;out26
+ paddsw m5, m4 ;out5
+ mova m7, m6 ;out7
+ mova m3, [rsp+gprsize*2+16*6 ] ;out3
+ mova m4, [rsp+gprsize*2+16*7 ] ;out4
+ mova [rsp+gprsize*2+16*29], m1 ;out26
+ mova m6, [rsp+gprsize*2+16*9 ] ;out6
+ mova m1, [rsp+gprsize*2+16*4 ] ;out1
+ ret
+
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 ; 32x8 DCT_DCT inverse transform added into dst (8 bpc); reserves 16*36 bytes of stack
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd ; eob == 0 -> DC-only shortcut below
+ jz .dconly
+ call m(idct_32x8_internal_8bpc)
+ RET
+
+.dconly: ; DC-only path: one value is added to every pixel of the block
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq] ; m0 = dc coefficient scaled by pw_2896x8
+ movd m2, [o(pw_8192)] ; first rounding multiplier consumed by .body
+ mov [coeffq], eobd ; eob is 0 on this path, so this zeroes the dc coefficient
+ mov r3d, 8 ; number of rows handled by .loop
+ lea tx2q, [o(.end)] ; .loop leaves through tx2q
+
+.body: ; shared entry: the 32x16 and 32x32 dconly paths jump here with m0, m1, m2, r3d and tx2q already set
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000 ; replicate the low word across the low four words
+ punpcklwd m0, m0 ; ...and then across all eight words of m0
+ pxor m5, m5 ; m5 = 0, used to widen bytes to words
+
+.loop: ; one iteration = one row of 32 pixels
+ mova m1, [dstq+16*0]
+ mova m3, [dstq+16*1]
+ punpckhbw m2, m1, m5 ; widen pixels to 16 bit
+ punpcklbw m1, m5
+ punpckhbw m4, m3, m5
+ punpcklbw m3, m5
+ paddw m2, m0 ; add the broadcast dc value
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2 ; narrow back to bytes with unsigned saturation
+ packuswb m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m3
+ add dstq, strideq ; advance to the next row
+ dec r3d
+ jg .loop
+ jmp tx2q ; continue at the label chosen by whoever entered .body
+
+.end:
+ RET
+
+
+cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; worker for the 32x8 DCT_DCT transform; called from inv_txfm_add_dct_dct_32x8_8bpc
+ LOAD_8ROWS coeffq+16*0, 64 ; rows at coeffq+16*0 with a 64-byte step
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16 ; results go to stack slots starting at 16*3
+
+ LOAD_8ROWS coeffq+16*2, 64 ; rows at coeffq+16*2 with a 64-byte step
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0] ; reload m7 from slot 0 (presumably spilled there by the callee - confirm)
+ SAVE_8ROWS rsp+gprsize+16*11, 16 ; results go to stack slots starting at 16*11
+
+ LOAD_8ROWS coeffq+16*1, 32 ; odd inputs in1..in15, scattered into the slots read by idct_8x32 .main
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ cmp eobd, 106
+ jg .full ; eob > 106: in17..in31 are needed as well
+ call m(idct_8x32_internal_8bpc).main_fast ; variant entered without the in17..in31 slots being filled
+ jmp .pass2
+
+.full:
+ LOAD_8ROWS coeffq+16*17, 32 ; odd inputs in17..in31
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ call m(idct_8x32_internal_8bpc).main
+
+.pass2:
+ mova [rsp+gprsize+16*0 ], m7 ; park m7 in slot 0 before the shared tail
+ lea tx2q, [o(.end)] ; label handed to the shared tail, which is expected to come back here via tx2q
+ jmp m(idct_8x32_internal_8bpc).end1
+
+.end: ; from here the code alternates pass1_end1 / pass2_main once per group of 8 columns
+ mova m7, [o(pw_8192)] ; multiplier passed to pass1_end1 in m7
+ lea tx2q, [o(.end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end1:
+ lea r3, [dstq+8] ; r3 = dst of the next 8-column group
+ lea tx2q, [o(.end2)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*11, 16 ; second group of rows saved on the stack
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end3:
+ mov dstq, r3 ; switch to the next 8 columns
+ add r3, 8
+ lea tx2q, [o(.end4)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end4:
+ LOAD_8ROWS rsp+gprsize+16*19, 16 ; third group of rows
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end5:
+ mov dstq, r3
+ add r3, 8
+ lea tx2q, [o(.end6)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end6:
+ LOAD_8ROWS rsp+gprsize+16*27, 16 ; fourth group of rows
+ mova [rsp+gprsize+16*0 ], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.end7:
+ mov dstq, r3 ; last group: r3 is not advanced any further
+ lea tx2q, [o(.end8)]
+ jmp m(idct_8x8_internal_8bpc).pass2_main
+
+.end8: ; bare ret; other functions in this file also pass this label in tx2q as a return trampoline
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 ; 8x32 identity/identity inverse transform added into dst (8 bpc)
+ mov r5d, 4
+ mov tx2d, 2
+ cmp eobd, 107
+ cmovns tx2d, r5d ; 4 iterations when eob >= 107, otherwise 2
+ mov r3d, tx2d ; r3d = loop counter (eob is not needed after this point)
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] ; tx2q -> a bare ret, so the helpers below return right after their jmp tx2q
+.loop:
+ LOAD_8ROWS coeffq+16*0, 64
+ paddsw m6, [o(pw_5)] ; m6 is biased first because it is spilled and then reused for the constant
+ mova [rsp+16*1], m6
+ mova m6, [o(pw_5)]
+ REPX {paddsw x, m6}, m0, m1, m2, m3, m4, m5, m7 ; add the pw_5 bias to the remaining rows
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ REPX {psraw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7 ; arithmetic shift right by 3 on every row
+ mova [rsp+16*2], m5 ; m5..m7 are handed to .end3 through stack slots 2, 1 and 0
+ mova [rsp+16*1], m6
+ mova [rsp+16*0], m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ pxor m7, m7
+ REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ; zero the eight coefficient rows just consumed
+ add coeffq, 16 ; next group of coefficients
+ dec r3d
+ jg .loop
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 ; 32x8 identity/identity inverse transform added into dst (8 bpc)
+ mov r5d, 4
+ mov tx2d, 2
+ cmp eobd, 107
+ cmovns tx2d, r5d ; 4 column groups when eob >= 107, otherwise 2
+ mov r3d, tx2d ; r3d = loop counter (eob is not needed after this point)
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+.loop: ; one iteration = one group of 8 columns
+ LOAD_8ROWS coeffq+16*0, 16
+ pmulhrsw m6, [o(pw_4096)] ; m6 is scaled first because it is spilled and then reused for the constant
+ mova [rsp+16*1], m6
+ mova m6, [o(pw_4096)]
+ REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7 ; scale the remaining rows by pw_4096
+ lea tx2q, [o(m(idct_32x8_internal_8bpc).end8)] ; bare ret: pass1_end3 returns here right after its jmp tx2q
+ call m(idct_8x8_internal_8bpc).pass1_end3
+
+ mov [rsp+16*3], dstq ; keep dst so the next group can start 8 pixels to the right
+ mova [rsp+16*2], m5 ; m5..m7 are handed to .end3 through stack slots 2, 1 and 0
+ mova [rsp+16*1], m6
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_8x8_internal_8bpc).end4)] ; tail run after .end3 (presumably clears the consumed coefficients - confirm)
+ call m(idct_8x8_internal_8bpc).end3
+
+ add coeffq, 16*8 ; next group of coefficients
+ mov dstq, [rsp+16*3]
+ lea dstq, [dstq+8] ; next 8 columns of dst
+ dec r3d
+ jg .loop ; only exit test. A former extra "jnc .loop" is removed: dec does not write CF, so CF was the (clear) carry of the add above and the branch never let the loop end
+ RET
+
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 ; 16x32 DCT_DCT inverse transform added into dst (8 bpc); reserves 16*36 bytes of stack
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd ; eob == 0 -> DC-only shortcut below
+ jz .dconly
+ call m(idct_16x32_internal_8bpc)
+.end: ; also the label handed to the shared 16x4 dconly code through tx2q
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq] ; m0 = dc coefficient scaled by pw_2896x8
+ movd m2, [o(pw_16384)] ; rounding multiplier handed to the shared code in m2
+ mov [coeffq], eobd ; eob is 0 on this path, so this zeroes the dc coefficient
+ pmulhrsw m0, m1 ; second pw_2896x8 scaling (not present in the 32x32 / 16x64 dconly paths)
+ mov r2d, 16 ; counter handed to the 16x4 dconly code in r2d
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+
+
+cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; worker for the 16x32 DCT_DCT transform; called from inv_txfm_add_dct_dct_16x32_8bpc
+ LOAD_8ROWS coeffq+16*1, 128, 1 ; first pass-1 group (its results become in8..in15 of pass 2)
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*5, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end)] ; pass1_end comes back to the label in tx2q
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end:
+ SAVE_8ROWS coeffq+16*33, 64 ;in8~in15
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1: ; scatter the results to where pass 2 (left 8 columns) expects its inputs
+ mova [coeffq+16*1 ], m0 ;in8
+ mova [coeffq+16*5 ], m4 ;in12
+ mova [rsp+gprsize+16*13], m2 ;in10
+ mova [rsp+gprsize+16*14], m6 ;in14
+ mova [rsp+gprsize+16*21], m1 ;in9
+ mova [rsp+gprsize+16*24], m3 ;in11
+ mova [rsp+gprsize+16*25], m5 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+ LOAD_8ROWS coeffq+16*0, 128, 1 ; second pass-1 group (its results become in0..in7)
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*4, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*32, 64 ;in0~in7
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3: ; m0 (in0) and m4 (in4) stay in registers for the branch below
+ mova [rsp+gprsize+16*11], m2 ;in2
+ mova [rsp+gprsize+16*12], m6 ;in6
+ mova [rsp+gprsize+16*19], m1 ;in1
+ mova [rsp+gprsize+16*26], m3 ;in3
+ mova [rsp+gprsize+16*23], m5 ;in5
+ mova [rsp+gprsize+16*22], m7 ;in7
+
+ cmp eobd, 150
+ jg .full ; eob > 150: the remaining two pass-1 groups must be computed too
+
+ mova m1, m4 ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*5 ] ;in12
+ pxor m4, m4 ; in16..in28 are fed as zero on this path
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [rsp+gprsize+16*11] ;in2
+ mova m1, [rsp+gprsize+16*12] ;in6
+ mova m2, [rsp+gprsize+16*13] ;in10
+ mova m3, [rsp+gprsize+16*14] ;in14
+ pxor m4, m4 ; in18..in30 are fed as zero on this path
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp .pass2
+
+.full:
+ mova [coeffq+16*0 ], m0 ;in0
+ mova [coeffq+16*4 ], m4 ;in4
+
+ LOAD_8ROWS coeffq+16*2, 128, 1 ; third pass-1 group (its results become in16..in23)
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*6, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+16*34, 64 ;in16~in23
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end5:
+ mova [coeffq+16*2 ], m0 ;in16
+ mova [coeffq+16*6 ], m4 ;in20
+ mova [rsp+gprsize+16*15], m2 ;in18
+ mova [rsp+gprsize+16*16], m6 ;in22
+ mova [rsp+gprsize+16*33], m1 ;in17
+ mova [rsp+gprsize+16*28], m3 ;in19
+ mova [rsp+gprsize+16*29], m5 ;in21
+ mova [rsp+gprsize+16*32], m7 ;in23
+
+ LOAD_8ROWS coeffq+16*3, 128, 1 ; fourth pass-1 group (its results become in24..in31)
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+16*7, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end6:
+ SAVE_8ROWS coeffq+16*35, 64 ;in24~in31
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end7:
+ mova [rsp+gprsize+16*17], m2 ;in26
+ mova [rsp+gprsize+16*18], m6 ;in30
+ mova [rsp+gprsize+16*31], m1 ;in25
+ mova [rsp+gprsize+16*30], m3 ;in27
+ mova [rsp+gprsize+16*27], m5 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ mova m6, m0 ;in24
+ mova m7, m4 ;in28
+ mova m0, [coeffq+16*0 ] ;in0
+ mova m1, [coeffq+16*4 ] ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*5 ] ;in12
+ mova m4, [coeffq+16*2 ] ;in16
+ mova m5, [coeffq+16*6 ] ;in20
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3 , 16
+ LOAD_8ROWS rsp+gprsize+16*11, 16 ; in2, in6, ..., in30 were collected in slots 11..18 above
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main
+
+.pass2:
+ mov [rsp+gprsize*1+16*35], eobd ; save eob; r3 is reused just below and .end reloads eob from this slot
+ lea r3, [dstq+8]
+ mov [rsp+gprsize*2+16*35], r3 ; save dst of the right 8 columns, reloaded in .end
+ lea r3, [o(.end)] ; label handed to the shared 8x32 tail in r3
+ jmp m(idct_8x32_internal_8bpc).end
+
+.end: ; second half: pass 2 for the right 8 columns, inputs read from coeffq+16*32 onwards
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov eobd, [rsp+gprsize*1+16*35]
+ add coeffq, 16*32
+
+ mova m0, [coeffq+16*4 ] ;in1
+ mova m1, [coeffq+16*12] ;in3
+ mova m2, [coeffq+16*20] ;in5
+ mova m3, [coeffq+16*28] ;in7
+ mova m4, [coeffq+16*5 ] ;in9
+ mova m5, [coeffq+16*13] ;in11
+ mova m6, [coeffq+16*21] ;in13
+ mova m7, [coeffq+16*29] ;in15
+
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mova m0, [coeffq+16*0 ] ;in0
+ mova m1, [coeffq+16*16] ;in4
+ mova m2, [coeffq+16*1 ] ;in8
+ mova m3, [coeffq+16*17] ;in12
+
+ cmp eobd, 150
+ jg .full1 ; same eob threshold as for the left half
+
+ pxor m4, m4 ; in16..in28 are fed as zero on this path
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ] ;in2
+ mova m1, [coeffq+16*24] ;in6
+ mova m2, [coeffq+16*9 ] ;in10
+ mova m3, [coeffq+16*25] ;in14
+ pxor m4, m4 ; in18..in30 are fed as zero on this path
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp m(idct_8x32_internal_8bpc).pass2 ; tail-jump into the 8x32 code to finish this half
+
+.full1:
+ mova m4, [coeffq+16*2 ] ;in16
+ mova m5, [coeffq+16*18] ;in20
+ mova m6, [coeffq+16*3 ] ;in24
+ mova m7, [coeffq+16*19] ;in28
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ] ;in2
+ mova m1, [coeffq+16*24] ;in6
+ mova m2, [coeffq+16*9 ] ;in10
+ mova m3, [coeffq+16*25] ;in14
+ mova m4, [coeffq+16*10] ;in18
+ mova m5, [coeffq+16*26] ;in22
+ mova m6, [coeffq+16*11] ;in26
+ mova m7, [coeffq+16*27] ;in30
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*6 ] ;in17
+ mova m1, [coeffq+16*14] ;in19
+ mova m2, [coeffq+16*22] ;in21
+ mova m3, [coeffq+16*30] ;in23
+ mova m4, [coeffq+16*7 ] ;in25
+ mova m5, [coeffq+16*15] ;in27
+ mova m6, [coeffq+16*23] ;in29
+ mova m7, [coeffq+16*31] ;in31
+
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp m(idct_8x32_internal_8bpc).pass2 ; tail-jump into the 8x32 code to finish this half
+
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 ; 32x16 DCT_DCT inverse transform added into dst (8 bpc); reserves 16*36 bytes of stack
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd ; eob == 0 -> DC-only shortcut below
+ jz .dconly
+
+ call m(idct_32x16_internal_8bpc) ; pass 1 for all four 8-column groups
+ call m(idct_8x16_internal_8bpc).pass2 ; pass 2 for the first 8-column group
+
+ add coeffq, 16*16 ; second 8-column group
+ lea dstq, [r3+8] ; r3 is presumably left at the previous group's dst by the .pass2 call - confirm
+ LOAD_8ROWS rsp+16*11, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] ; that label is a bare ret, so pass1_end returns here
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16 ; third 8-column group
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*19, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+
+ add coeffq, 16*16 ; fourth 8-column group
+ lea dstq, [r3+8]
+ LOAD_8ROWS rsp+16*27, 16
+ mova [rsp+16*0], m7
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)]
+ call m(idct_8x8_internal_8bpc).pass1_end
+ call m(idct_8x16_internal_8bpc).pass2
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq] ; m0 = dc coefficient scaled by pw_2896x8
+ movd m2, [o(pw_16384)] ; first rounding multiplier consumed by the shared .body
+ mov [coeffq], eobd ; eob is 0 on this path, so this zeroes the dc coefficient
+ pmulhrsw m0, m1 ; second pw_2896x8 scaling (not present in the 32x8 / 32x32 dconly paths)
+ mov r3d, 16 ; number of rows for the shared loop
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] ; the shared loop leaves through tx2q
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; pass-1 worker for the 32x16 DCT_DCT transform; runs .pass1 twice, at coeffq+16 and then at coeffq
+ add coeffq, 16 ; first run works on the coefficients at coeffq+16
+ lea r3, [o(.pass1_end1)] ; r3 selects where .pass1_end continues: store results (first run) or return (second run)
+.pass1:
+ LOAD_8ROWS coeffq+16*0, 128, 1
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ LOAD_8ROWS coeffq+16*4, 128, 1
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+16*2, 64, 1 ; odd inputs in1..in15, scattered into the slots read by idct_8x32 .main
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ LOAD_8ROWS coeffq+16*34, 64, 1 ; odd inputs in17..in31
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+ call m(idct_8x32_internal_8bpc).main ; always the full variant: eob is not tested in this function
+
+.pass1_end:
+ mova [rsp+gprsize+16*0 ], m7
+ mov tx2q, r3 ; continue at the label selected for this run
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1: ; first run only: write the four 8-row groups back into the coefficient buffer
+ SAVE_8ROWS coeffq+16*0, 32
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+16*16, 32
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+16*32, 32
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0 ], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+16*48, 32
+
+ sub coeffq, 16 ; second run works on the coefficients at the original coeffq
+ lea r3, [o(.end)] ; this time .pass1_end goes to .end, leaving the remaining groups in stack slots 11.., 19.. and 27.. for the caller
+ jmp .pass1
+
+.end: ; bare ret; other functions in this file also pass this label in tx2q as a return trampoline
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 ; 16x32 identity/identity inverse transform added into dst (8 bpc)
+ mov r4d, eobd ;keep a copy of eob: r3d is overwritten by the first sbb
+ cmp eobd, 43 ;CF = (eob < 43)
+ sbb r3d, r3d ; r3d = -CF
+ cmp r4d, 150 ;CF = (eob < 150)
+ sbb r3d, 0 ; r3d -= CF
+ cmp r4d, 278 ;CF = (eob < 278)
+ sbb r3d, -4 ; r3d += 4 - CF, so r3d = 1..4 = 1 + number of thresholds with eob >= threshold
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4 ; dst of the right 8 columns, used for the second half
+ mov [rsp+gprsize+16*3], r3d ; iteration count, reloaded for the second half
+ mov [rsp+gprsize*2+16*3], coeffq ; original coefficient pointer
+
+.loop:
+ LOAD_8ROWS coeffq, 64, 1
+ mova [rsp+16*1], m6
+ pxor m6, m6
+ REPX {mova [coeffq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 ; zero the eight coefficient rows just loaded
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] ; bare ret: pass1_end3 returns right after its jmp tx2q
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ mova [rsp+16*0], m2 ; spill m2..m4 so they can serve as scratch and constants for IDTX16
+ mova [rsp+16*1], m3
+ mova [rsp+16*2], m4
+ mova m3, [o(pw_1697x16)]
+ mova m4, [o(pw_16384)]
+ REPX {IDTX16 x, 2, 3, 4}, 5, 6, 7, 0, 1 ; IDTX16 (macro defined elsewhere) applied to m5, m6, m7, m0, m1
+ mova m2, [o(pw_8192)]
+ REPX {pmulhrsw x, m2}, m5, m6, m7, m0, m1
+ mova m2, [rsp+16*0] ; bring back the spilled rows one at a time, parking finished rows in their slots
+ mova [rsp+16*0], m7
+ IDTX16 2, 7, 3, 4
+ mova m7, [rsp+16*2]
+ mova [rsp+16*2], m5
+ IDTX16 7, 5, 3, 4
+ mova m5, [rsp+16*1]
+ mova [rsp+16*1], m6
+ pmulhrsw m3, m5 ; last row is processed by hand: m3 holds pw_1697x16 and is consumed here
+ pmulhrsw m3, m4
+ psrlw m4, 1 ; pw_8192
+ paddsw m3, m5
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ pmulhrsw m4, m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ add coeffq, 16 ; next group of coefficients
+ dec r3d
+ jg .loop
+ mov coeffq, [rsp+gprsize*2+16*3] ; this half is done: restart from the saved coefficient pointer...
+ add coeffq, 64*8 ; ...offset by 64*8 bytes for the second half
+ mov r3d, [rsp+gprsize+16*3] ; saved iteration count; reads back as 0 once the second half has run
+ xor dstq, dstq
+ mov [rsp+gprsize+16*3], dstq ; zero the saved count so the next arrival here ends the function
+ mov dstq, [rsp+16*3] ; dst of the right 8 columns
+ test r3d, r3d
+ jnz .loop
+ RET
+
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2 ; 32x16 identity/identity inverse transform added into dst (8 bpc)
+ mov r4d, 12 ;1100b
+ mov r5d, 136 ;1000 1000b
+ cmp eobd, 44 ;if (eob > 43)
+ cmovns r4d, r5d ; use the 136 mask instead of 12
+ cmp eobd, 151 ;if (eob > 150)
+ mov r3d, 34952 ;1000 1000 1000 1000b
+ cmovs r3d, r4d ;eob <= 150: use the mask selected above (mov does not change the flags)
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4 ; dst of the next 8-column group
+
+.loop:
+ LOAD_8ROWS coeffq, 32, 1
+ REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 ; double every row with signed saturation
+ mova [rsp+16*1], m6
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] ; bare ret: pass1_end3 returns right after its jmp tx2q
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ mova [rsp+16*1], m5 ; spill m5/m6 so they can serve as scratch and constant for IDTX16
+ mova [rsp+16*2], m6
+ mova m6, [o(pw_1697x16)]
+ REPX {IDTX16 x, 5, 6}, 7, 0, 1, 2, 3, 4 ; IDTX16 (macro defined elsewhere) applied to m7, m0..m4
+ pmulhrsw m7, [o(pw_2048)] ; m7 is finished first because it is parked in slot 0 below
+ mova m5, [rsp+16*1]
+ mova [rsp+16*0], m7
+ IDTX16 5, 7, 6
+ mova m7, [rsp+16*2]
+ IDTX16 7, 6, 6
+ mova m6, [o(pw_2048)]
+ REPX {pmulhrsw x, m6}, m0, m1, m2, m3, m4, m5, m7
+ mova [rsp+16*2], m5
+ mova [rsp+16*1], m7
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+ pxor m7, m7
+ REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ; zero the eight coefficient rows just consumed
+
+.loop_end: ; r3d is a control mask consumed two bits per iteration
+ add coeffq, 16
+ shr r3d, 2
+ jz .ret ; mask exhausted: done
+ test r3d, 2
+ jnz .loop ; bit 1 set: keep going with the current dst
+ mov r4d, r3d
+ and r4d, 1 ; bit 0 selects an extra 8 bytes in the coeffq step below
+ lea coeffq, [coeffq+r4*8+32*7]
+ mov dstq, [rsp+16*3] ; move on to the next 8-column group
+ lea r4, [dstq+8]
+ mov [rsp+16*3], r4
+ jmp .loop
+
+.ret:
+ RET
+
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2 ; 32x32 DCT_DCT inverse transform added into dst (8 bpc); reserves 16*36 bytes of stack
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd ; eob == 0 -> DC-only shortcut below
+ jz .dconly
+
+ call m(idct_32x32_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq] ; m0 = dc coefficient scaled by pw_2896x8
+ movd m2, [o(pw_8192)] ; first rounding multiplier consumed by the shared .body
+ mov [coeffq], eobd ; eob is 0 on this path, so this zeroes the dc coefficient
+ mov r3d, 32 ; number of rows for the shared loop
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)] ; the shared loop leaves through tx2q
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; worker for the 32x32 DCT_DCT transform: pass 1 into the coefficient buffer, then pass 2 per 8-column group
+ mov r4d, 2
+ sub eobd, 136 ; sign of (eob - 136) selects the fast (negative) or full variant
+ mov [rsp+gprsize*1+16*35], eobd ; keep eob - 136: both passes re-test its sign
+ mov r3d, 4 ; mov leaves the flags of the sub intact for the cmovs
+ cmovs r3d, r4d ; pass-1 iterations: 2 when eob < 136, otherwise 4
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*35], coeffq ; original coefficient pointer, restored at .pass2
+
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*1, 64*2 ; odd inputs in1..in15, scattered into the slots read by idct_8x32 .main
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov tx2d, [rsp+gprsize*1+16*35]
+ test tx2d, tx2d
+ jl .fast ; eob < 136
+
+.full:
+ LOAD_8ROWS coeffq+64*0, 64*4
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*2, 64*4
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+64*17, 64*2 ; odd inputs in17..in31
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp .pass1_end
+
+.fast: ; only the first four inputs of each sub-transform are loaded, the rest are fed as zero
+ mova m0, [coeffq+256*0]
+ mova m1, [coeffq+256*1]
+ mova m2, [coeffq+256*2]
+ mova m3, [coeffq+256*3]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ mova m0, [coeffq+128*1]
+ mova m1, [coeffq+128*3]
+ mova m2, [coeffq+128*5]
+ mova m3, [coeffq+128*7]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+
+.pass1_end: ; write the four 8-row result groups back into the coefficient buffer
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)] ; multiplier passed to pass1_end1 in m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*16, 64
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+64*24, 64
+
+ add coeffq, 16 ; next pass-1 group
+ dec r3d
+ jg .pass1_loop
+
+
+.pass2:
+ mov coeffq, [rsp+gprsize*2+16*35] ; back to the start of the coefficient buffer
+ mov r3d, 4 ; pass 2 always runs four times, once per 8-column group
+ lea tx2q, [o(.pass2_end)]
+
+.pass2_loop:
+ mov [rsp+gprsize*3+16*35], r3d ; save the loop counter; r3 is reused below
+ lea r3, [dstq+8]
+ mov [rsp+gprsize*2+16*35], r3 ; dst of the next 8-column group (this slot no longer holds coeffq)
+
+ mova m0, [coeffq+16*4 ]
+ mova m1, [coeffq+16*12]
+ mova m2, [coeffq+16*20]
+ mova m3, [coeffq+16*28]
+ mova m4, [coeffq+16*5 ]
+ mova m5, [coeffq+16*13]
+ mova m6, [coeffq+16*21]
+ mova m7, [coeffq+16*29]
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov eobd, [rsp+gprsize*1+16*35] ; eob - 136 saved at function entry
+ test eobd, eobd
+ jl .fast1 ; eob < 136
+
+.full1:
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*16]
+ mova m2, [coeffq+16*1 ]
+ mova m3, [coeffq+16*17]
+ mova m4, [coeffq+16*2 ]
+ mova m5, [coeffq+16*18]
+ mova m6, [coeffq+16*3 ]
+ mova m7, [coeffq+16*19]
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova m4, [coeffq+16*10]
+ mova m5, [coeffq+16*26]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*27]
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*6 ]
+ mova m1, [coeffq+16*14]
+ mova m2, [coeffq+16*22]
+ mova m3, [coeffq+16*30]
+ mova m4, [coeffq+16*7 ]
+ mova m5, [coeffq+16*15]
+ mova m6, [coeffq+16*23]
+ mova m7, [coeffq+16*31]
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main
+ jmp tx2q ; tx2q = .pass2_end
+
+.fast1: ; as in pass 1: only the first four inputs of each sub-transform are loaded
+ mova m0, [coeffq+16*0 ]
+ mova m1, [coeffq+16*16]
+ mova m2, [coeffq+16*1 ]
+ mova m3, [coeffq+16*17]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ jmp tx2q ; tx2q = .pass2_end
+
+.pass2_end:
+ lea r3, [o(.pass2_end1)] ; label handed to the shared 8x32 tail in r3
+ jmp m(idct_8x32_internal_8bpc).end
+
+.pass2_end1:
+ lea tx2q, [o(.pass2_end)] ; set tx2q again for the next iteration
+ add coeffq, 16*32 ; next 8-column group of pass-1 results
+ mov dstq, [rsp+gprsize*2+16*35]
+ mov r3d, [rsp+gprsize*3+16*35]
+ dec r3d
+ jg .pass2_loop
+
+ ret
+
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2 ; 32x32 identity/identity inverse transform added into dst (8 bpc)
+ mov r4d, 2
+ cmp eobd, 136
+ mov r3d, 4 ; mov leaves the flags of the cmp intact for the cmovs
+ cmovs r3d, r4d ; r3d = 2 when eob < 136, otherwise 4
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*0+16*3], r4 ; slot 0: dst of the next 8-column group
+ mov [rsp+gprsize*1+16*3], r3d ; slot 1: inner-loop count, reloaded for every column group
+ mov [rsp+gprsize*2+16*3], r3d ; slot 2: number of column groups still to process
+ mov [rsp+gprsize*3+16*3], coeffq ; slot 3: coefficient pointer of the current column group
+
+.loop:
+ LOAD_8ROWS coeffq, 64
+ mova [rsp+16*1], m6
+ lea tx2q, [o(m(idct_32x16_internal_8bpc).end)] ; bare ret: pass1_end3 returns right after its jmp tx2q
+ call m(idct_8x8_internal_8bpc).pass1_end3
+ pmulhrsw m7, [o(pw_8192)] ; m7 is scaled first because it is parked and then reused for the constant
+ mova [rsp+16*0], m7
+ mova m7, [o(pw_8192)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 ; scale the remaining rows by pw_8192
+ mova [rsp+16*1], m6
+ mova [rsp+16*2], m5
+ call m(idct_8x8_internal_8bpc).end3
+ lea dstq, [dstq+strideq*2]
+
+ pxor m7, m7
+ REPX {mova [coeffq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ; zero the eight coefficient rows just consumed
+
+ add coeffq, 16 ; next group of coefficients
+ dec r3d
+ jg .loop
+
+ mov r4d, [rsp+gprsize*2+16*3] ; inner loop finished: one column group fewer to go
+ dec r4d
+ jle .ret
+
+ mov dstq, [rsp+gprsize*0+16*3] ; move on to the next 8-column group
+ mov coeffq, [rsp+gprsize*3+16*3]
+ mov [rsp+gprsize*2+16*3], r4
+ lea r3, [dstq+8]
+ add coeffq, 64*8
+ mov [rsp+gprsize*0+16*3], r3
+ mov r3d, [rsp+gprsize*1+16*3] ; restart the inner-loop counter
+ mov [rsp+gprsize*3+16*3], coeffq
+ jmp .loop
+
+.ret:
+ RET
+
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2 ; 16x64 DCT_DCT inverse transform added into dst (8 bpc); reserves 16*68 bytes of stack
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd ; eob == 0 -> DC-only shortcut below
+ jz .dconly
+ call m(idct_16x64_internal_8bpc)
+.end: ; also the label handed to the shared 16x4 dconly code through tx2q
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq] ; m0 = dc coefficient scaled by pw_2896x8
+ movd m2, [o(pw_8192)] ; rounding multiplier handed to the shared code in m2
+ mov [coeffq], eobd ; eob is 0 on this path, so this zeroes the dc coefficient
+ mov r2d, 32 ; counter handed to the 16x4 dconly code in r2d
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
+
+ ; Two-pass 16x64 inverse DCT body, entered by call: pass 1 rewrites the coefficient buffer in place, pass 2 runs twice and outputs to dst (dst advances by 8 between runs).
+cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r4d, 2
+ sub eobd, 151 ; eob -= 151: the sign flag drives the cmovs below; the biased value is kept for pass 2's .fast test
+ mov [rsp+gprsize*1+16*67], eobd ; NOTE(review): eobd presumably aliases r3d (x86inc argument order), hence it is saved before r3d is reused - confirm
+ mov r3d, 4 ; pass-1 iteration count: 4 by default (mov leaves the flags of the sub intact)
+ cmovs r3d, r4d ; ...or 2 when eob - 151 is negative
+
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq ; remember the coefficient base; restored after pass 1
+ ; Pass 1: each iteration processes one 16-byte-wide slice of the coefficient buffer and stores the result back in place.
+.pass1_loop:
+ LOAD_8ROWS coeffq+64*0, 64*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_8ROWS coeffq+64*1, 64*2
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)] ; continuation label for the shared idct_8x8 .pass1_end1 code
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*8, 64
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64
+
+ add coeffq, 16 ; next 16-byte-wide slice
+ dec r3d
+ jg .pass1_loop
+ ; Pass 2 setup: two iterations; the saved-pointer slot now holds dst+8 for the second one.
+ mov coeffq, [rsp+gprsize*2+16*67] ; restore the coefficient base saved before pass 1
+ mov r3d, 2
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.end1)] ; r4 = continuation address; moved into r3 at .end before jumping to idct_8x32 .end2
+
+.pass2_loop:
+ mov [rsp+gprsize*3+16*67], r3d ; spill the loop counter
+ mov eobd, [rsp+gprsize*1+16*67] ; reload the biased eob (eob - 151) saved at entry
+ ; Stage in1..in15 in the stack slots read by .main/.main_fast (they address the same slots as rsp+gprsize*2+... because their call pushes a return address).
+ mova m0, [coeffq+16*4 ] ;in1
+ mova m1, [coeffq+16*12] ;in3
+ mova m2, [coeffq+16*20] ;in5
+ mova m3, [coeffq+16*28] ;in7
+ mova m4, [coeffq+16*5 ] ;in9
+ mova m5, [coeffq+16*13] ;in11
+ mova m6, [coeffq+16*21] ;in13
+ mova m7, [coeffq+16*29] ;in15
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ pxor m4, m4 ; m4 = 0, used to zero the inputs that are not loaded
+ mova m0, [coeffq+16*0]
+ mova m1, [coeffq+16*1]
+
+ test eobd, eobd
+ jl .fast ; eob - 151 < 0 -> reduced-input path
+ ; Full path: loads four rows per 8-point stage (upper four inputs zeroed) and also stages in17..in31 for .main.
+.full:
+ mova m2, [coeffq+16*2]
+ mova m3, [coeffq+16*3]
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ mova m0, [coeffq+16*16]
+ mova m1, [coeffq+16*17]
+ mova m2, [coeffq+16*18]
+ mova m3, [coeffq+16*19]
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova m4, [coeffq+16*10]
+ mova m5, [coeffq+16*26]
+ mova m6, [coeffq+16*11]
+ mova m7, [coeffq+16*27]
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ mova m0, [coeffq+16*6 ] ;in17
+ mova m1, [coeffq+16*14] ;in19
+ mova m2, [coeffq+16*22] ;in21
+ mova m3, [coeffq+16*30] ;in23
+ mova m4, [coeffq+16*7 ] ;in25
+ mova m5, [coeffq+16*15] ;in27
+ mova m6, [coeffq+16*23] ;in29
+ mova m7, [coeffq+16*31] ;in31
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call .main
+ jmp .end
+ ; Fast path: only two rows per 8-point stage are loaded (the rest are zeroed) and the main_veryfast/.main_fast variants are used.
+.fast:
+ REPX {mova x, m4}, m2, m3, m5, m6, m7
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ mova m0, [coeffq+16*16]
+ mova m1, [coeffq+16*17]
+
+ REPX {mova x, m4}, m2, m3, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ mova m0, [coeffq+16*8 ]
+ mova m1, [coeffq+16*24]
+ mova m2, [coeffq+16*9 ]
+ mova m3, [coeffq+16*25]
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+
+ call m(idct_8x32_internal_8bpc).main_veryfast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+
+ call .main_fast
+ ; Both paths join here: hand the rows in stack slots 3.. to the shared idct_8x32 .end2 code, with r3 = .end1 as the continuation.
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mov r3, r4 ; r3 = address of .end1 (loaded into r4 before each pass-2 iteration)
+ jmp m(idct_8x32_internal_8bpc).end2
+ ; Continuation after idct_8x32 .end2: emit the rows held in stack slots 35..66 through .write, then set up the next iteration.
+.end1:
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ lea dstq, [dstq+strideq*2]
+ lea r3, [rsp+16*32+gprsize] ; r3 = base used by .write/.write_main for spills and the remaining row groups
+ call .write
+ mov dstq, [rsp+gprsize*2+16*67] ; dst for the next iteration (saved as dst+8 earlier)
+ mov r3d, [rsp+gprsize*3+16*67] ; reload the pass-2 loop counter
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4
+ lea r4, [o(.end1)]
+
+ dec r3d
+ jg .pass2_loop
+ ret
+.write: ; zeroes 16*32 bytes of coefficients, then writes four groups of 8 rows (m0-m7 first, then r3+16*11, +16*19, +16*27)
+ mova [r3+16*0], m7 ; spill m7 so the register can hold zero / the pw_2048 scale
+ mov r4, -16*32
+ pxor m7, m7
+ sub coeffq, r4 ; coeffq += 16*32, so [coeffq+r4] starts at the old coeffq
+.zero_loop:
+ mova [coeffq+r4+16*0], m7
+ mova [coeffq+r4+16*1], m7
+ add r4, 16*2
+ jl .zero_loop ; loop until r4 reaches 0; coeffq stays advanced by 16*32 for the next pass-2 iteration
+ call .write_main2 ; m7 was already spilled above, so enter after .write_main's spill
+ LOAD_8ROWS r3+16*11, 16
+ call .write_main
+ LOAD_8ROWS r3+16*19, 16
+ call .write_main
+ LOAD_8ROWS r3+16*27, 16 ; last group falls through into .write_main; its ret returns to .write's caller
+.write_main:
+ mova [r3+16*0], m7
+.write_main2:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [r3+16*0] ; scale the spilled eighth row as well
+ mova [r3+16*2], m5 ; rows 5-7 go to memory and are passed to the second WRITE_8X4 as memory operands
+ mova [r3+16*1], m6
+ mova [r3+16*0], m7
+ WRITE_8X4 0, 1, 2, 3, 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ WRITE_8X4 4, [r3+16*2], [r3+16*1], [r3+16*0], 5, 6, 7
+ lea dstq, [dstq+strideq*2]
+ ret
+
+ ; .main_fast: reduced first stage that reads only the in1..in15 slots (the in17..in31 slots are not read), then joins the common code at .main2.
+ALIGN function_align
+cglobal_label .main_fast
+ mova m0, [rsp+gprsize*2+16*35] ;in1
+ pmulhrsw m3, m0, [o(pw_4095x8)] ;t62,t63
+ pmulhrsw m0, [o(pw_101x8)] ;t32,t33
+ mova m7, [o(pd_2048)] ; m7 is passed to every ITX_MULSUB_2W below and in .main2
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*66], m3 ;t63
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 401, 4076 ;t33a, t62a
+ mova [rsp+gprsize*2+16*36], m3 ;t33a
+ mova [rsp+gprsize*2+16*65], m0 ;t62a
+
+ mova m1, [rsp+gprsize*2+16*37] ;in15
+ pmulhrsw m2, m1, [o(pw_3822x8)] ;t60,t61
+ pmulhrsw m1, [o(pw_m1474x8)] ;t34,t35
+ mova [rsp+gprsize*2+16*38], m1 ;t35
+ mova [rsp+gprsize*2+16*63], m2 ;t60
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m4076, 401 ;t34a, t61a
+ mova [rsp+gprsize*2+16*37], m2 ;t34a
+ mova [rsp+gprsize*2+16*64], m1 ;t61a
+
+ mova m0, [rsp+gprsize*2+16*39] ;in9
+ pmulhrsw m3, m0, [o(pw_3996x8)] ;t58,t59
+ pmulhrsw m0, [o(pw_897x8)] ;t36,t37
+ mova [rsp+gprsize*2+16*39], m0 ;t36
+ mova [rsp+gprsize*2+16*62], m3 ;t59
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3166, 2598 ;t37a, t58a
+ mova [rsp+gprsize*2+16*40], m3 ;t37a
+ mova [rsp+gprsize*2+16*61], m0 ;t58a
+
+ mova m1, [rsp+gprsize*2+16*41] ;in7
+ pmulhrsw m2, m1, [o(pw_4036x8)] ;t56,t57
+ pmulhrsw m1, [o(pw_m700x8)] ;t38,t39
+ mova [rsp+gprsize*2+16*42], m1 ;t39
+ mova [rsp+gprsize*2+16*59], m2 ;t56
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m2598, 3166 ;t38a, t57a
+ mova [rsp+gprsize*2+16*41], m2 ;t38a
+ mova [rsp+gprsize*2+16*60], m1 ;t57a
+
+ mova m0, [rsp+gprsize*2+16*43] ;in5
+ pmulhrsw m3, m0, [o(pw_4065x8)] ;t54,t55
+ pmulhrsw m0, [o(pw_501x8)] ;t40,t41
+ mova [rsp+gprsize*2+16*43], m0 ;t40
+ mova [rsp+gprsize*2+16*58], m3 ;t55
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 1931, 3612 ;t41a, t54a
+ mova [rsp+gprsize*2+16*44], m3 ;t41a
+ mova [rsp+gprsize*2+16*57], m0 ;t54a
+
+ mova m1, [rsp+gprsize*2+16*45] ;in11
+ pmulhrsw m2, m1, [o(pw_3948x8)] ;t52,t53
+ pmulhrsw m1, [o(pw_m1092x8)] ;t42,t43
+ mova [rsp+gprsize*2+16*46], m1 ;t43
+ mova [rsp+gprsize*2+16*55], m2 ;t52
+ ITX_MULSUB_2W 2, 1, 0, 3, 7, m3612, 1931 ;t42a, t53a
+ mova [rsp+gprsize*2+16*45], m2 ;t42a
+ mova [rsp+gprsize*2+16*56], m1 ;t53a
+
+ mova m0, [rsp+gprsize*2+16*47] ;in13
+ pmulhrsw m3, m0, [o(pw_3889x8)] ;t50,t51
+ pmulhrsw m0, [o(pw_1285x8)] ;t44,t45
+ mova m6, m0 ; t44 stays in m6: .main2 reads it from there
+ mova [rsp+gprsize*2+16*54], m3 ;t51
+ ITX_MULSUB_2W 3, 0, 1, 2, 7, 3920, 1189 ;t45a, t50a
+ mova [rsp+gprsize*2+16*48], m3 ;t45a
+ mova [rsp+gprsize*2+16*53], m0 ;t50a
+
+ mova m0, [rsp+gprsize*2+16*49] ;in3
+ pmulhrsw m3, m0, [o(pw_4085x8)] ;t48,t49
+ pmulhrsw m0, [o(pw_m301x8)] ;t46,t47
+ mova m4, m3 ; set up .main2's register inputs: m0 = t47, m3 = t48, m4 = t49, m5 = t46 (m6 = t44 from above)
+ mova m5, m0
+
+ jmp .main2
+ ; .main: full odd-half computation from the in1..in31 stack slots; falls through into .main2, which finishes the butterflies and merges with tmp[0..31] into out0..out63.
+ALIGN function_align
+cglobal_label .main
+ mova m0, [rsp+gprsize*2+16*35] ;in1
+ mova m1, [rsp+gprsize*2+16*65] ;in31
+ pmulhrsw m3, m0, [o(pw_4095x8)] ;t63a
+ pmulhrsw m0, [o(pw_101x8)] ;t32a
+ pmulhrsw m2, m1, [o(pw_2967x8)] ;t62a
+ pmulhrsw m1, [o(pw_m2824x8)] ;t33a
+ mova m7, [o(pd_2048)] ; m7 is passed to every ITX_MULSUB_2W for the rest of the routine
+ psubsw m4, m0, m1 ;t33
+ paddsw m0, m1 ;t32
+ psubsw m5, m3, m2 ;t62
+ paddsw m3, m2 ;t63
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 401, 4076 ;t33a, t62a
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*36], m5 ;t33a
+ mova [rsp+gprsize*2+16*65], m4 ;t62a
+ mova [rsp+gprsize*2+16*66], m3 ;t63
+
+ mova m0, [rsp+gprsize*2+16*63] ;in17
+ mova m1, [rsp+gprsize*2+16*37] ;in15
+ pmulhrsw m3, m0, [o(pw_3745x8)] ;t61a
+ pmulhrsw m0, [o(pw_1660x8)] ;t34a
+ pmulhrsw m2, m1, [o(pw_3822x8)] ;t60a
+ pmulhrsw m1, [o(pw_m1474x8)] ;t35a
+ psubsw m4, m1, m0 ;t34
+ paddsw m0, m1 ;t35
+ psubsw m5, m2, m3 ;t61
+ paddsw m3, m2 ;t60
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m4076, 401 ;t34a, t61a
+ mova [rsp+gprsize*2+16*37], m5 ;t34a
+ mova [rsp+gprsize*2+16*38], m0 ;t35
+ mova [rsp+gprsize*2+16*63], m3 ;t60
+ mova [rsp+gprsize*2+16*64], m4 ;t61a
+
+ mova m0, [rsp+gprsize*2+16*39] ;in9
+ mova m1, [rsp+gprsize*2+16*61] ;in23
+ pmulhrsw m3, m0, [o(pw_3996x8)] ;t59a
+ pmulhrsw m0, [o(pw_897x8)] ;t36a
+ pmulhrsw m2, m1, [o(pw_3461x8)] ;t58a
+ pmulhrsw m1, [o(pw_m2191x8)] ;t37a
+ psubsw m4, m0, m1 ;t37
+ paddsw m0, m1 ;t36
+ psubsw m5, m3, m2 ;t58
+ paddsw m3, m2 ;t59
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3166, 2598 ;t37a, t58a
+ mova [rsp+gprsize*2+16*39], m0 ;t36
+ mova [rsp+gprsize*2+16*40], m5 ;t37a
+ mova [rsp+gprsize*2+16*61], m4 ;t58a
+ mova [rsp+gprsize*2+16*62], m3 ;t59
+
+ mova m0, [rsp+gprsize*2+16*59] ;in25
+ mova m1, [rsp+gprsize*2+16*41] ;in7
+ pmulhrsw m3, m0, [o(pw_3349x8)] ;t57a
+ pmulhrsw m0, [o(pw_2359x8)] ;t38a
+ pmulhrsw m2, m1, [o(pw_4036x8)] ;t56a
+ pmulhrsw m1, [o(pw_m700x8)] ;t39a
+ psubsw m4, m1, m0 ;t38
+ paddsw m0, m1 ;t39
+ psubsw m5, m2, m3 ;t57
+ paddsw m3, m2 ;t56
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m2598, 3166 ;t38a, t57a
+ mova [rsp+gprsize*2+16*41], m5 ;t38a
+ mova [rsp+gprsize*2+16*42], m0 ;t39
+ mova [rsp+gprsize*2+16*59], m3 ;t56
+ mova [rsp+gprsize*2+16*60], m4 ;t57a
+
+ mova m0, [rsp+gprsize*2+16*43] ;in5
+ mova m1, [rsp+gprsize*2+16*57] ;in27
+ pmulhrsw m3, m0, [o(pw_4065x8)] ;t55a
+ pmulhrsw m0, [o(pw_501x8)] ;t40a
+ pmulhrsw m2, m1, [o(pw_3229x8)] ;t54a
+ pmulhrsw m1, [o(pw_m2520x8)] ;t41a
+ psubsw m4, m0, m1 ;t41
+ paddsw m0, m1 ;t40
+ psubsw m5, m3, m2 ;t54
+ paddsw m3, m2 ;t55
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 1931, 3612 ;t41a, t54a
+ mova [rsp+gprsize*2+16*43], m0 ;t40
+ mova [rsp+gprsize*2+16*44], m5 ;t41a
+ mova [rsp+gprsize*2+16*57], m4 ;t54a
+ mova [rsp+gprsize*2+16*58], m3 ;t55
+
+ mova m0, [rsp+gprsize*2+16*55] ;in21
+ mova m1, [rsp+gprsize*2+16*45] ;in11
+ pmulhrsw m3, m0, [o(pw_3564x8)] ;t53a
+ pmulhrsw m0, [o(pw_2019x8)] ;t42a
+ pmulhrsw m2, m1, [o(pw_3948x8)] ;t52a
+ pmulhrsw m1, [o(pw_m1092x8)] ;t43a
+ psubsw m4, m1, m0 ;t42
+ paddsw m0, m1 ;t43
+ psubsw m5, m2, m3 ;t53
+ paddsw m3, m2 ;t52
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, m3612, 1931 ;t42a, t53a
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*46], m0 ;t43
+ mova [rsp+gprsize*2+16*55], m3 ;t52
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+
+ mova m0, [rsp+gprsize*2+16*47] ;in13
+ mova m1, [rsp+gprsize*2+16*53] ;in19
+ pmulhrsw m3, m0, [o(pw_3889x8)] ;t51a
+ pmulhrsw m0, [o(pw_1285x8)] ;t44a
+ pmulhrsw m2, m1, [o(pw_3659x8)] ;t50a
+ pmulhrsw m1, [o(pw_m1842x8)] ;t45a
+ psubsw m4, m0, m1 ;t45
+ paddsw m0, m1 ;t44
+ psubsw m5, m3, m2 ;t50
+ paddsw m3, m2 ;t51
+ ITX_MULSUB_2W 5, 4, 1, 2, 7, 3920, 1189 ;t45a, t50a
+ mova m6, m0 ; t44 stays in m6 for .main2
+ mova [rsp+gprsize*2+16*48], m5 ;t45a
+ mova [rsp+gprsize*2+16*53], m4 ;t50a
+ mova [rsp+gprsize*2+16*54], m3 ;t51
+
+ mova m0, [rsp+gprsize*2+16*51] ;in29
+ mova m1, [rsp+gprsize*2+16*49] ;in3
+ pmulhrsw m3, m0, [o(pw_3102x8)] ;t49a
+ pmulhrsw m0, [o(pw_2675x8)] ;t46a
+ pmulhrsw m2, m1, [o(pw_4085x8)] ;t48a
+ pmulhrsw m1, [o(pw_m301x8)] ;t47a
+ psubsw m5, m1, m0 ;t46
+ paddsw m0, m1 ;t47
+ psubsw m4, m2, m3 ;t49
+ paddsw m3, m2 ;t48
+ ; .main2: common continuation for .main and .main_fast; register inputs are m0 = t47, m3 = t48, m4 = t49, m5 = t46, m6 = t44, m7 = pd_2048.
+ALIGN function_align
+.main2:
+ ITX_MULSUB_2W 4, 5, 1, 2, 7, m1189, 3920 ;t46a, t49a
+ mova m1, [rsp+gprsize*2+16*54] ;t51
+ psubsw m2, m0, m6 ;t44a
+ paddsw m0, m6 ;t47a
+ psubsw m6, m3, m1 ;t51a
+ paddsw m3, m1 ;t48a
+ mova [rsp+gprsize*2+16*50], m0 ;t47a
+ mova [rsp+gprsize*2+16*51], m3 ;t48a
+ ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t44, t51
+ mova [rsp+gprsize*2+16*47], m6 ;t44
+ mova [rsp+gprsize*2+16*54], m2 ;t51
+
+ mova m0, [rsp+gprsize*2+16*48] ;t45a
+ mova m3, [rsp+gprsize*2+16*53] ;t50a
+ psubsw m2, m4, m0 ;t45
+ paddsw m4, m0 ;t46
+ psubsw m6, m5, m3 ;t50
+ paddsw m5, m3 ;t49
+ ITX_MULSUB_2W 6, 2, 0, 3, 7, m2276, 3406 ;t45a, t50a
+ mova [rsp+gprsize*2+16*48], m6 ;t45a
+ mova [rsp+gprsize*2+16*49], m4 ;t46
+ mova [rsp+gprsize*2+16*52], m5 ;t49
+ mova [rsp+gprsize*2+16*53], m2 ;t50a
+
+ mova m0, [rsp+gprsize*2+16*43] ;t40
+ mova m2, [rsp+gprsize*2+16*46] ;t43
+ mova m3, [rsp+gprsize*2+16*55] ;t52
+ mova m1, [rsp+gprsize*2+16*58] ;t55
+ psubsw m4, m0, m2 ;t43a
+ paddsw m0, m2 ;t40a
+ psubsw m5, m1, m3 ;t52a
+ paddsw m1, m3 ;t55a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t43, t52
+ mova [rsp+gprsize*2+16*43], m0 ;t40a
+ mova [rsp+gprsize*2+16*46], m5 ;t43
+ mova [rsp+gprsize*2+16*55], m4 ;t52
+ mova [rsp+gprsize*2+16*58], m1 ;t55a
+
+ mova m0, [rsp+gprsize*2+16*44] ;t41a
+ mova m2, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*57] ;t54a
+ psubsw m4, m0, m2 ;t42
+ paddsw m0, m2 ;t41
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t54
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 3406, 2276 ;t42a, t53a
+ mova [rsp+gprsize*2+16*44], m0 ;t41
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+ mova [rsp+gprsize*2+16*57], m1 ;t54
+
+ mova m0, [rsp+gprsize*2+16*41] ;t38a
+ mova m2, [rsp+gprsize*2+16*40] ;t37a
+ mova m3, [rsp+gprsize*2+16*61] ;t58a
+ mova m1, [rsp+gprsize*2+16*60] ;t57a
+ psubsw m4, m0, m2 ;t37
+ paddsw m0, m2 ;t38
+ psubsw m5, m1, m3 ;t58
+ paddsw m1, m3 ;t57
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t37a, t58a
+ mova [rsp+gprsize*2+16*41], m0 ;t38
+ mova [rsp+gprsize*2+16*40], m5 ;t37a
+ mova [rsp+gprsize*2+16*61], m4 ;t58a
+ mova [rsp+gprsize*2+16*60], m1 ;t57
+
+ mova m0, [rsp+gprsize*2+16*42] ;t39
+ mova m2, [rsp+gprsize*2+16*39] ;t36
+ mova m3, [rsp+gprsize*2+16*62] ;t59
+ mova m1, [rsp+gprsize*2+16*59] ;t56
+ psubsw m4, m0, m2 ;t36a
+ paddsw m0, m2 ;t39a
+ psubsw m5, m1, m3 ;t59a
+ paddsw m1, m3 ;t56a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m4017, 799 ;t36, t59
+ mova [rsp+gprsize*2+16*42], m0 ;t39a
+ mova [rsp+gprsize*2+16*39], m5 ;t36
+ mova [rsp+gprsize*2+16*62], m4 ;t59
+ mova [rsp+gprsize*2+16*59], m1 ;t56a
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32
+ mova m2, [rsp+gprsize*2+16*38] ;t35
+ mova m3, [rsp+gprsize*2+16*63] ;t60
+ mova m1, [rsp+gprsize*2+16*66] ;t63
+ psubsw m4, m0, m2 ;t35a
+ paddsw m0, m2 ;t32a
+ psubsw m5, m1, m3 ;t60a
+ paddsw m1, m3 ;t63a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t35, t60
+ mova [rsp+gprsize*2+16*35], m0 ;t32a
+ mova [rsp+gprsize*2+16*38], m5 ;t35
+ mova [rsp+gprsize*2+16*63], m4 ;t60
+ mova [rsp+gprsize*2+16*66], m1 ;t63a
+
+ mova m0, [rsp+gprsize*2+16*36] ;t33a
+ mova m2, [rsp+gprsize*2+16*37] ;t34a
+ mova m3, [rsp+gprsize*2+16*64] ;t61a
+ mova m1, [rsp+gprsize*2+16*65] ;t62a
+ psubsw m4, m0, m2 ;t34
+ paddsw m0, m2 ;t33
+ psubsw m5, m1, m3 ;t61
+ paddsw m1, m3 ;t62
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 799, 4017 ;t34a, t61a
+ ; m0 = t33, m1 = t62, m5 = t34a, m4 = t61a stay in registers for the next two groups
+ mova m2, [rsp+gprsize*2+16*41] ;t38
+ mova m3, [rsp+gprsize*2+16*60] ;t57
+ psubsw m6, m0, m2 ;t38a
+ paddsw m0, m2 ;t33a
+ psubsw m2, m1, m3 ;t57a
+ paddsw m1, m3 ;t62a
+ mova [rsp+gprsize*2+16*36], m0 ;t33a
+ mova [rsp+gprsize*2+16*65], m1 ;t62a
+ ITX_MULSUB_2W 2, 6, 0, 3, 7, 1567, 3784 ;t38, t57
+ mova [rsp+gprsize*2+16*41], m2 ;t38
+ mova [rsp+gprsize*2+16*60], m6 ;t57
+
+ mova m2, [rsp+gprsize*2+16*40] ;t37a
+ mova m3, [rsp+gprsize*2+16*61] ;t58a
+ psubsw m0, m5, m2 ;t37
+ paddsw m5, m2 ;t34
+ psubsw m1, m4, m3 ;t58
+ paddsw m4, m3 ;t61
+ ITX_MULSUB_2W 1, 0, 2, 3, 7, 1567, 3784 ;t37a, t58a
+ mova [rsp+gprsize*2+16*37], m5 ;t34
+ mova [rsp+gprsize*2+16*64], m4 ;t61
+ mova [rsp+gprsize*2+16*40], m1 ;t37a
+ mova [rsp+gprsize*2+16*61], m0 ;t58a
+
+ mova m0, [rsp+gprsize*2+16*38] ;t35
+ mova m2, [rsp+gprsize*2+16*39] ;t36
+ mova m3, [rsp+gprsize*2+16*62] ;t59
+ mova m1, [rsp+gprsize*2+16*63] ;t60
+ psubsw m4, m0, m2 ;t36a
+ paddsw m0, m2 ;t35a
+ psubsw m5, m1, m3 ;t59a
+ paddsw m1, m3 ;t60a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t36, t59
+ mova [rsp+gprsize*2+16*38], m0 ;t35a
+ mova [rsp+gprsize*2+16*39], m5 ;t36
+ mova [rsp+gprsize*2+16*62], m4 ;t59
+ mova [rsp+gprsize*2+16*63], m1 ;t60a
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32a
+ mova m2, [rsp+gprsize*2+16*42] ;t39a
+ mova m3, [rsp+gprsize*2+16*59] ;t56a
+ mova m1, [rsp+gprsize*2+16*66] ;t63a
+ psubsw m4, m0, m2 ;t39
+ paddsw m0, m2 ;t32
+ psubsw m5, m1, m3 ;t56
+ paddsw m1, m3 ;t63
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, 1567, 3784 ;t39a, t56a
+ mova [rsp+gprsize*2+16*35], m0 ;t32
+ mova [rsp+gprsize*2+16*42], m5 ;t39a
+ mova [rsp+gprsize*2+16*59], m4 ;t56a
+ mova [rsp+gprsize*2+16*66], m1 ;t63
+
+ mova m0, [rsp+gprsize*2+16*50] ;t47a
+ mova m2, [rsp+gprsize*2+16*43] ;t40a
+ mova m3, [rsp+gprsize*2+16*58] ;t55a
+ mova m1, [rsp+gprsize*2+16*51] ;t48a
+ psubsw m4, m0, m2 ;t40
+ paddsw m0, m2 ;t47
+ psubsw m5, m1, m3 ;t55
+ paddsw m1, m3 ;t48
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t40a, t55a
+ mova [rsp+gprsize*2+16*50], m0 ;t47
+ mova [rsp+gprsize*2+16*43], m5 ;t40a
+ mova [rsp+gprsize*2+16*58], m4 ;t55a
+ mova [rsp+gprsize*2+16*51], m1 ;t48
+
+ mova m0, [rsp+gprsize*2+16*49] ;t46
+ mova m2, [rsp+gprsize*2+16*44] ;t41
+ mova m3, [rsp+gprsize*2+16*57] ;t54
+ mova m1, [rsp+gprsize*2+16*52] ;t49
+ psubsw m4, m0, m2 ;t41a
+ paddsw m0, m2 ;t46a
+ psubsw m5, m1, m3 ;t54a
+ paddsw m1, m3 ;t49a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t41, t54
+ mova [rsp+gprsize*2+16*49], m0 ;t46a
+ mova [rsp+gprsize*2+16*44], m5 ;t41
+ mova [rsp+gprsize*2+16*57], m4 ;t54
+ mova [rsp+gprsize*2+16*52], m1 ;t49a
+
+ mova m0, [rsp+gprsize*2+16*48] ;t45a
+ mova m2, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*53] ;t50a
+ psubsw m4, m0, m2 ;t42
+ paddsw m0, m2 ;t45
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t50
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t42a, t53a
+ mova [rsp+gprsize*2+16*48], m0 ;t45
+ mova [rsp+gprsize*2+16*45], m5 ;t42a
+ mova [rsp+gprsize*2+16*56], m4 ;t53a
+ mova [rsp+gprsize*2+16*53], m1 ;t50
+
+ mova m0, [rsp+gprsize*2+16*47] ;t44
+ mova m2, [rsp+gprsize*2+16*46] ;t43
+ mova m3, [rsp+gprsize*2+16*55] ;t52
+ mova m1, [rsp+gprsize*2+16*54] ;t51
+ psubsw m4, m0, m2 ;t43a
+ paddsw m0, m2 ;t44a
+ psubsw m5, m1, m3 ;t52a
+ paddsw m1, m3 ;t51a
+ ITX_MULSUB_2W 5, 4, 2, 3, 7, m3784, 1567 ;t43, t52
+ ; Final stage: combine the t32..t63 terms with tmp[0..31] (stack slots 3..34); sums overwrite the tmp slots, differences land in slots 35..66 as out32..out63.
+ mova m2, [rsp+gprsize*2+16*38] ;t35a
+ mova m3, [rsp+gprsize*2+16*31] ;tmp[28]
+ psubsw m6, m2, m0 ;t44
+ paddsw m2, m0 ;t35
+ psubsw m0, m3, m2 ;out35
+ paddsw m2, m3 ;out28
+ mova m3, [rsp+gprsize*2+16*63] ;t60a
+ mova [rsp+gprsize*2+16*38], m0 ;out35
+ mova [rsp+gprsize*2+16*31], m2 ;out28
+ psubsw m0, m3, m1 ;t51
+ paddsw m3, m1 ;t60
+ ITX_MULSUB_2W 0, 6, 1, 2, 7, 2896, 2896 ;t44a, t51a
+ mova m2, [rsp+gprsize*2+16*6 ] ;tmp[3]
+ psubsw m1, m2, m3 ;out60
+ paddsw m2, m3 ;out3
+ mova m3, [rsp+gprsize*2+16*22] ;tmp[19]
+ mova [rsp+gprsize*2+16*63], m1 ;out60
+ mova [rsp+gprsize*2+16*6 ], m2 ;out3
+ psubsw m1, m3, m0 ;out44
+ paddsw m3, m0 ;out19
+ mova m2, [rsp+gprsize*2+16*15] ;tmp[12]
+
+ mova m0, [rsp+gprsize*2+16*39] ;t36
+ mova [rsp+gprsize*2+16*47], m1 ;out44
+ mova [rsp+gprsize*2+16*22], m3 ;out19
+ mova m1, [rsp+gprsize*2+16*62] ;t59
+ psubsw m3, m2, m6 ;out51
+ paddsw m2, m6 ;out12
+ mova [rsp+gprsize*2+16*54], m3 ;out51
+ mova [rsp+gprsize*2+16*15], m2 ;out12
+ psubsw m2, m0, m5 ;t43a
+ paddsw m0, m5 ;t36a
+ mova m5, [rsp+gprsize*2+16*30] ;tmp[27]
+ psubsw m3, m1, m4 ;t52a
+ paddsw m1, m4 ;t59a
+ ITX_MULSUB_2W 3, 2, 4, 6, 7, 2896, 2896 ;t43, t52
+ mova m4, [rsp+gprsize*2+16*7 ] ;tmp[4 ]
+ psubsw m6, m5, m0 ;out36
+ paddsw m5, m0 ;out27
+ psubsw m0, m4, m1 ;out59
+ paddsw m4, m1 ;out4
+ mova [rsp+gprsize*2+16*39], m6 ;out36
+ mova [rsp+gprsize*2+16*30], m5 ;out27
+ mova [rsp+gprsize*2+16*62], m0 ;out59
+ mova [rsp+gprsize*2+16*7 ], m4 ;out4
+ mova m0, [rsp+gprsize*2+16*23] ;tmp[20]
+ mova m5, [rsp+gprsize*2+16*14] ;tmp[11]
+ psubsw m4, m0, m3 ;out43
+ paddsw m0, m3 ;out20
+ psubsw m6, m5, m2 ;out52
+ paddsw m5, m2 ;out11
+ mova [rsp+gprsize*2+16*46], m4 ;out43
+ mova [rsp+gprsize*2+16*23], m0 ;out20
+ mova [rsp+gprsize*2+16*55], m6 ;out52
+ mova [rsp+gprsize*2+16*14], m5 ;out11
+
+ mova m0, [rsp+gprsize*2+16*40] ;t37a
+ mova m5, [rsp+gprsize*2+16*45] ;t42a
+ mova m3, [rsp+gprsize*2+16*56] ;t53a
+ mova m1, [rsp+gprsize*2+16*61] ;t58a
+ mova m2, [rsp+gprsize*2+16*29] ;tmp[26]
+ psubsw m4, m0, m5 ;t42
+ paddsw m0, m5 ;t37
+ psubsw m5, m1, m3 ;t53
+ paddsw m1, m3 ;t58
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t42a, t53a
+ mova m3, [rsp+gprsize*2+16*8 ] ;tmp[5 ]
+ psubsw m6, m2, m0 ;out37
+ paddsw m2, m0 ;out26
+ psubsw m0, m3, m1 ;out58
+ paddsw m3, m1 ;out5
+ mova [rsp+gprsize*2+16*40], m6 ;out37
+ mova [rsp+gprsize*2+16*29], m2 ;out26
+ mova [rsp+gprsize*2+16*61], m0 ;out58
+ mova [rsp+gprsize*2+16*8 ], m3 ;out5
+ mova m0, [rsp+gprsize*2+16*24] ;tmp[21]
+ mova m1, [rsp+gprsize*2+16*13] ;tmp[10]
+ psubsw m2, m0, m5 ;out42
+ paddsw m0, m5 ;out21
+ psubsw m3, m1, m4 ;out53
+ paddsw m1, m4 ;out10
+ mova [rsp+gprsize*2+16*45], m2 ;out42
+ mova [rsp+gprsize*2+16*24], m0 ;out21
+ mova [rsp+gprsize*2+16*56], m3 ;out53
+ mova [rsp+gprsize*2+16*13], m1 ;out10
+
+ mova m0, [rsp+gprsize*2+16*41] ;t38
+ mova m5, [rsp+gprsize*2+16*44] ;t41
+ mova m3, [rsp+gprsize*2+16*57] ;t54
+ mova m1, [rsp+gprsize*2+16*60] ;t57
+ mova m2, [rsp+gprsize*2+16*28] ;tmp[25]
+ psubsw m4, m0, m5 ;t41a
+ paddsw m0, m5 ;t38a
+ psubsw m5, m1, m3 ;t54a
+ paddsw m1, m3 ;t57a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t41a, t54a
+ mova m3, [rsp+gprsize*2+16*9 ] ;tmp[6 ]
+ psubsw m6, m2, m0 ;out38
+ paddsw m2, m0 ;out25
+ psubsw m0, m3, m1 ;out57
+ paddsw m3, m1 ;out6
+ mova [rsp+gprsize*2+16*41], m6 ;out38
+ mova [rsp+gprsize*2+16*28], m2 ;out25
+ mova [rsp+gprsize*2+16*60], m0 ;out57
+ mova [rsp+gprsize*2+16*9 ], m3 ;out6
+ mova m0, [rsp+gprsize*2+16*25] ;tmp[22]
+ mova m1, [rsp+gprsize*2+16*12] ;tmp[9 ]
+ psubsw m2, m0, m5 ;out41
+ paddsw m0, m5 ;out22
+ psubsw m3, m1, m4 ;out54
+ paddsw m1, m4 ;out9
+ mova [rsp+gprsize*2+16*44], m2 ;out41
+ mova [rsp+gprsize*2+16*25], m0 ;out22
+ mova [rsp+gprsize*2+16*57], m3 ;out54
+ mova [rsp+gprsize*2+16*12], m1 ;out9
+
+ mova m0, [rsp+gprsize*2+16*42] ;t39a
+ mova m5, [rsp+gprsize*2+16*43] ;t40a
+ mova m3, [rsp+gprsize*2+16*58] ;t55a
+ mova m1, [rsp+gprsize*2+16*59] ;t56a
+ mova m2, [rsp+gprsize*2+16*27] ;tmp[24]
+ psubsw m4, m0, m5 ;t40
+ paddsw m0, m5 ;t39
+ psubsw m5, m1, m3 ;t55
+ paddsw m1, m3 ;t56
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t40a, t55a
+ mova m3, [rsp+gprsize*2+16*10] ;tmp[7 ]
+ psubsw m6, m2, m0 ;out39
+ paddsw m2, m0 ;out24
+ psubsw m0, m3, m1 ;out56
+ paddsw m3, m1 ;out7
+ mova [rsp+gprsize*2+16*42], m6 ;out39
+ mova [rsp+gprsize*2+16*27], m2 ;out24
+ mova [rsp+gprsize*2+16*59], m0 ;out56
+ mova [rsp+gprsize*2+16*10], m3 ;out7
+ mova m0, [rsp+gprsize*2+16*26] ;tmp[23]
+ mova m1, [rsp+gprsize*2+16*11] ;tmp[8 ]
+ psubsw m2, m0, m5 ;out40
+ paddsw m0, m5 ;out23
+ psubsw m3, m1, m4 ;out55
+ paddsw m1, m4 ;out8
+ mova [rsp+gprsize*2+16*43], m2 ;out40
+ mova [rsp+gprsize*2+16*26], m0 ;out23
+ mova [rsp+gprsize*2+16*58], m3 ;out55
+ mova [rsp+gprsize*2+16*11], m1 ;out8
+
+ mova m0, [rsp+gprsize*2+16*37] ;t34
+ mova m5, [rsp+gprsize*2+16*48] ;t45
+ mova m3, [rsp+gprsize*2+16*53] ;t50
+ mova m1, [rsp+gprsize*2+16*64] ;t61
+ mova m2, [rsp+gprsize*2+16*32] ;tmp[29]
+ psubsw m4, m0, m5 ;t45a
+ paddsw m0, m5 ;t34a
+ psubsw m5, m1, m3 ;t50a
+ paddsw m1, m3 ;t61a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t45, t50
+ mova m3, [rsp+gprsize*2+16*5 ] ;tmp[2 ]
+ psubsw m6, m2, m0 ;out34
+ paddsw m2, m0 ;out29
+ psubsw m0, m3, m1 ;out61
+ paddsw m3, m1 ;out2
+ mova [rsp+gprsize*2+16*37], m6 ;out34
+ mova [rsp+gprsize*2+16*32], m2 ;out29
+ mova [rsp+gprsize*2+16*64], m0 ;out61
+ mova [rsp+gprsize*2+16*5 ], m3 ;out2
+ mova m0, [rsp+gprsize*2+16*21] ;tmp[18]
+ mova m1, [rsp+gprsize*2+16*16] ;tmp[13]
+ psubsw m2, m0, m5 ;out45
+ paddsw m0, m5 ;out18
+ psubsw m3, m1, m4 ;out50
+ paddsw m1, m4 ;out13
+ mova [rsp+gprsize*2+16*48], m2 ;out45
+ mova [rsp+gprsize*2+16*21], m0 ;out18
+ mova [rsp+gprsize*2+16*53], m3 ;out50
+ mova [rsp+gprsize*2+16*16], m1 ;out13
+
+ mova m0, [rsp+gprsize*2+16*36] ;t33a
+ mova m5, [rsp+gprsize*2+16*49] ;t46a
+ mova m3, [rsp+gprsize*2+16*52] ;t49a
+ mova m1, [rsp+gprsize*2+16*65] ;t62a
+ mova m2, [rsp+gprsize*2+16*33] ;tmp[30]
+ psubsw m4, m0, m5 ;t46
+ paddsw m0, m5 ;t33
+ psubsw m5, m1, m3 ;t49
+ paddsw m1, m3 ;t62
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t46a, t49a
+ mova m3, [rsp+gprsize*2+16*4 ] ;tmp[1 ]
+ psubsw m6, m2, m0 ;out33
+ paddsw m2, m0 ;out30
+ psubsw m0, m3, m1 ;out62
+ paddsw m3, m1 ;out1
+ mova [rsp+gprsize*2+16*36], m6 ;out33
+ mova [rsp+gprsize*2+16*33], m2 ;out30
+ mova [rsp+gprsize*2+16*65], m0 ;out62
+ mova [rsp+gprsize*2+16*4 ], m3 ;out1
+ mova m0, [rsp+gprsize*2+16*20] ;tmp[17]
+ mova m1, [rsp+gprsize*2+16*17] ;tmp[14]
+ psubsw m2, m0, m5 ;out46
+ paddsw m0, m5 ;out17
+ psubsw m3, m1, m4 ;out49
+ paddsw m1, m4 ;out14
+ mova [rsp+gprsize*2+16*49], m2 ;out46
+ mova [rsp+gprsize*2+16*20], m0 ;out17
+ mova [rsp+gprsize*2+16*52], m3 ;out49
+ mova [rsp+gprsize*2+16*17], m1 ;out14
+
+ mova m0, [rsp+gprsize*2+16*35] ;t32
+ mova m5, [rsp+gprsize*2+16*50] ;t47
+ mova m3, [rsp+gprsize*2+16*51] ;t48
+ mova m1, [rsp+gprsize*2+16*66] ;t63
+ mova m2, [rsp+gprsize*2+16*34] ;tmp[31]
+ psubsw m4, m0, m5 ;t47a
+ paddsw m0, m5 ;t32a
+ psubsw m5, m1, m3 ;t48a
+ paddsw m1, m3 ;t63a
+ ITX_MULSUB_2W 5, 4, 3, 6, 7, 2896, 2896 ;t47, t48
+ mova m3, [rsp+gprsize*2+16*3 ] ;tmp[0 ]
+ psubsw m6, m2, m0 ;out32
+ paddsw m2, m0 ;out31
+ psubsw m0, m3, m1 ;out63
+ paddsw m3, m1 ;out0
+ mova [rsp+gprsize*2+16*35], m6 ;out32
+ mova [rsp+gprsize*2+16*34], m2 ;out31
+ mova [rsp+gprsize*2+16*66], m0 ;out63
+ mova [rsp+gprsize*2+16*3 ], m3 ;out0
+ mova m0, [rsp+gprsize*2+16*19] ;tmp[16]
+ mova m1, [rsp+gprsize*2+16*18] ;tmp[15]
+ psubsw m2, m0, m5 ;out47
+ paddsw m0, m5 ;out16
+ psubsw m3, m1, m4 ;out48
+ paddsw m1, m4 ;out15
+ mova [rsp+gprsize*2+16*50], m2 ;out47
+ mova [rsp+gprsize*2+16*19], m0 ;out16
+ mova [rsp+gprsize*2+16*51], m3 ;out48
+ mova [rsp+gprsize*2+16*18], m1 ;out15
+ ret
+
+ ; 64x16 8bpc DCT_DCT entry point (per its name): eob == 0 adds a single DC-derived value to every pixel of 16 rows of 64; otherwise the full idct_64x16_internal_8bpc runs.
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly ; eob == 0 -> DC-only path
+
+ call m(idct_64x16_internal_8bpc)
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq] ; m0 = coefficients at [coeffq] * pw_2896x8 (rounded high multiply)
+ movd m2, [o(pw_8192)]
+ mov [coeffq], eobd ; eobd is 0 on this path, so this zeroes the DC coefficient
+ mov r3d, 16 ; row count for .loop
+ lea tx2q, [o(.end)] ; continuation taken by the jmp tx2q after the loop
+ ; .body expects m0 = DC value, m1 = pw_2896x8, m2 = intermediate scale, r3d = rows, tx2q = continuation.
+.body:
+ pmulhrsw m0, m2
+ movd m2, [o(pw_2048)] ;intentionally rip-relative
+ pmulhrsw m0, m1
+ pmulhrsw m0, m2
+ pshuflw m0, m0, q0000 ; replicate word 0 across the low four words...
+ punpcklwd m0, m0 ; ...then across all eight words of m0
+ pxor m7, m7 ; zero, used to widen bytes to words
+ ; Per row: widen 64 pixels to words, add the DC value, repack with unsigned saturation, store back.
+.loop:
+ mova m1, [dstq+16*0]
+ mova m3, [dstq+16*1]
+ mova m5, [dstq+16*2]
+ mova m6, [dstq+16*3]
+ punpckhbw m2, m1, m7
+ punpcklbw m1, m7
+ punpckhbw m4, m3, m7
+ punpcklbw m3, m7
+ paddw m2, m0
+ paddw m1, m0
+ paddw m4, m0
+ paddw m3, m0
+ packuswb m1, m2
+ packuswb m3, m4
+ punpckhbw m2, m5, m7
+ punpcklbw m5, m7
+ punpckhbw m4, m6, m7
+ punpcklbw m6, m7
+ paddw m2, m0
+ paddw m5, m0
+ paddw m4, m0
+ paddw m6, m0
+ packuswb m5, m2
+ packuswb m6, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m3
+ mova [dstq+16*2], m5
+ mova [dstq+16*3], m6
+ add dstq, strideq ; next row
+ dec r3d
+ jg .loop
+ jmp tx2q ; continue at the address supplied by whoever entered .body/.dconly
+
+.end:
+ RET
+
+ ; LOAD_4ROWS src, stride[, is_rect2]: load four rows from src + stride*{0..3} into m0-m3; a non-zero third argument also multiplies each row by pw_2896x8 via pmulhrsw.
+%macro LOAD_4ROWS 2-3 0 ;src, stride, is_rect2
+
+%if %3
+ mova m3, [o(pw_2896x8)] ; m3 holds the scale until its own row is loaded last
+ pmulhrsw m0, m3, [%1+%2*0]
+ pmulhrsw m1, m3, [%1+%2*1]
+ pmulhrsw m2, m3, [%1+%2*2]
+ pmulhrsw m3, [%1+%2*3] ; row 3 overwrites the scale register
+%else
+ mova m0, [%1+%2*0]
+ mova m1, [%1+%2*1]
+ mova m2, [%1+%2*2]
+ mova m3, [%1+%2*3]
+%endif
+%endmacro
+ ; LOAD_4ROWS_H src, stride: load four rows from src + stride*{0..3} into the upper registers m4-m7 (companion of LOAD_4ROWS, which fills m0-m3).
+%macro LOAD_4ROWS_H 2 ;src, stride
+ mova m4, [%1+%2*0]
+ mova m5, [%1+%2*1]
+ mova m6, [%1+%2*2]
+ mova m7, [%1+%2*3]
+%endmacro
+ ; Two-pass 64x16 inverse DCT body, entered by call: pass 1 (two iterations) stores half of its output back into the coefficient buffer and half into stack scratch; pass 2 then outputs eight 8-wide strips to dst.
+cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
+ mov r3d, 2 ; pass-1 iteration count
+ mov [rsp+gprsize*2+16*67], dstq ; save dst; dstq is reused as the scratch pointer during pass 1
+ lea dstq, [rsp+gprsize+16*68] ; scratch area written at .pass1_end4-.pass1_end7 and read back by .pass2_loop2
+
+.pass1_loop:
+ LOAD_4ROWS coeffq+32*0, 32*8
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7 ; upper four inputs are passed as zero
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+32*4, 32*8
+
+ REPX {mova x, m4}, m5, m6, m7
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ LOAD_8ROWS coeffq+32*2, 32*4
+ mova [rsp+gprsize+16*19], m0
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16
+ ; Stage in1..in31 in the stack slots read by idct_16x64 .main.
+ LOAD_8ROWS coeffq+32*1, 32*2
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+32*17, 32*2
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main
+ ; The 64 result rows in stack slots 3..66 are sent 8 at a time through the shared idct_8x8 .pass1_end1 code; tx2q names the continuation each time.
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end:
+ SAVE_8ROWS coeffq+32*0, 32
+ LOAD_8ROWS rsp+gprsize+16*11, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+32*8, 32
+ LOAD_8ROWS rsp+gprsize+16*19, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+32*16, 32
+ LOAD_8ROWS rsp+gprsize+16*27, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+32*24, 32
+ LOAD_8ROWS rsp+gprsize+16*35, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS dstq+32*0, 32 ; from here on the output goes to the stack scratch area (dstq), not the coefficient buffer
+ LOAD_8ROWS rsp+gprsize+16*43, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end5:
+ SAVE_8ROWS dstq+32*8, 32
+ LOAD_8ROWS rsp+gprsize+16*51, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end6:
+ SAVE_8ROWS dstq+32*16, 32
+ LOAD_8ROWS rsp+gprsize+16*59, 16
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end7:
+ SAVE_8ROWS dstq+32*24, 32
+
+ add coeffq, 16 ; advance both the coefficient and scratch pointers by one 16-byte slice
+ add dstq, 16
+ dec r3d
+ jg .pass1_loop
+ ; Pass 2, first half: four 8-wide strips read from the coefficient buffer, which is zeroed strip by strip at .end1.
+.pass2:
+ mov dstq, [rsp+gprsize*2+16*67] ; restore the real dst pointer
+ sub coeffq, 32 ; undo the two 16-byte advances made during pass 1
+ mov r3d, 4
+
+.pass2_loop:
+ mov [rsp+gprsize*1+16*67], r3d ; spill the loop counter
+
+ LOAD_4ROWS coeffq+16*0, 32*2
+ LOAD_4ROWS_H coeffq+16*1, 32*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+16*2, 32*2
+ LOAD_4ROWS_H coeffq+16*3, 32*2
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq ; keep the strip's top address; dstq is moved 8 rows down for the first idct_8x8 .end call
+ lea tx2q, [o(.end)]
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end1)]
+ mov dstq, r3 ; back to the top of the strip for the rows saved at slots 3..
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end1:
+ pxor m7, m7
+ REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ; clear the 16*16 bytes of coefficients just consumed
+
+ add coeffq, 16*16
+ mov r3d, [rsp+gprsize*1+16*67] ; reload the loop counter
+ mov dstq, [rsp+gprsize*2+16*67]
+ add dstq, 8 ; next strip starts 8 bytes to the right
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
+ jg .pass2_loop
+ ; Pass 2, second half: same procedure for the four strips held in the stack scratch area (no zeroing here).
+ mov r3d, 4
+ lea coeffq, [rsp+gprsize+16*68] ; coeffq now points at the scratch area filled during pass 1
+.pass2_loop2:
+ mov [rsp+gprsize*1+16*67], r3d
+
+ LOAD_4ROWS coeffq+16*0, 32*2
+ LOAD_4ROWS_H coeffq+16*1, 32*2
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+16*2, 32*2
+ LOAD_4ROWS_H coeffq+16*3, 32*2
+ call m(idct_16x8_internal_8bpc).main
+
+ mov r3, dstq
+ lea tx2q, [o(.end2)]
+ lea dstq, [dstq+strideq*8]
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end2:
+ LOAD_8ROWS rsp+gprsize+16*3, 16
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.end3)]
+ mov dstq, r3
+ jmp m(idct_8x8_internal_8bpc).end
+
+.end3:
+
+ add coeffq, 16*16
+ mov r3d, [rsp+gprsize*1+16*67]
+ mov dstq, [rsp+gprsize*2+16*67]
+ add dstq, 8
+ mov [rsp+gprsize*2+16*67], dstq
+ dec r3d
+ jg .pass2_loop2
+ ret
+
+ ; 32x64 8bpc DCT_DCT entry point (per its name): eob == 0 reuses the 32x8 DC-only body, anything else calls idct_32x64_internal_8bpc.
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob, tx2
+%if ARCH_X86_32
+ LEA r5, $$
+%endif
+ test eobd, eobd
+ jz .dconly ; eob == 0 -> DC-only path
+ call m(idct_32x64_internal_8bpc)
+.end: ; also the continuation address handed to the shared DC-only body through tx2q
+ RET
+
+.dconly:
+ movd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1, [coeffq] ; m0 = coefficients at [coeffq] * pw_2896x8 (rounded high multiply)
+ movd m2, [o(pw_16384)]
+ mov [coeffq], eobd ; eobd is 0 on this path, so this zeroes the DC coefficient
+ pmulhrsw m0, m1 ; second pw_2896x8 multiply - NOTE(review): presumably the extra scale for the 1:2 rectangular size; confirm
+ mov r3d, 64 ; NOTE(review): presumably the row count consumed by the 32x8 .body loop - confirm there
+ lea tx2q, [o(.end)]
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
+
+
+cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; internal body of the 32x64 inverse DCT; declares argument names only (0 args/GPRs/XMM). Scratch lives at [rsp+gprsize+16*N] ("slot N" below); the extra gprsize presumably skips the return address pushed by the wrapper's call
+ mov r4d, 2
+ sub eobd, 136 ; SF is set when eob < 136; the mov instructions below do not touch the flags
+ mov [rsp+gprsize*1+16*67], eobd ; keep eob-136 so every pass-1 iteration can choose .fast or .full
+ mov r3d, 4 ; eob is the 4th named argument, i.e. the same register as r3 in x86inc naming, hence the save above
+ cmovs r3d, r4d ; r3d = pass-1 iteration count: 2 when eob < 136, otherwise 4
+
+%if ARCH_X86_32
+ LEA r5, $$ ; x86-32 only: r5 = section start; presumably the base used by o() (defined outside this chunk)
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq ; remember the start of the coefficient buffer for pass 2
+
+.pass1_loop: ; each iteration processes one 16-byte-wide strip of the coefficient buffer (coeffq += 16 at the bottom)
+ LOAD_8ROWS coeffq+64*1, 64*2, 1 ; odd input rows 1,3,...,15 (row pitch is 64 bytes); the trailing 1 is a macro flag defined outside this chunk - presumably rect2 scaling, TODO confirm
+ mova [rsp+gprsize+16*19], m0 ;in1
+ mova [rsp+gprsize+16*26], m1 ;in3
+ mova [rsp+gprsize+16*23], m2 ;in5
+ mova [rsp+gprsize+16*22], m3 ;in7
+ mova [rsp+gprsize+16*21], m4 ;in9
+ mova [rsp+gprsize+16*24], m5 ;in11
+ mova [rsp+gprsize+16*25], m6 ;in13
+ mova [rsp+gprsize+16*20], m7 ;in15
+
+ mov tx2d, [rsp+gprsize*1+16*67] ; reload eob-136 (tx2 is only a temporary here)
+ test tx2d, tx2d
+ jl .fast ; negative => eob < 136: take the reduced-input path
+
+.full: ; reads all 32 input rows
+ LOAD_8ROWS coeffq+64*0, 64*4, 1 ; rows 0,4,8,...,28
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16 ; park seven result rows starting at slot 3
+ LOAD_8ROWS coeffq+64*2, 64*4, 1 ; rows 2,6,10,...,30
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0] ; fetch m7 from slot 0 before the 8-row save (presumably left there by the callee)
+ SAVE_8ROWS rsp+gprsize+16*11, 16 ; eight rows at stride 16 starting at slot 11
+
+ LOAD_8ROWS coeffq+64*17, 64*2, 1 ; odd input rows 17,19,...,31
+ mova [rsp+gprsize+16*33], m0 ;in17
+ mova [rsp+gprsize+16*28], m1 ;in19
+ mova [rsp+gprsize+16*29], m2 ;in21
+ mova [rsp+gprsize+16*32], m3 ;in23
+ mova [rsp+gprsize+16*31], m4 ;in25
+ mova [rsp+gprsize+16*30], m5 ;in27
+ mova [rsp+gprsize+16*27], m6 ;in29
+ mova [rsp+gprsize+16*34], m7 ;in31
+
+ call m(idct_8x32_internal_8bpc).main ; full variant: all sixteen odd rows were stored (slots 19-34)
+ jmp .pass1_end
+
+.fast: ; eob < 136: only rows 0-15 are loaded
+ LOAD_4ROWS coeffq, 256, 1 ; rows 0,4,8,12
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7 ; m4-m7 = 0 in place of rows 16,20,24,28 that .full would load
+ call m(idct_8x8_internal_8bpc).main
+
+ SAVE_7ROWS rsp+gprsize+16*3, 16
+ LOAD_4ROWS coeffq+128*1, 256, 1 ; rows 2,6,10,14
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7 ; m4-m7 = 0 in place of rows 18,22,26,30
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0]
+ SAVE_8ROWS rsp+gprsize+16*11, 16
+
+ call m(idct_8x32_internal_8bpc).main_fast ; reduced variant: only in1..in15 (slots 19-26) were written on this path
+
+.pass1_end: ; write-back: four groups of 8 rows, each sent through idct_8x8's .pass1_end with the resume label in tx2q
+ mova [rsp+gprsize+16*0], m7 ; spill m7 to slot 0 before entering the shared tail
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end ; shared tail defined outside this chunk; presumably finishes the group and jumps to tx2q
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*0, 64 ; group 0 -> rows 0-7 of the coefficient buffer (results overwrite the input in place)
+ LOAD_8ROWS rsp+gprsize+16*11, 16 ; next group: slots 11-18
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*8, 64 ; group 1 -> rows 8-15
+ LOAD_8ROWS rsp+gprsize+16*19, 16 ; next group: slots 19-26
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*16, 64 ; group 2 -> rows 16-23
+ LOAD_8ROWS rsp+gprsize+16*27, 16 ; last group: slots 27-34
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS coeffq+64*24, 64 ; group 3 -> rows 24-31
+
+ add coeffq, 16 ; advance to the next 16-byte strip
+ dec r3d
+ jg .pass1_loop
+
+.pass2: ; pass 2 is delegated entirely to the 16x64 code; this function has no ret of its own
+ mov coeffq, [rsp+gprsize*2+16*67] ; rewind to the start of the coefficient buffer
+ mov r3d, 4 ; iteration count handed to the shared loop (counter bookkeeping lives outside this chunk)
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4 ; slot reused: now holds dst+8, presumably the destination for the next pass-2 iteration
+ lea r4, [o(m(idct_16x64_internal_8bpc).end1)] ; r4 = address of idct_16x64's .end1; presumably the per-iteration continuation - TODO confirm
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
+
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 ; public 64x32 DCT_DCT entry: 4 args, 6 GPRs, 8 XMM regs, 16*197 bytes of stack (x86inc cglobal argument order)
+%if ARCH_X86_32
+ LEA r5, $$ ; x86-32 only: r5 = section start; presumably the base that o() addresses constants against (o() is defined outside this chunk)
+%endif
+ test eobd, eobd
+ jz .dconly ; eob == 0: take the DC-only shortcut below
+ call m(idct_64x32_internal_8bpc) ; general case: the internal routine performs the transform
+.end: ; also used as a continuation target: by .dconly below and by the 64x64 wrapper's .dconly
+ RET
+ ; DC-only path: scale the first coefficient, then reuse the 64x16 wrapper's .body
+.dconly:
+ movd m1, [o(pw_2896x8)] ; low dword of the constant; movd zeroes the remaining lanes of m1
+ pmulhrsw m0, m1, [coeffq] ; m0 = pmulhrsw(m1, first 16 bytes of coeff); only the lowest words can be nonzero
+ movd m2, [o(pw_16384)] ; not used in this block; presumably consumed by .body - TODO confirm
+ pmulhrsw m0, m1 ; second multiply by the same constant
+ mov [coeffq], eobd ; eobd is 0 on this path, so this clears the first 4 bytes of the coefficient buffer
+ mov r3d, 32 ; presumably the row count for .body (block is 32 tall) - confirm against the 64x16 code
+ lea tx2q, [o(.end)] ; tx2q = address of .end above; presumably where .body jumps when done
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
+
+
+cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; internal body of the 64x32 inverse DCT; declares argument names only (0 args/GPRs/XMM). Scratch lives at [rsp+gprsize+16*N] ("slot N" below); the extra gprsize presumably skips the return address pushed by the wrapper's call
+ mov r4d, 2
+ sub eobd, 136 ; SF is set when eob < 136; the mov instructions below do not touch the flags
+ mov [rsp+gprsize*1+16*67], eobd ; keep eob-136; it is forwarded to pass 2 at .pass2
+ mov r3d, 4 ; eob is the 4th named argument, i.e. the same register as r3 in x86inc naming, hence the save above
+ cmovs r3d, r4d ; r3d = pass-1 iteration count: 2 when eob < 136, otherwise 4
+
+%if ARCH_X86_32
+ LEA r5, $$ ; x86-32 only: r5 = section start; presumably the base used by o() (defined outside this chunk)
+%endif
+
+ mov [rsp+gprsize*2+16*67], coeffq ; saved: start of the coefficient buffer
+ mov [rsp+gprsize*3+16*67], dstq ; saved: original destination pointer
+ lea dstq, [rsp+gprsize+16*69] ; during pass 1 dstq points at a stack buffer starting at slot 69
+ mov [rsp+gprsize*4+16*67], dstq ; saved: base of that stack buffer
+
+.pass1_loop: ; each iteration processes one 16-byte-wide strip; 64 result rows are produced from input rows 0-31
+ LOAD_4ROWS coeffq+64*0, 64*8, 1 ; input rows 0,8,16,24 (row pitch is 64 bytes); the trailing 1 is a macro flag defined outside this chunk - presumably rect2 scaling, TODO confirm
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7 ; m4-m7 = 0: the upper four inputs of the 8-point stage are zero
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16 ; park seven result rows starting at slot 3
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+64*4, 64*8, 1 ; input rows 4,12,20,28
+
+ REPX {mova x, m4}, m5, m6, m7 ; copies m4 into m5-m7; m4 was zeroed before the load (assumes LOAD_4ROWS leaves m4 untouched - macro not visible here)
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0] ; fetch m7 from slot 0 before the 8-row save (presumably left there by the callee)
+ SAVE_8ROWS rsp+gprsize+16*11, 16 ; eight rows at stride 16 starting at slot 11
+
+ LOAD_8ROWS coeffq+64*2, 64*4, 1 ; input rows 2,6,10,...,30
+ mova [rsp+gprsize+16*19], m0 ; slots 19,26,23,22,21,24,25,20: the same order idct_32x64 uses for in1..in15 of idct_8x32 .main_fast
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16 ; store m0-m7 starting at slot 3
+
+ LOAD_8ROWS coeffq+64*1, 64*2, 1 ; odd input rows 1,3,...,15
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+64*17, 64*2, 1 ; odd input rows 17,19,...,31
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main ; 64-point stage; presumably consumes the odd inputs stored in slots 35-65 above
+ ; write-back: eight groups of 8 rows (slots 3-66), each sent through idct_8x8's .pass1_end with the resume label in tx2q
+ LOAD_8ROWS rsp+gprsize+16*3, 16 ; group 0: slots 3-10
+ mova [rsp+gprsize+16*0], m7 ; spill m7 to slot 0 before entering the shared tail
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end ; shared tail defined outside this chunk; presumably finishes the group and jumps to tx2q
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*0, 64 ; group 0 -> rows 0-7 of the coefficient buffer (in place)
+ LOAD_8ROWS rsp+gprsize+16*11, 16 ; group 1: slots 11-18
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*8, 64 ; group 1 -> coefficient rows 8-15
+ LOAD_8ROWS rsp+gprsize+16*19, 16 ; group 2: slots 19-26
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*16, 64 ; group 2 -> coefficient rows 16-23
+ LOAD_8ROWS rsp+gprsize+16*27, 16 ; group 3: slots 27-34
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*24, 64 ; group 3 -> coefficient rows 24-31
+ LOAD_8ROWS rsp+gprsize+16*35, 16 ; group 4: slots 35-42
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end4:
+ SAVE_8ROWS dstq+64*0, 64 ; group 4 -> rows 0-7 of the stack buffer (dstq)
+ LOAD_8ROWS rsp+gprsize+16*43, 16 ; group 5: slots 43-50
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end5:
+ SAVE_8ROWS dstq+64*8, 64 ; group 5 -> stack-buffer rows 8-15
+ LOAD_8ROWS rsp+gprsize+16*51, 16 ; group 6: slots 51-58
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end6:
+ SAVE_8ROWS dstq+64*16, 64 ; group 6 -> stack-buffer rows 16-23
+ LOAD_8ROWS rsp+gprsize+16*59, 16 ; group 7: slots 59-66
+ mova [rsp+gprsize+16*0], m7
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end
+
+.pass1_end7:
+ SAVE_8ROWS dstq+64*24, 64 ; group 7 -> stack-buffer rows 24-31
+
+ add coeffq, 16 ; next 16-byte strip of the input / in-place output
+ add dstq, 16 ; and the matching strip of the stack buffer
+ dec r3d
+ jg .pass1_loop
+
+.pass2: ; first half of pass 2: source is the stack buffer, destination starts at dst+32
+ mov coeffq, [rsp+gprsize*4+16*67] ; coeffq = base of the stack buffer filled in pass 1
+ mov dstq, [rsp+gprsize*3+16*67] ; original destination pointer
+ mov eobd, [rsp+gprsize*1+16*67] ; eob-136 saved on entry
+ lea dstq, [dstq+32] ; start 32 bytes into the row
+ mov [rsp+gprsize*1+16*35], eobd ; copy eob-136 to slot 35's gprsize*1 position; presumably where idct_32x32's pass 2 reads it - TODO confirm
+ lea tx2q, [o(.pass2_end)] ; resume label for the shared loop (presumably jumped to via tx2q)
+ mov r3d, 4 ; iteration count handed to the shared loop
+ jmp m(idct_32x32_internal_8bpc).pass2_loop
+
+.pass2_end:
+ mova [rsp+gprsize+16*0], m7 ; spill m7 to slot 0 before the shared store code
+ lea r3, [o(.pass2_end1)] ; r3 = resume label, presumably used by .end2 to come back here
+ jmp m(idct_8x32_internal_8bpc).end2
+
+.pass2_end1:
+ lea tx2q, [o(.pass2_end)] ; re-arm the resume label for the next iteration
+ add coeffq, 16*32 ; advance the source by 16*32 bytes
+ mov dstq, [rsp+gprsize*2+16*35] ; next destination pointer; this slot is written outside this chunk (presumably by the 32x32 pass-2 loop)
+ mov r3d, [rsp+gprsize*3+16*35] ; remaining iteration count; likewise stored outside this chunk
+ dec r3d
+ jg m(idct_32x32_internal_8bpc).pass2_loop
+
+.pass2_end2: ; second half of pass 2: source is the coefficient buffer, destination starts at the original dst
+ mov dstq, [rsp+gprsize*3+16*67]
+ mov coeffq, [rsp+gprsize*2+16*67]
+ lea tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)] ; from here on idct_32x32's own .pass2_end is the continuation, so that code (not this function) finishes and returns
+ mov r3d, 4
+ jmp m(idct_32x32_internal_8bpc).pass2_loop
+
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eob, tx2 ; public 64x64 DCT_DCT entry: 4 args, 6 GPRs, 8 XMM regs, 16*197 bytes of stack (x86inc cglobal argument order)
+%if ARCH_X86_32
+ LEA r5, $$ ; x86-32 only: r5 = section start; presumably the base that o() addresses constants against (o() is defined outside this chunk)
+%endif
+ test eobd, eobd
+ jz .dconly ; eob == 0: take the DC-only shortcut below
+
+ call m(idct_64x64_internal_8bpc) ; general case: the internal routine performs the transform
+ RET
+ ; DC-only path: scale the first coefficient once, then reuse the 64x16 wrapper's .body
+.dconly:
+ movd m1, [o(pw_2896x8)] ; low dword of the constant; movd zeroes the remaining lanes of m1
+ pmulhrsw m0, m1, [coeffq] ; m0 = pmulhrsw(m1, first 16 bytes of coeff); a single multiply here, unlike the 64x32 wrapper which multiplies twice
+ movd m2, [o(pw_8192)] ; not used in this block; presumably consumed by .body - TODO confirm
+ mov [coeffq], eobd ; eobd is 0 on this path, so this clears the first 4 bytes of the coefficient buffer
+ mov r3d, 64 ; presumably the row count for .body (block is 64 tall) - confirm against the 64x16 code
+ lea tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)] ; borrows the 64x32 wrapper's .end (a bare RET); both wrappers declare the same 16*197-byte stack
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
+
+cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2 ; internal body of the 64x64 inverse DCT; declares argument names only (0 args/GPRs/XMM). Scratch lives at [rsp+gprsize+16*N] ("slot N" below); the extra gprsize presumably skips the return address pushed by the wrapper's call
+ mov r5d, 4
+ mov r4d, 2
+ sub eobd, 136 ; SF is clear when eob >= 136
+ cmovns r4d, r5d ; r4d = pass-1 iteration count: 4 when eob >= 136, otherwise 2
+
+%if ARCH_X86_32
+ LEA r5, $$ ; x86-32 only: r5 = section start (r5 was only a temporary above); presumably the base used by o()
+%endif
+
+ mov [rsp+gprsize*1+16*67], eobd ; save eob-136 (not read again inside this block)
+ mov r3d, r4d ; r3d = loop counter; eob shares register r3 in x86inc naming, hence the save first
+ mov [rsp+gprsize*4+16*67], coeffq ; saved: start of the coefficient buffer
+ mov [rsp+gprsize*3+16*67], dstq ; saved: original destination pointer
+ lea dstq, [rsp+gprsize+16*69] ; during pass 1 dstq points at a stack buffer starting at slot 69
+ mov [rsp+gprsize*2+16*67], dstq ; saved: base of that stack buffer
+
+.pass1_loop: ; each iteration processes one 16-byte-wide strip; 64 result rows are produced from input rows 0-31
+ LOAD_4ROWS coeffq+64*0, 64*8 ; input rows 0,8,16,24 (row pitch is 64 bytes); no trailing flag argument here, unlike the 32x64/64x32 routines
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7 ; m4-m7 = 0: the upper four inputs of the 8-point stage are zero
+ call m(idct_8x8_internal_8bpc).main
+ SAVE_7ROWS rsp+gprsize+16*3, 16 ; park seven result rows starting at slot 3
+
+ pxor m4, m4
+ LOAD_4ROWS coeffq+64*4, 64*8 ; input rows 4,12,20,28
+
+ REPX {mova x, m4}, m5, m6, m7 ; copies m4 into m5-m7; m4 was zeroed before the load (assumes LOAD_4ROWS leaves m4 untouched - macro not visible here)
+ call m(idct_16x8_internal_8bpc).main
+ mova m7, [rsp+gprsize+16*0] ; fetch m7 from slot 0 before the 8-row save (presumably left there by the callee)
+ SAVE_8ROWS rsp+gprsize+16*11, 16 ; eight rows at stride 16 starting at slot 11
+
+ LOAD_8ROWS coeffq+64*2, 64*4 ; input rows 2,6,10,...,30
+ mova [rsp+gprsize+16*19], m0 ; slots 19,26,23,22,21,24,25,20: the same order idct_32x64 uses for in1..in15 of idct_8x32 .main_fast
+ mova [rsp+gprsize+16*26], m1
+ mova [rsp+gprsize+16*23], m2
+ mova [rsp+gprsize+16*22], m3
+ mova [rsp+gprsize+16*21], m4
+ mova [rsp+gprsize+16*24], m5
+ mova [rsp+gprsize+16*25], m6
+ mova [rsp+gprsize+16*20], m7
+
+ call m(idct_8x32_internal_8bpc).main_fast
+ SAVE_8ROWS rsp+gprsize+16*3, 16 ; store m0-m7 starting at slot 3
+
+ LOAD_8ROWS coeffq+64*1, 64*2 ; odd input rows 1,3,...,15
+ mova [rsp+gprsize+16*35], m0 ;in1
+ mova [rsp+gprsize+16*49], m1 ;in3
+ mova [rsp+gprsize+16*43], m2 ;in5
+ mova [rsp+gprsize+16*41], m3 ;in7
+ mova [rsp+gprsize+16*39], m4 ;in9
+ mova [rsp+gprsize+16*45], m5 ;in11
+ mova [rsp+gprsize+16*47], m6 ;in13
+ mova [rsp+gprsize+16*37], m7 ;in15
+
+ LOAD_8ROWS coeffq+64*17, 64*2 ; odd input rows 17,19,...,31
+ mova [rsp+gprsize+16*63], m0 ;in17
+ mova [rsp+gprsize+16*53], m1 ;in19
+ mova [rsp+gprsize+16*55], m2 ;in21
+ mova [rsp+gprsize+16*61], m3 ;in23
+ mova [rsp+gprsize+16*59], m4 ;in25
+ mova [rsp+gprsize+16*57], m5 ;in27
+ mova [rsp+gprsize+16*51], m6 ;in29
+ mova [rsp+gprsize+16*65], m7 ;in31
+
+ call m(idct_16x64_internal_8bpc).main ; 64-point stage; presumably consumes the odd inputs stored in slots 35-65 above
+ ; write-back: eight groups of 8 rows (slots 3-66), each sent through idct_8x8's .pass1_end1 with the resume label in tx2q
+ LOAD_8ROWS rsp+gprsize+16*3, 16 ; group 0: slots 3-10
+ mova [rsp+gprsize+16*0], m7 ; spill the real m7 to slot 0 ...
+ mova m7, [o(pw_8192)] ; ... and pass pw_8192 in m7; presumably the rounding multiplier expected at .pass1_end1 - TODO confirm
+ lea tx2q, [o(.pass1_end)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1 ; shared tail defined outside this chunk; presumably finishes the group and jumps to tx2q
+
+.pass1_end:
+ SAVE_8ROWS coeffq+64*0, 64 ; group 0 -> rows 0-7 of the coefficient buffer (in place)
+ LOAD_8ROWS rsp+gprsize+16*11, 16 ; group 1: slots 11-18
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end1)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end1:
+ SAVE_8ROWS coeffq+64*8, 64 ; group 1 -> coefficient rows 8-15
+ LOAD_8ROWS rsp+gprsize+16*19, 16 ; group 2: slots 19-26
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end2)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end2:
+ SAVE_8ROWS coeffq+64*16, 64 ; group 2 -> coefficient rows 16-23
+ LOAD_8ROWS rsp+gprsize+16*27, 16 ; group 3: slots 27-34
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end3)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end3:
+ SAVE_8ROWS coeffq+64*24, 64 ; group 3 -> coefficient rows 24-31
+ LOAD_8ROWS rsp+gprsize+16*35, 16 ; group 4: slots 35-42
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end4)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end4:
+ SAVE_8ROWS dstq+64*0, 64 ; group 4 -> rows 0-7 of the stack buffer (dstq)
+ LOAD_8ROWS rsp+gprsize+16*43, 16 ; group 5: slots 43-50
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end5)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end5:
+ SAVE_8ROWS dstq+64*8, 64 ; group 5 -> stack-buffer rows 8-15
+ LOAD_8ROWS rsp+gprsize+16*51, 16 ; group 6: slots 51-58
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end6)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end6:
+ SAVE_8ROWS dstq+64*16, 64 ; group 6 -> stack-buffer rows 16-23
+ LOAD_8ROWS rsp+gprsize+16*59, 16 ; group 7: slots 59-66
+ mova [rsp+gprsize+16*0], m7
+ mova m7, [o(pw_8192)]
+ lea tx2q, [o(.pass1_end7)]
+ jmp m(idct_8x8_internal_8bpc).pass1_end1
+
+.pass1_end7:
+ SAVE_8ROWS dstq+64*24, 64 ; group 7 -> stack-buffer rows 24-31
+
+ add coeffq, 16 ; next 16-byte strip of the input / in-place output
+ add dstq, 16 ; and the matching strip of the stack buffer
+ dec r3d
+ jg .pass1_loop
+
+.pass2: ; first half of pass 2: source is the stack buffer, destination starts at dst+32
+ mov dstq, [rsp+gprsize*3+16*67] ; original destination pointer
+ mov coeffq, [rsp+gprsize*2+16*67] ; coeffq = base of the stack buffer filled in pass 1
+ lea dstq, [dstq+32] ; start 32 bytes into the row
+ mov r3d, 4 ; iteration count handed to the shared loop
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4 ; slot reused: destination for the next iteration (dst+40)
+ lea r4, [o(.pass2_end)] ; r4 = resume label; presumably how idct_16x64's loop comes back here - TODO confirm
+ jmp m(idct_16x64_internal_8bpc).pass2_loop
+
+.pass2_end: ; per-iteration tail for the first half
+ LOAD_8ROWS rsp+gprsize+16*35, 16 ; reload eight rows from slots 35-42
+ lea dstq, [dstq+strideq*2] ; move the destination down two rows
+ lea r3, [rsp+16*32+gprsize] ; r3 = address of slot 32; presumably the source pointer expected by .write
+ mova [rsp+gprsize+16*0], m7 ; spill m7 to slot 0 before the call
+ call m(idct_16x64_internal_8bpc).write
+ mov dstq, [rsp+gprsize*2+16*67] ; destination saved for this next iteration
+ mov r3d, [rsp+gprsize*3+16*67] ; remaining count; presumably stored into this slot by .pass2_loop, replacing the saved dst - TODO confirm
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4 ; pre-compute the destination after that (+8 bytes)
+ lea r4, [o(.pass2_end)] ; re-arm the resume label
+
+ dec r3d
+ jg m(idct_16x64_internal_8bpc).pass2_loop
+
+.pass2_end2: ; second half of pass 2: source is the coefficient buffer, destination starts at the original dst
+ mov coeffq, [rsp+gprsize*4+16*67] ; start of the coefficient buffer
+ mov dstq, [rsp+gprsize*2+16*67] ; last pre-computed destination
+ mov r3d, 4
+ sub dstq, 72 ; back to the original dst: 32 + 8 at .pass2 plus 8 per trip through .pass2_end (assuming four trips) = 72
+ lea r4, [dstq+8]
+ mov [rsp+gprsize*2+16*67], r4 ; destination for the next iteration (dst+8)
+ lea r4, [o(m(idct_16x64_internal_8bpc).end1)] ; from here on idct_16x64's own .end1 is the continuation, so that code (not this function) finishes and returns
+ jmp m(idct_16x64_internal_8bpc).pass2_loop