author    Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 09:22:09 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-07 09:22:09 +0000
commit    43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree      620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/dav1d/src/x86/itx16_sse.asm
parent    Initial commit. (diff)
Adding upstream version 110.0.1. (tag: upstream/110.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/itx16_sse.asm')
-rw-r--r-- third_party/dav1d/src/x86/itx16_sse.asm | 8135
 1 file changed, 8135 insertions(+), 0 deletions(-)
diff --git a/third_party/dav1d/src/x86/itx16_sse.asm b/third_party/dav1d/src/x86/itx16_sse.asm
new file mode 100644
index 0000000000..3833e17c99
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_sse.asm
@@ -0,0 +1,8135 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright © 2017-2021, The rav1e contributors
+; Copyright © 2020, Nathan Egge
+; Copyright © 2021, Matthias Dressel
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+%macro COEF 1-2
+pd_%1: times 4 dd %1
+%if %0 == 2
+pd_m%1: times 4 dd -%1
+%endif
+%endmacro
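+; e.g. "COEF 601, 1" emits both pd_601 and pd_m601; with a single
+; argument only the positive constant is emitted.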
+
+COEF 201
+COEF 401
+COEF 601, 1
+COEF 799
+COEF 995
+COEF 1189, 1
+COEF 1380, 1
+COEF 1567
+COEF 1751
+COEF 1931
+COEF 2106, 1
+COEF 2276, 1
+COEF 2440
+COEF 2598, 1
+COEF 2751, 1
+COEF 2896
+COEF 3035
+COEF 3166
+COEF 3290
+COEF 3406
+COEF 3513
+COEF 3612
+COEF 3703
+COEF 3784
+COEF 3857
+COEF 3920
+COEF 3973
+COEF 4017
+COEF 4052
+COEF 4076
+COEF 4091
+
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+%if ARCH_X86_32
+pd_1: times 4 dd 1
+%endif
+pd_2: times 4 dd 2
+pw_5: times 8 dw 5
+pd_1321: times 4 dd 1321
+pd_2482: times 4 dd 2482
+pd_m3344: times 4 dd -3344
+pd_2048: times 4 dd 2048
+pw_4x2048_4xm2048: times 4 dw 2048
+ times 4 dw -2048
+pw_4xm2048_4x2048: times 4 dw -2048
+ times 4 dw 2048
+pw_2048: times 8 dw 2048
+pw_m2048: times 8 dw -2048
+pd_3803: times 4 dd 3803
+pw_4096: times 8 dw 4096
+pd_5793: times 4 dd 5793
+pd_6144: times 4 dd 6144
+pw_8192: times 8 dw 8192
+pd_10240: times 4 dd 10240
+pd_11586: times 4 dd 11586
+pw_1697x8: times 8 dw 1697*8
+pw_2896x8: times 8 dw 2896*8
+pw_1697x16: times 8 dw 1697*16
+pw_16384: times 8 dw 16384
+pixel_10bpc_max: times 8 dw 0x03ff
+
+pw_1567_3784: times 4 dw 1567, 3784
+pw_m3784_1567: times 4 dw -3784, 1567
+pw_2896_2896: times 4 dw 2896, 2896
+pw_m2896_2896: times 4 dw -2896, 2896
+
+clip_18b_min: times 4 dd -0x20000
+clip_18b_max: times 4 dd 0x1ffff
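+; intermediate coefficients are clamped to 18 signed bits between the
+; two 1-D transform passes (bitdepth + 8 for 10 bpc)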
+
+idct64_mul_16bpc:
+dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
+dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
+dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
+dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
+
+cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
+cextern iadst_4x4_internal_8bpc_ssse3.main
+cextern idct_4x8_internal_8bpc_ssse3.main
+cextern iadst_4x8_internal_8bpc_ssse3.main
+cextern idct_16x4_internal_8bpc_ssse3.main
+cextern iadst_16x4_internal_8bpc_ssse3.main
+cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
+cextern idct_8x4_internal_8bpc_ssse3.main
+cextern iadst_8x4_internal_8bpc_ssse3.main
+cextern idct_8x8_internal_8bpc_ssse3.main
+cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
+cextern iadst_8x8_internal_8bpc_ssse3.main
+cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
+cextern idct_16x8_internal_8bpc_ssse3.main
+cextern iadst_16x8_internal_8bpc_ssse3.main
+cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
+cextern idct_8x32_internal_8bpc_ssse3.main
+cextern idct_8x32_internal_8bpc_ssse3.main_fast
+cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
+cextern idct_16x64_internal_8bpc_ssse3.main
+cextern idct_16x64_internal_8bpc_ssse3.main_fast
+
+tbl_4x16_2d: db 0, 13, 29, 45
+tbl_4x16_h: db 0, 16, 32, 48
+tbl_4x16_v: db 0, 4, 8, 12
+
+tbl_8x16_2d: db 0, 14, 30, 46
+tbl_8x16_v: db 0, 4, 8, 12
+tbl_8x16_h: db 0, 32, 64, 96
+
+tbl_16x16_2d: db 0, 10, 36, 78
+tbl_16x16_v: db 0, 4, 8, 12
+tbl_16x16_h: db 0, 64, 128, 192
+
+tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203
+
+tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
+
+tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
+tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406
+
+tbl_Nx32_odd_offset: db 2*16, 2*23
+ db 2*20, 2*19
+ db 2*18, 2*21
+ db 2*22, 2*17
+ db 2*30, 2*25
+ db 2*26, 2*29
+ db 2*28, 2*27
+ db 2*24, 2*31
+
+tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
+ db 2* 8, 2*40, 2*23, 2*38
+ db 2* 1, 2*36, 2*20, 2*42
+ db 2* 9, 2*44, 2*19, 2*34
+ db 2* 2, 2*60, 2*18, 2*50
+ db 2*10, 2*52, 2*21, 2*58
+ db 2* 3, 2*56, 2*22, 2*54
+ db 2*11, 2*48, 2*17, 2*62
+
+SECTION .text
+
+%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
+%define m(x) m_suffix(x, SUFFIX)
+
+; This refers to the first function in itx_sse, i.e. the start of the text
+; section, which is needed as a base pointer for constants.
+%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3)
+
+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r6-$$+x ; PIC
+%endif
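+; e.g. [o(pd_2048)] is simply [pd_2048] on x86-64, but [r6-$$+pd_2048]
+; on x86-32, with r6 set up via "LEA r6, $$" on function entry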
+
+%macro IWHT4_1D 0
+ ; m0 = in0, m1 = in1, m2 = in2, m3 = in3
+ paddd m0, m1 ; in0 += in1
+ psubd m4, m2, m3 ; tmp0 = in2 - in3
+ psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1
+ psrad m5, 1
+ psubd m2, m5, m1 ; in2 = tmp1 - in1
+ psubd m5, m3 ; in1 = tmp1 - in3
+ psubd m0, m5 ; in0 -= in1
+ paddd m4, m2 ; in3 = tmp0 + in2
+ ; m0 = out0, m1 = in1, m2 = out2, m3 = in3
+ ; m4 = out3, m5 = out1
+%endmacro
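+; scalar equivalent of IWHT4_1D (inputs already shifted right by 2):
+;   in0 += in1; tmp0 = in2 - in3; tmp1 = (in0 - tmp0) >> 1
+;   out2 = tmp1 - in1; out1 = tmp1 - in3
+;   out0 = in0 - out1; out3 = tmp0 + out2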
+
+INIT_XMM sse2
+cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ mova m2, [cq+16*2]
+ mova m3, [cq+16*3]
+ REPX {psrad x, 2}, m0, m1, m2, m3
+ IWHT4_1D
+ punpckldq m1, m0, m5
+ punpckhdq m3, m0, m5
+ punpckldq m5, m2, m4
+ punpckhdq m2, m4
+ punpcklqdq m0, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m4, m3, m2
+ punpckhqdq m3, m2
+ mova m2, m4
+ IWHT4_1D
+ packssdw m0, m4 ; low: out3, high: out0
+ packssdw m2, m5 ; low: out2, high: out1
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ lea r2, [dstq+strideq*2]
+ movq m1, [dstq+strideq*0]
+ movhps m1, [r2 +strideq*1]
+ movq m3, [r2 +strideq*0]
+ movhps m3, [dstq+strideq*1]
+ movd m5, bdmaxm
+ pshuflw m5, m5, q0000 ; broadcast
+ punpcklqdq m5, m5 ; broadcast
+ paddsw m0, m1
+ paddsw m2, m3
+ pmaxsw m0, m4
+ pmaxsw m2, m4
+ pminsw m0, m5
+ pminsw m2, m5
+ movhps [r2 +strideq*1], m0 ; write out0
+ movhps [dstq+strideq*1], m2 ; write out1
+ movq [r2 +strideq*0], m2 ; write out2
+ movq [dstq+strideq*0], m0 ; write out3
+ RET
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+; flags: 2 = inv_dst1, 4 = inv_dst2
+; skip round/shift if rnd is not a number
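+; e.g. ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 1567, 3784 computes, with the
+; rounding constant in m7 and m4-m6 used as temporaries:
+;   m1 = (m1*1567 - m3*3784 + m7) >> 12
+;   m3 = (m1*3784 + m3*1567 + m7) >> 12  (using the original m1)
+; i.e. a Q12 rotation: 1567 ~= 4096*cos(6*pi/16), 3784 ~= 4096*sin(6*pi/16)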
+%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
+; %1 dst/src[1]
+; %2 dst/src[2]
+; %3 tmp[1]
+; %4 tmp[2]
+; %5 tmp[3]
+; %6 rnd
+; %7 coef[1]
+; %8 coef[2]
+; %9 flags
+%ifnidn %7,%8 ; optimize when coef1 == coef2
+%if %8 < 32
+ pmulld m%4, m%1, m%8
+ pmulld m%3, m%2, m%8
+%else
+ mova m%3, [o(pd_%8)]
+ pmulld m%4, m%1, m%3
+ pmulld m%3, m%2
+%endif
+%endif
+%if %7 < 32
+ pmulld m%1, m%7
+ pmulld m%2, m%7
+%else
+ mova m%5, [o(pd_%7)]
+ pmulld m%1, m%5
+ pmulld m%2, m%5
+%endif
+%if %9 & 4 ; invert dst2
+ paddd m%4, m%2
+ psubd m%2, m%6, m%4
+%else
+%ifnum %6
+%ifnidn %7,%8
+ paddd m%4, m%6
+%else
+ paddd m%1, m%6
+%endif
+%endif
+%ifnidn %7,%8
+ paddd m%2, m%4
+%else
+ mova m%3, m%2
+ paddd m%2, m%1
+%endif
+%endif
+%if %9 & 2 ; invert dst1
+ psubd m%3, m%1
+ paddd m%1, m%3, m%6
+%else
+%ifnum %6
+%ifnidn %7,%8
+ paddd m%1, m%6
+%endif
+%endif
+ psubd m%1, m%3
+%endif
+%ifnum %6
+ psrad m%2, 12
+ psrad m%1, 12
+%endif
+%endmacro
+
+%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack
+cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2
+ %define %%p1 m(i%1_%4_internal_16bpc)
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+%if has_epilogue
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jz %%end
+%endif
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+ add eobd, %3
+%endif
+%else
+ lea r5, [o(%3)]
+%endif
+ call %%p1
+ RET
+%%end:
+%else
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)]
+%ifnum %3
+%if %3
+ add eobd, %3
+%endif
+%else
+ lea r5, [o(%3)]
+%endif
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 0, 4x4
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 4
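+ ; 2896 == 181*16, so "imul 181; add 128; sar 8" equals
+ ; (x*2896 + 2048) >> 12, i.e. a Q12 multiply by 1/sqrt(2)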
+.dconly:
+ add r5d, 128
+ sar r5d, 8
+.dconly2:
+ imul r5d, 2896
+ mova m2, [o(pixel_10bpc_max)]
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ pxor m3, m3
+ punpcklqdq m0, m0
+.dconly_loop:
+ movq m1, [dstq+strideq*0]
+ movhps m1, [dstq+strideq*1]
+ paddw m1, m0
+ pminsw m1, m2
+ pmaxsw m1, m3
+ movq [dstq+strideq*0], m1
+ movhps [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
+ ; butterfly rotation
+ ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0
+ ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3
+ ; Hadamard rotation
+ psubd m%5, m%1, m%2
+ paddd m%2, m%1
+ paddd m%1, m%3, m%4
+ psubd m%3, m%4
+ ; %1 (src1) = out0
+ ; %2 (src2) = out1
+ ; %3 (src3) = out3
+ ; %5 (tmp1) = out2
+%endmacro
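+; scalar equivalent of IDCT4_1D (Q12 rotations, rounding omitted):
+;   t0 = (in0 + in2) * 2896 >> 12
+;   t1 = (in0 - in2) * 2896 >> 12
+;   t2 = (in1*1567 - in3*3784) >> 12
+;   t3 = (in1*3784 + in3*1567) >> 12
+;   out0 = t0 + t3, out1 = t1 + t2, out2 = t1 - t2, out3 = t0 - t3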
+
+INIT_XMM sse4
+
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, identity
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+
+cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ mova m2, [cq+16*2]
+ mova m3, [cq+16*3]
+ mova m5, [o(pd_2048)]
+ call .pass1_main
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ ; transpose
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass1_main:
+ IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5
+ ret
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m5 = pd_2048
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ pmaddwd m4, m2, [o(pw_m3784_1567)]
+ pmaddwd m2, [o(pw_1567_3784)]
+ pmaddwd m0, m1, [o(pw_m2896_2896)]
+ pmaddwd m1, [o(pw_2896_2896)]
+ REPX {paddd x, m5}, m4, m2, m0, m1
+ packssdw m5, m5 ; pw_2048
+ REPX {psrad x, 12}, m4, m2, m0, m1
+ packssdw m2, m4 ; t3 t2
+ packssdw m1, m0 ; t0 t1
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ movq m3, [r5 +strideq*1]
+ movhps m3, [r5 +strideq*0]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movhps [r5 +strideq*0], m1
+ movq [r5 +strideq*1], m1
+ RET
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call .main
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ ; transpose
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
+.end:
+ mova m4, [o(pw_2048)]
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ movq m3, [r5 +strideq*0]
+ movhps m3, [r5 +strideq*1]
+ mova m5, [o(pixel_10bpc_max)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [r5 +strideq*0], m1
+ movhps [r5 +strideq*1], m1
+ RET
+ALIGN function_align
+.main:
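+ ; 4-point iadst, shared with the 4x8/4x16 functions; returns out0-3
+ ; in m0/m2/m1/m4 as 32-bit values, packed to words by the caller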
+ mova m1, [cq+16*2]
+ mova m3, [cq+16*3]
+ mova m5, [cq+16*0]
+ lea r3, [cq+16*1]
+.main2:
+ mova m0, [o(pd_1321)] ; SINPI_1_9
+ mova m2, [o(pd_2482)] ; SINPI_2_9
+ mova m6, [o(pd_3803)] ; SINPI_4_9
+ pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2]
+ pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3]
+ pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2]
+ pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0]
+ psubd m1, m3 ; T[2] - T[3]
+ pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3]
+ pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0]
+ paddd m0, m6 ; s[0] += s[3]
+ paddd m0, m3 ; s[0] += s[5]
+ mova m3, [o(pd_m3344)] ; -SINPI_3_9
+ psubd m2, m4 ; s[1] -= s[4]
+ psubd m2, m7 ; s[1] -= s[6]
+ psubd m1, m5 ; -b7 = (T[2] - T[3]) - T[0]
+ pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7
+ pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048
+ paddd m4, m0, m2 ; x[3] = s[0] + s[1]
+ psubd m2, m3 ; x[1] = s[1] + s[3]
+ psubd m0, m3 ; x[0] = s[0] + s[3]
+ paddd m4, m3 ; x[3] -= s[3]
+ paddd m2, m5 ; x[1] + 2048
+ REPX {psrad x, 12}, m0, m2, m1, m4
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call m(iadst_4x4_internal_16bpc).main
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ ; transpose
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main
+ mova m4, [o(pw_2048)]
+ movq m3, [dstq+strideq*1]
+ movhps m3, [dstq+strideq*0]
+ lea r5, [dstq+strideq*2]
+ movq m2, [r5 +strideq*1]
+ movhps m2, [r5 +strideq*0]
+ mova m5, [o(pixel_10bpc_max)]
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ pxor m4, m4
+ mova [cq+16*0], m4
+ mova [cq+16*1], m4
+ mova [cq+16*2], m4
+ mova [cq+16*3], m4
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m4
+ pmaxsw m1, m4
+ pminsw m0, m5
+ pminsw m1, m5
+ movhps [dstq+strideq*0], m1
+ movq [dstq+strideq*1], m1
+ movhps [r5 +strideq*0], m0
+ movq [r5 +strideq*1], m0
+ RET
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
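+ ; identity4 scales by 5793/4096 ~= sqrt(2) in Q12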
+ mova m3, [o(pd_5793)]
+ pmulld m0, m3, [cq+16*0]
+ pmulld m1, m3, [cq+16*1]
+ pmulld m2, m3, [cq+16*2]
+ pmulld m3, [cq+16*3]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ ; transpose
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ ; m0 = out0 out1
+ ; m1 = out2 out3
+ ; m5 = pd_2048
+ jmp tx2q
+.pass2:
+ ; m0 = in0 in1
+ ; m1 = in2 in3
+ ; m5 = pd_2048
+ mova m4, [o(pw_1697x8)]
+ movq m2, [dstq+strideq*0]
+ movhps m2, [dstq+strideq*1]
+ lea r5, [dstq+strideq*2]
+ pmulhrsw m3, m4, m0
+ pmulhrsw m4, m1
+ paddsw m0, m3
+ paddsw m1, m4
+ movq m3, [r5 +strideq*0]
+ movhps m3, [r5 +strideq*1]
+ mova m4, [o(pixel_10bpc_max)]
+ packssdw m5, m5 ; pw_2048
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ pxor m5, m5
+ mova [cq+16*0], m5
+ mova [cq+16*1], m5
+ mova [cq+16*2], m5
+ mova [cq+16*3], m5
+ paddw m0, m2
+ paddw m1, m3
+ pmaxsw m0, m5
+ pmaxsw m1, m5
+ pminsw m0, m4
+ pminsw m1, m4
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [r5 +strideq*0], m1
+ movhps [r5 +strideq*1], m1
+ RET
+
+%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
+ INV_TXFM_FN %1, %2, %3, 4x8
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity, 9
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+ mova m5, [o(pd_2048)]
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 13
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 13
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+.loop_pass1:
+ mova m3, [o(pd_2896)]
+ pmulld m0, m3, [cq+32*0+r5]
+ pmulld m1, m3, [cq+32*1+r5]
+ pmulld m2, m3, [cq+32*2+r5]
+ pmulld m3, [cq+32*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ call m(idct_4x4_internal_16bpc).pass1_main
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*0+16], m0
+ mova [cq+32*1+16], m4
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova m2, [cq+32*0+16]
+ mova m6, [cq+32*1+16]
+ punpckhwd m4, m2, m6
+ punpcklwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_4x8_internal_8bpc, _ssse3).main
+ ; m0-3 is now out0/1,3/2,4/5,7/6
+ mova m4, [o(pw_2048)]
+ shufps m1, m1, q1032
+ shufps m3, m3, q1032
+.end:
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ pxor m4, m4
+ REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+ mova m7, [o(pixel_10bpc_max)]
+ lea r2, [strideq*3]
+ movq m5, [dstq+strideq*0]
+ movq m6, [dstq+strideq*2]
+ movhps m5, [dstq+strideq*1]
+ movhps m6, [dstq+r2]
+ lea r4, [dstq+strideq*4]
+ paddw m0, m5
+ paddw m1, m6
+ movq m5, [r4+strideq*0]
+ movq m6, [r4+strideq*2]
+ movhps m5, [r4+strideq*1]
+ movhps m6, [r4+r2]
+ paddw m2, m5
+ paddw m3, m6
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r2 ], m1
+ movq [r4 +strideq*0], m2
+ movhps [r4 +strideq*1], m2
+ movq [r4 +strideq*2], m3
+ movhps [r4 +r2 ], m3
+ RET
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity, 9
+
+cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call .pass1_main
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ mova m2, [cq+32*2+16]
+ mova m6, [cq+32*3+16]
+ punpckhwd m4, m2, m6
+ punpcklwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass1_main:
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 13
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 13
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+ lea r3, [cq+32*1+16]
+.loop_pass1:
+ mova m0, [o(pd_2048)]
+ mova m3, [o(pd_2896)]
+ pmulld m5, m3, [cq+32*0+r5]
+ pmulld m2, m3, [cq+32*1+r5]
+ pmulld m1, m3, [cq+32*2+r5]
+ pmulld m3, [cq+32*3+r5]
+ REPX {paddd x, m0}, m5, m2, m1, m3
+ REPX {psrad x, 12}, m5, m2, m1, m3
+ mova [r3], m2
+ call m(iadst_4x4_internal_16bpc).main2
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*2+16], m0
+ mova [cq+32*3+16], m1
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ ret
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
+ mova m4, [o(pw_4x2048_4xm2048)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity, 9
+
+cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ call m(iadst_4x8_internal_16bpc).pass1_main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ mova m6, [cq+32*2+16]
+ mova m2, [cq+32*3+16]
+ punpcklwd m4, m2, m6
+ punpckhwd m2, m6
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+ shufps m0, m0, q1032
+ shufps m1, m1, q1032
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main
+ mova m4, m0
+ mova m5, m1
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ pshufd m2, m5, q1032
+ pshufd m3, m4, q1032
+ mova m4, [o(pw_4xm2048_4x2048)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity, 3
+
+cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+ mova m5, [o(pd_2048)]
+ mova m4, [o(pd_2896)]
+ mova m6, [o(pd_5793)]
+ ; clear m7 in case we skip the bottom square
+ pxor m7, m7
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 16
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 16
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+.loop_pass1:
+ pmulld m0, m4, [cq+32*0+r5]
+ pmulld m1, m4, [cq+32*1+r5]
+ pmulld m2, m4, [cq+32*2+r5]
+ pmulld m3, m4, [cq+32*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ REPX {pmulld x, m6}, m0, m1, m2, m3
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 12}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32*0+16], m0
+ mova m7, m2
+ xor r5d, r5d
+ jmp .loop_pass1
+.end_pass1:
+ punpckhwd m4, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m4
+ punpcklwd m0, m4
+ mova m2, [cq+32*0+16]
+ punpckhwd m4, m2, m7
+ punpcklwd m2, m7
+ punpckhwd m3, m2, m4
+ punpcklwd m2, m4
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.pass2:
+ mova m4, [o(pw_4096)]
+ jmp m(idct_4x8_internal_16bpc).end
+
+%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+ INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity, v
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
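+ ; walk the eob threshold table (r5, from tbl_4x16_* above) backwards
+ ; to find the last 4x4 sub-block containing nonzero coefficients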
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+ mova m5, [o(pd_2048)]
+.loop_pass1:
+ mova m0, [cq+64*0+r5]
+ mova m1, [cq+64*1+r5]
+ mova m2, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(idct_4x4_internal_16bpc).pass1_main
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m1, m4, m2
+ REPX {psrad x, 1}, m0, m1, m4, m2
+ packssdw m0, m1 ; out0 out1
+ packssdw m4, m2 ; out2 out3
+ punpckhwd m2, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.end_pass1:
+ mova m2, [cq+64*0+16]
+ mova m3, [cq+64*1+16]
+ mova m4, [cq+64*0+32]
+ mova m5, [cq+64*1+32]
+ mova m6, [cq+64*0+48]
+ mova m7, [cq+64*1+48]
+ ; m0-7 = packed & transposed output
+ jmp tx2q
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_16x4_internal_8bpc, _ssse3).main
+ ; m0-6 now hold out0-13 (the odd registers have their two halves swapped)
+ ; [cq+16*7] holds out15/14
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [cq+16*7]
+ REPX {shufps x, x, q1032}, m1, m3, m5, m7
+ mova [cq+16*0], m4
+ mova [cq+16*1], m5
+ mova [cq+16*2], m6
+ mova [cq+16*3], m7
+.end:
+ pxor m4, m4
+ REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ mova m7, [o(pixel_10bpc_max)]
+ mov r5d, 2
+ lea r3, [strideq*3]
+.loop:
+ movq m5, [dstq+strideq*0]
+ movq m6, [dstq+strideq*2]
+ movhps m5, [dstq+strideq*1]
+ movhps m6, [dstq+r3]
+ lea r4, [dstq+strideq*4]
+ paddw m0, m5
+ paddw m1, m6
+ movq m5, [r4+strideq*0]
+ movq m6, [r4+strideq*2]
+ movhps m5, [r4+strideq*1]
+ movhps m6, [r4+r3]
+ paddw m2, m5
+ paddw m3, m6
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ REPX {pmaxsw x, m4}, m0, m1, m2, m3
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m1
+ movhps [dstq+r3 ], m1
+ movq [r4 +strideq*0], m2
+ movhps [r4 +strideq*1], m2
+ movq [r4 +strideq*2], m3
+ movhps [r4 +r3 ], m3
+ dec r5d
+ jz .end2
+ lea dstq, [dstq+strideq*8]
+ mova m0, [cq+0*16]
+ mova m1, [cq+1*16]
+ mova m2, [cq+2*16]
+ mova m3, [cq+3*16]
+ REPX {mova [cq+x*16], m4}, 0, 1, 2, 3
+ jmp .loop
+.end2:
+ RET
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity, v
+
+cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r6+r5]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+.loop_pass1:
+ mova m5, [cq+64*0+r5]
+ lea r3, [cq+64*1+r5]
+ mova m1, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(iadst_4x4_internal_16bpc).main2
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m2, m1, m4
+ REPX {psrad x, 1}, m0, m2, m1, m4
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m4 ; out2 out3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
+ ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8
+ ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13
+ mova m1, [o(pw_4x2048_4xm2048)]
+ REPX {pmulhrsw x, m1}, m7, m2, m0
+ pshufd m6, m1, q1032 ; 4x-2048,4x2048
+ pmulhrsw m1, [cq+16*7]
+ REPX {pmulhrsw x, m6}, m5, m4, m3
+ pmulhrsw m6, [cq+16*6]
+ ; m7/5/2/4 = out4/11,5/10,6/9,7/8
+ ; m0/3/6/1 = out0/15,3/12,1/14,2/13
+ ; output should be in m0-3 for out0-7, and in cq+0-3*16 for out8-15
+ movhps [cq+0*8], m4
+ movhps [cq+1*8], m2
+ movhps [cq+2*8], m5
+ movhps [cq+3*8], m7
+ movhps [cq+4*8], m3
+ movhps [cq+5*8], m1
+ movhps [cq+6*8], m6
+ movhps [cq+7*8], m0
+ punpcklqdq m0, m6
+ punpcklqdq m1, m3
+ punpcklqdq m3, m2, m4
+ punpcklqdq m2, m7, m5
+ jmp m(idct_4x16_internal_16bpc).end
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity, v
+
+cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+.loop_pass1:
+ mova m5, [cq+64*0+r5]
+ lea r3, [cq+64*1+r5]
+ mova m1, [cq+64*2+r5]
+ mova m3, [cq+64*3+r5]
+ call m(iadst_4x4_internal_16bpc).main2
+ pcmpeqd m3, m3
+ REPX {psubd x, m3}, m0, m2, m1, m4
+ REPX {psrad x, 1}, m0, m2, m1, m4
+ packssdw m0, m2 ; out3 out2
+ packssdw m1, m4 ; out1 out0
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end
+ ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7
+ ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2
+ mova m1, [o(pw_4x2048_4xm2048)]
+ REPX {pmulhrsw x, m1}, m7, m2, m0
+ pshufd m6, m1, q1032 ; 4x-2048,4x2048
+ pmulhrsw m1, [cq+16*7]
+ REPX {pmulhrsw x, m6}, m5, m4, m3
+ pmulhrsw m6, [cq+16*6]
+ ; m7/5/2/4 = out11/4,10/5,9/6,8/7
+ ; m0/3/6/1 = out15/0,12/3,14/1,13/2
+ ; output should be in m0-3 for out0-7, and in cq+0-3*16 for out8-15
+ movq [cq+0*8], m4
+ movq [cq+1*8], m2
+ movq [cq+2*8], m5
+ movq [cq+3*8], m7
+ movq [cq+4*8], m3
+ movq [cq+5*8], m1
+ movq [cq+6*8], m6
+ movq [cq+7*8], m0
+ punpckhqdq m0, m6
+ punpckhqdq m1, m3
+ punpckhqdq m3, m2, m4
+ punpckhqdq m2, m7, m5
+ jmp m(idct_4x16_internal_16bpc).end
+
+INV_TXFM_4X16_FN identity, dct, h
+INV_TXFM_4X16_FN identity, adst, h
+INV_TXFM_4X16_FN identity, flipadst, h
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%undef cmp
+%if ARCH_X86_32
+ mov r5m, r6d
+%endif
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, r5m
+%endif
+ mova m5, [o(pd_6144)]
+ mova m4, [o(pd_5793)]
+.loop_pass1:
+ pmulld m0, m4, [cq+64*0+r5]
+ pmulld m1, m4, [cq+64*1+r5]
+ pmulld m2, m4, [cq+64*2+r5]
+ pmulld m3, m4, [cq+64*3+r5]
+ REPX {paddd x, m5}, m0, m1, m2, m3
+ REPX {psrad x, 13}, m0, m1, m2, m3
+ packssdw m0, m1
+ packssdw m2, m3
+ punpckhwd m3, m0, m2
+ punpcklwd m0, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ test r5d, r5d
+ jz m(idct_4x16_internal_16bpc).end_pass1
+ mova [cq+64*0+r5], m0
+ mova [cq+64*1+r5], m1
+ sub r5d, 16
+ jmp .loop_pass1
+.pass2:
+ mova [cq+16*4], m0
+ mova [cq+16*5], m1
+ mova [cq+16*6], m2
+ mova [cq+16*7], m7
+ mova m0, [o(pw_1697x16)]
+ mova m7, [o(pw_2048)]
+ pmulhrsw m1, m0, m4
+ pmulhrsw m2, m0, m5
+ REPX {paddsw x, x}, m4, m5
+ paddsw m4, m1
+ paddsw m5, m2
+ REPX {pmulhrsw x, m7}, m4, m5
+ mova [cq+16*0], m4
+ mova [cq+16*1], m5
+ mova m4, [cq+16*7]
+ pmulhrsw m1, m0, m6
+ pmulhrsw m2, m0, m4
+ REPX {paddsw x, x}, m6, m4
+ paddsw m6, m1
+ paddsw m4, m2
+ REPX {pmulhrsw x, m7}, m6, m4
+ mova [cq+16*2], m6
+ mova [cq+16*3], m4
+ mova m4, [cq+16*4]
+ mova m1, [cq+16*5]
+ mova m2, [cq+16*6]
+ pmulhrsw m5, m0, m2
+ pmulhrsw m6, m0, m3
+ REPX {paddsw x, x}, m2, m3
+ paddsw m2, m5
+ paddsw m3, m6
+ pmulhrsw m6, m0, m1
+ pmulhrsw m0, m4
+ REPX {paddsw x, x}, m1, m4
+ paddsw m1, m6
+ paddsw m0, m4
+ REPX {pmulhrsw x, m7}, m2, m3, m1, m0
+ jmp m(idct_4x16_internal_16bpc).end
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, 0, 8x4, 15
+%else
+ INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+ lea r2, [strideq*3]
+ mova m1, [dstq+strideq*0]
+ mova m2, [dstq+strideq*1]
+ mova m3, [dstq+strideq*2]
+ mova m4, [dstq+r2]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*2], m3
+ mova [dstq+r2 ], m4
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, identity
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+
+cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+.pass1_entry:
+%if ARCH_X86_32
+ lea r3, [rsp+gprsize]
+%else
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+0*16]
+ mova m1, [cq+1*16]
+ mova m2, [cq+2*16]
+ mova m3, [cq+3*16]
+ mova m4, [cq+4*16]
+ mova m5, [cq+5*16]
+ mova m6, [cq+6*16]
+ mova m7, [cq+7*16]
+ call .rect2_mul
+ call r5
+ call .transpose4x8packed
+ ; m0-3 = packed & transposed output
+ jmp tx2q
+.transpose4x8packed:
+ ; transpose
+ punpcklwd m1, m2, m6
+ punpckhwd m2, m6
+ punpckhwd m6, m0, m4
+ punpcklwd m0, m4
+
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m4, m6, m2
+ punpcklwd m6, m2
+
+ punpcklwd m2, m3, m4
+ punpckhwd m3, m4
+ punpckhwd m1, m0, m6
+ punpcklwd m0, m6
+ ret
+.main:
+ call .main_pass1
+ call .round
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
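+ ; scale by 2896/4096 (~1/sqrt(2)): the extra normalization AV1 applies
+ ; to rectangular (2:1/1:2) transform sizes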
+.rect2_mul:
+%if ARCH_X86_64
+ REPX {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+ mova [r3], m7
+ mova m7, [o(pd_2896)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_2048)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+%endif
+ REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+ ret
+%if ARCH_X86_64
+.main_pass1_fast:
+ pmulld m5, m3, [o(pd_m2276)]
+ pmulld m3, [o(pd_3406)]
+ pmulld m7, m1, [o(pd_4017)]
+ pmulld m1, [o(pd_799)]
+ pmulld m6, m2, [o(pd_3784)]
+ pmulld m2, [o(pd_1567)]
+ pmulld m0, m14
+ pxor m4, m4
+ jmp .main_pass1_fast2
+.main_pass1:
+ ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a
+ ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3
+ REPX {pmulld x, m14}, m0, m4
+.main_pass1_fast2:
+ REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7
+ REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7
+ paddd m8, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m9, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ REPX {pmaxsd x, m12}, m1, m8, m7, m9
+ REPX {pminsd x, m13}, m1, m8, m7, m9
+ REPX {pmulld x, m14}, m7, m1
+ paddd m0, m11
+ paddd m7, m11
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+ REPX {pmaxsd x, m12}, m0, m6, m5, m3
+ REPX {pminsd x, m13}, m0, m6, m5, m3
+ ret
+.round:
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ psubd m7, m0, m9 ; out7
+ paddd m0, m9 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ psubd m4, m3, m8 ; out4
+ paddd m3, m8 ; out3
+%else
+.main_pass1_fast:
+ pmulld m5, m3, [o(pd_m2276)]
+ pmulld m3, [o(pd_3406)]
+ pmulld m7, m1, [o(pd_4017)]
+ pmulld m1, [o(pd_799)]
+ pmulld m6, m2, [o(pd_3784)]
+ pmulld m2, [o(pd_1567)]
+ mova m4, [o(pd_2048)]
+ mova [r3+0*16], m2
+ REPX {paddd x, m4}, m5, m3, m7, m1
+ REPX {psrad x, 12}, m5, m3, m7, m1
+ paddd m2, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ pmulld m5, m0, [o(pd_2896)]
+ mova m0, m4
+ paddd m4, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3 }, m1, m2, m7, m4
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3 }, m1, m2, m7, m4
+ mova [r3+3*16], m2
+ mova [r3+1*16], m4
+ pxor m4, m4
+ mova m2, [r3+0*16]
+ mova m3, [o(pd_2896)]
+ jmp .main_pass1_fast2
+.main_pass1:
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m4
+ mova [r3+3*16], m6
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a
+ ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a
+ paddd m2, m1, m5 ; t4
+ psubd m1, m5 ; t5a
+ paddd m4, m7, m3 ; t7
+ psubd m7, m3 ; t6a
+ mova m6, [o(clip_18b_min)]
+ REPX {pmaxsd x, m6 }, m1, m2, m7, m4
+ mova m6, [o(clip_18b_max)]
+ REPX {pminsd x, m6 }, m1, m2, m7, m4
+ mova m6, [r3+3*16]
+ mova [r3+3*16], m2
+ mova m2, [r3+1*16]
+ mova [r3+1*16], m4
+
+ ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3
+ mova m3, [o(pd_2896)]
+ mova m5, [r3+0*16]
+ mova m4, [r3+2*16]
+ REPX {pmulld x, m3 }, m5, m4
+.main_pass1_fast2:
+ REPX {paddd x, m0 }, m2, m6
+ REPX {psrad x, 12 }, m2, m6
+ REPX {pmulld x, m3 }, m7, m1
+ paddd m7, m0
+ paddd m0, m5
+
+ psubd m5, m0, m4
+ paddd m0, m4
+ psubd m4, m7, m1
+ paddd m7, m1
+ REPX {psrad x, 12 }, m5, m0, m4, m7
+ psubd m3, m0, m6 ; dct4 out3
+ paddd m0, m6 ; dct4 out0
+ paddd m6, m5, m2 ; dct4 out1
+ psubd m5, m2 ; dct4 out2
+
+ mova m1, [o(clip_18b_min)]
+ REPX {pmaxsd x, m1 }, m0, m6, m5, m3
+ mova m1, [o(clip_18b_max)]
+ REPX {pminsd x, m1 }, m0, m6, m5, m3
+ ret
+.round:
+ paddd m1, m6, m7 ; out1
+ psubd m6, m7 ; out6
+ mova [r3+0*16], m6
+ mova m6, [r3+1*16]
+ psubd m7, m0, m6 ; out7
+ paddd m0, m6 ; out0
+ paddd m2, m5, m4 ; out2
+ psubd m5, m4 ; out5
+ mova m6, [r3+3*16]
+ psubd m4, m3, m6 ; out4
+ paddd m3, m6 ; out3
+ mova m6, [r3+0*16]
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x4_internal_8bpc, _ssse3).main
+.end:
+ lea r3, [strideq*3]
+ call .round2_and_write_8x4
+ REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ RET
+.round2_and_write_8x4:
+ pxor m6, m6
+ mova m5, [o(pixel_10bpc_max)]
+ mova m4, [o(pw_2048)]
+.round1_and_write_8x4:
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+.write_8x4:
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3]
+ REPX {pminsw x, m5}, m0, m1, m2, m3
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ ret
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ call .main_pass1
+ call .round
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
+.main_pass1:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a
+ psubd m8, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m5, m1 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7
+ REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7
+ ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a
+ psubd m9, m6, m8 ; t7
+ paddd m6, m8 ; out6
+ mova m8, [o(pd_2896)]
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m2 ; t2
+ paddd m0, m2 ; out0
+ psubd m2, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ REPX {pmaxsd x, m12}, m5, m3, m2, m9
+ REPX {pminsd x, m13}, m5, m3, m2, m9
+ REPX {pmulld x, m14}, m5, m3, m2, m9
+ psubd m4, m5, m3 ; (t2 - t3) * 2896
+ paddd m3, m5 ; (t2 + t3) * 2896
+ psubd m5, m2, m9 ; (t6 - t7) * 2896
+ paddd m2, m9 ; (t6 + t7) * 2896
+ ret
+.round:
+ ; m0 = out0, m1 = -out1, m6 = out6, m7 = -out7
+ pcmpeqd m8, m8
+ REPX {pxor x, m8 }, m1, m7, m3, m5
+ REPX {psubd x, m8 }, m1, m7
+ REPX {paddd x, m11}, m2, m3, m4, m5
+ REPX {psrad x, 12 }, m2, m3, m4, m5
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m5, [o(pd_2048)]
+
+ ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a
+ ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a
+ mova m2, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova m4, [r3+2*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m6
+ mova m1, [r3+3*16]
+ mova [r3+3*16], m7
+ ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a
+ ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a
+ mova m0, [r3+0*16]
+ mova m6, [r3+2*16]
+ psubd m7, m2, m6 ; t6
+ paddd m2, m6 ; t2
+ psubd m6, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ mova [r3+0*16], m7
+ mova m5, [r3+1*16]
+ mova m7, [r3+3*16]
+ psubd m4, m1, m5 ; t7
+ paddd m5, m1 ; t3
+ psubd m1, m7, m3 ; t5
+ paddd m7, m3 ; t1
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7
+ mova [r3+1*16], m7
+ mova m7, [o(clip_18b_max)]
+ pmaxsd m3, [r3+0*16]
+ REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5
+ pminsd m7, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a
+ ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a
+ mova m5, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m2, m6, m3 ; t7
+ paddd m6, m3 ; out6
+ mova [r3+3*16], m6
+ mova m0, [r3+0*16]
+ mova m6, [r3+1*16]
+ psubd m3, m7, m5 ; t3
+ paddd m7, m5 ; -out7
+ psubd m5, m0, m6 ; t2
+ paddd m0, m6 ; out0
+ psubd m6, m1, m4 ; t6
+ paddd m1, m4 ; -out1
+ mova m4, [o(clip_18b_min)]
+ REPX {pmaxsd x, m4 }, m5, m3, m6, m2
+ mova m4, [o(clip_18b_max)]
+ REPX {pminsd x, m4 }, m5, m3, m6, m2
+ mova m4, [o(pd_2896)]
+ REPX {pmulld x, m4 }, m5, m3, m6, m2
+ psubd m4, m5, m3 ; (t2 - t3) * 2896
+ paddd m3, m5 ; (t2 + t3) * 2896
+ psubd m5, m6, m2 ; (t6 - t7) * 2896
+ paddd m2, m6 ; (t6 + t7) * 2896
+ ret
+.round:
+ mova [r3+2*16], m0
+
+ pcmpeqd m0, m0
+ mova m6, [o(pd_2048)]
+ REPX {pxor x, m0 }, m1, m7, m3, m5
+ REPX {psubd x, m0 }, m1, m7
+ REPX {paddd x, m6 }, m2, m3, m4, m5
+ REPX {psrad x, 12 }, m2, m3, m4, m5
+
+ mova m6, [r3+3*16]
+ mova m0, [r3+2*16]
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
+ jmp m(idct_8x4_internal_16bpc).end
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call m(iadst_8x4_internal_16bpc).round
+ packssdw m7, m6
+ packssdw m5, m4
+ packssdw m3, m2
+ packssdw m1, m0
+ mova m0, m7
+ mova m2, m5
+ mova m4, m3
+ mova m6, m1
+ ret
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main
+ lea r3, [strideq*3]
+ add dstq, r3
+ neg strideq
+ jmp m(idct_8x4_internal_16bpc).end
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r5, [o(.main)]
+ jmp m(idct_8x4_internal_16bpc).pass1_entry
+.main:
+ REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ ret
+.pass2:
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_8x4_internal_16bpc).end
+
+%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16
+%else
+ INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 2
+.end:
+ add r5d, 384
+ sar r5d, 9
+.end2:
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+ lea r2, [strideq*3]
+.loop:
+ mova m1, [dstq+strideq*0]
+ mova m2, [dstq+strideq*1]
+ mova m3, [dstq+strideq*2]
+ mova m4, [dstq+r2]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+strideq*0], m1
+ mova [dstq+strideq*1], m2
+ mova [dstq+strideq*2], m3
+ mova [dstq+r2 ], m4
+ lea dstq, [dstq+strideq*4]
+ dec r3d
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity, 6
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1
+ mov [rsp+4*16+1*gprsize], r1
+%else
+ DECLARE_REG_TMP 6
+%endif
+ lea t0, [o(.pass1_main)]
+
+.pass1_full:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+%if ARCH_X86_32
+ lea r3, [rsp+gprsize]
+%endif
+.loop_pass1:
+ mova m0, [cq+0*32+r5]
+ mova m1, [cq+1*32+r5]
+ mova m2, [cq+2*32+r5]
+ mova m3, [cq+3*32+r5]
+ mova m4, [cq+4*32+r5]
+ mova m5, [cq+5*32+r5]
+ mova m6, [cq+6*32+r5]
+ mova m7, [cq+7*32+r5]
+ call t0
+
+ test r5d, r5d
+ jz .end_pass1
+
+ mova [cq+0*32+16], m0
+ mova [cq+1*32+16], m1
+ mova [cq+2*32+16], m2
+ mova [cq+3*32+16], m3
+
+ sub r5d, 16
+ jmp .loop_pass1
+.end_pass1:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ mov r1, [rsp+4*16+1*gprsize]
+%endif
+ jmp tx2q
+.pass1_main:
+ call m(idct_8x4_internal_16bpc).main_pass1
+ pcmpeqd m1, m1
+ REPX {psubd x, m1}, m0, m6, m5, m3
+ call m(idct_8x4_internal_16bpc).round
+ REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+.pack_and_transpose:
+ packssdw m2, m3
+ packssdw m6, m7
+ packssdw m0, m1
+ packssdw m4, m5
+ jmp m(idct_8x4_internal_16bpc).transpose4x8packed
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+%endif
+ call .round3_and_write_8x8
+.zero:
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%undef mzero
+ RET
+
+ ; round (rounded right-shift by 5) before writing
+ ; data in m0-7
+ ; on x86-64, pw_2048 is in m8
+ ; .round1 is for m0-7
+ ; .round2 is for m0-6 & [rsp+gprsize*2]
+ ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
+ ; .round4 is x86-32-only; it is similar to .round2 but with the constant already in m7
+%if ARCH_X86_32
+.round1_and_write_8x8:
+ mova [rsp+gprsize*2], m7
+.round2_and_write_8x8:
+%endif
+.round3_and_write_8x8:
+ mova m7, [o(pw_2048)]
+%if ARCH_X86_32
+.round4_and_write_8x8:
+%endif
+ REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulhrsw m7, [rsp+gprsize*2]
+%if ARCH_X86_64
+ jmp .write_8x8
+.round2_and_write_8x8:
+ mova m7, [rsp+gprsize*2]
+.round1_and_write_8x8:
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+%endif
+
+ ; m0-7 have to-be-written data [pre-rounded]
+ ; on x86-64, m9-10 contain a zero/pixel_max
+ ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch
+ ; r0,1,3 contain dstq/strideq/stride3q
+ ; r5 is a scratch register
+.write_8x8:
+ lea r5, [dstq+strideq*4]
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r3]
+ paddw m4, [r5 +strideq*0]
+ paddw m5, [r5 +strideq*1]
+ paddw m6, [r5 +strideq*2]
+ paddw m7, [r5 +r3]
+%if ARCH_X86_64
+ REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+%else
+ mova [rsp+gprsize*2], m7
+ pxor m7, m7
+ REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmaxsw m7, [rsp+gprsize*2]
+ mova [rsp+gprsize*2], m7
+ mova m7, [o(pixel_10bpc_max)]
+ REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsw m7, [rsp+gprsize*2]
+%endif
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r3 ], m3
+ mova [r5 +strideq*0], m4
+ mova [r5 +strideq*1], m5
+ mova [r5 +strideq*2], m6
+ mova [r5 +r3 ], m7
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity, 6
+
+cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+4*16+1*gprsize], r1
+%endif
+ lea t0, [o(.pass1_main)]
+ jmp m(idct_8x8_internal_16bpc).pass1_full
+.pass1_main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call .round
+ jmp m(idct_8x8_internal_16bpc).pack_and_transpose
+.round:
+%if ARCH_X86_64
+ pcmpeqd m8, m8 ; -1
+ REPX {psubd x, m8 }, m0, m6
+ REPX {pxor x, m8 }, m1, m7, m3, m5
+ REPX {psrad x, 1 }, m0, m1, m6, m7
+ REPX {psubd x, m8 }, m1, m7
+ mova m8, [o(pd_6144)]
+ REPX {paddd x, m8 }, m2, m3, m4, m5
+ REPX {psrad x, 13 }, m2, m3, m4, m5
+%else
+ mova [r3+2*16], m0
+
+ pcmpeqd m0, m0 ; -1
+ mova m6, [o(pd_6144)]
+ REPX {pxor x, m0 }, m1, m7, m3, m5
+ REPX {psrad x, 1 }, m1, m7
+ REPX {psubd x, m0 }, m1, m7
+ REPX {paddd x, m6 }, m2, m3, m4, m5
+ REPX {psrad x, 13 }, m2, m3, m4, m5
+
+ mova m0, [r3+2*16]
+ psrld m6, 12 ; +1
+ paddd m0, m6
+ paddd m6, [r3+3*16]
+ REPX {psrad x, 1 }, m0, m6
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+%endif
+ call .round3_and_write_8x8
+ jmp m(idct_8x8_internal_16bpc).zero
+
+ ; round (rounded right-shift by 5) before writing; odd registers are negated
+ ; data in m0-7
+ ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11
+ ; .round1 is for m0-7
+ ; .round2 is for m0-6 & [rsp+gprsize*2]
+ ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32)
+%if ARCH_X86_64
+.round2_and_write_8x8:
+ mova m7, [rsp+gprsize*2]
+.round1_and_write_8x8:
+ REPX {pmulhrsw x, m8 }, m0, m2, m4, m6
+ REPX {pmulhrsw x, m11}, m1, m3, m5, m7
+ jmp m(idct_8x8_internal_16bpc).write_8x8
+%else
+.round1_and_write_8x8:
+ mova [rsp+gprsize*2], m7
+.round2_and_write_8x8:
+%endif
+.round3_and_write_8x8:
+ mova m7, [o(pw_2048)]
+ REPX {pmulhrsw x, m7}, m0, m2, m4, m6
+ mova m7, [o(pw_m2048)]
+ REPX {pmulhrsw x, m7}, m1, m3, m5
+ pmulhrsw m7, [rsp+gprsize*2]
+ jmp m(idct_8x8_internal_16bpc).write_8x8
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity, 6
+
+cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+4*16+1*gprsize], r1
+%endif
+ lea t0, [o(.pass1_main)]
+ jmp m(idct_8x8_internal_16bpc).pass1_full
+.pass1_main:
+ call m(iadst_8x4_internal_16bpc).main_pass1
+ call m(iadst_8x8_internal_16bpc).round
+ ; invert registers
+ packssdw m7, m6
+ packssdw m5, m4
+ packssdw m3, m2
+ packssdw m1, m0
+ mova m0, m7
+ mova m2, m5
+ mova m4, m3
+ mova m6, m1
+ jmp m(idct_8x4_internal_16bpc).transpose4x8packed
+
+.pass2:
+ lea dstq, [dstq+strideq*8]
+ sub dstq, strideq
+ neg strideq
+ jmp m(iadst_8x8_internal_16bpc).pass2
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+0*32]
+ mova m1, [cq+1*32]
+ mova m2, [cq+2*32]
+ mova m3, [cq+3*32]
+ mova m4, [cq+4*32]
+ mova m5, [cq+5*32]
+ mova m6, [cq+6*32]
+ mova m7, [cq+7*32]
+ packssdw m0, [cq+0*32+16]
+ packssdw m1, [cq+1*32+16]
+ packssdw m2, [cq+2*32+16]
+ packssdw m3, [cq+3*32+16]
+ packssdw m4, [cq+4*32+16]
+ packssdw m5, [cq+5*32+16]
+ packssdw m6, [cq+6*32+16]
+ packssdw m7, [cq+7*32+16]
+ mova [rsp+gprsize+16*1], m6
+ jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ lea r3, [strideq*3]
+%if ARCH_X86_64
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mova m8, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%else
+ mova [rsp+gprsize], m7
+ mova m7, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round4_and_write_8x8
+%endif
+ jmp m(idct_8x8_internal_16bpc).zero
+
+%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16
+%else
+ INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ mov r3d, 4
+%if stack_size_padded > 0
+ ; adjust to caller's stack allocation
+ add rsp, (12+ARCH_X86_64)*16
+%endif
+ jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, v
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+%if ARCH_X86_64
+DECLARE_REG_TMP 7
+%endif
+
+cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)]
+.pass1_full:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+%undef cmp
+ mov r6d, 4
+.zero_loop:
+ dec r6d
+ cmp eobb, byte [r5+r6]
+ jl .zero_loop
+ mov r5d, r6d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, [rsp+16*16+2*gprsize]
+ ; setup stack pointer
+ lea r3, [rsp+gprsize]
+%endif
+.loop_pass1:
+ mova m0, [cq+0*64+r5]
+ mova m1, [cq+1*64+r5]
+ mova m2, [cq+2*64+r5]
+ mova m3, [cq+3*64+r5]
+ mova m4, [cq+4*64+r5]
+ mova m5, [cq+5*64+r5]
+ mova m6, [cq+6*64+r5]
+ mova m7, [cq+7*64+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call t0
+
+ mova [cq+0*64+r5], m0
+ mova [cq+1*64+r5], m1
+ mova [cq+2*64+r5], m2
+ mova [cq+3*64+r5], m3
+ sub r5d, 16
+ jge .loop_pass1
+%if WIN64
+ POP r7
+%elif ARCH_X86_32
+ mov r1, [rsp+16*16+1*gprsize]
+%endif
+ jmp tx2q
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+
+ ; input is in cq+N*16, where N = 0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15
+ ; some are still pre-loaded from the final loop iteration in pass=1
+
+ mova m1, m2
+ mova m2, [cq+ 1*16]
+ mova m3, [cq+ 9*16]
+ mova m4, [cq+ 2*16]
+ mova m5, [cq+10*16]
+ mova m6, [cq+ 3*16]
+ mova m7, [cq+11*16]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ ; m7 is already stored in [rsp+gprsize+0*16]
+ mova m0, [cq+ 4*16]
+ mova m1, [cq+12*16]
+ mova m2, [cq+ 5*16]
+ mova m3, [cq+13*16]
+ mova m4, [cq+ 6*16]
+ mova m5, [cq+14*16]
+ mova m6, [cq+ 7*16]
+ mova m7, [cq+15*16]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+
+ ; out0-7 is in rsp+gprsize+3-10*mmsize
+ ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mov r6, dstq
+%else
+ mov [rsp+16*16+gprsize*1], dstq
+%endif
+ lea r3, [strideq*3]
+ lea dstq, [dstq+strideq*8]
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+%undef mzero
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+%if ARCH_X86_64
+ mov dstq, r6
+%else
+ mov dstq, [rsp+16*16+gprsize*1]
+%endif
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ RET
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity, v
+
+cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m4, [cq+ 9*16]
+ mova m5, [cq+13*16]
+ mova [rsp+gprsize+7*16], m0
+ mova [rsp+gprsize+8*16], m1
+ mova [rsp+gprsize+5*16], m4
+ mova [rsp+gprsize+6*16], m5
+ mova m0, m2
+ mova m1, m3
+ mova m2, [cq+ 1*16]
+ mova m3, [cq+ 5*16]
+ mova m4, [cq+ 2*16]
+ mova m5, [cq+ 6*16]
+ mova m6, [cq+11*16]
+ mova m7, [cq+15*16]
+ mova [rsp+gprsize+ 3*16], m4
+ mova [rsp+gprsize+ 4*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m4, [cq+10*16]
+ mova m5, [cq+14*16]
+ mova m6, [cq+ 3*16]
+ mova m7, [cq+ 7*16]
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
+
+%if ARCH_X86_64
+ mova m11, [o(pw_m2048)]
+ mova m8, [o(pw_2048)]
+ mova m10, [o(pixel_10bpc_max)]
+ pxor m9, m9
+ mov r6, dstq
+%else
+ mov [rsp+16*16+gprsize*1], dstq
+%endif
+ lea r3, [strideq*3]
+ lea dstq, [dstq+strideq*8]
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+%undef mzero
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+%if ARCH_X86_64
+ mov dstq, r6
+%else
+ mov dstq, [rsp+16*16+gprsize*1]
+%endif
+ call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
+ RET
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity, v
+
+cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+ lea r3, [strideq*3]
+ lea r3, [r3*5]
+ add dstq, r3
+ neg strideq
+ jmp m(iadst_8x16_internal_16bpc).pass2
+
+INV_TXFM_8X16_FN identity, dct, h
+INV_TXFM_8X16_FN identity, adst, h
+INV_TXFM_8X16_FN identity, flipadst, h
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ PUSH r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)]
+ jmp m(idct_8x16_internal_16bpc).pass1_full
+
+.pass2:
+%if ARCH_X86_64
+ mova m4, [o(pw_2048)]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mova m7, [o(pw_1697x16)]
+%endif
+ mov r5d, 4
+ lea r3, [strideq*3]
+.pass2_loop:
+ call .main
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).round1_and_write_8x4
+%else
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+%endif
+ REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28
+ dec r5d
+ jle .end
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+ 4*16]
+ mova m2, [cq+ 8*16]
+ mova m3, [cq+12*16]
+ jmp .pass2_loop
+.end:
+ RET
+.main:
+ ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
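+ ; pw_1697x16/32768 ~= 2*(sqrt(2)-1), so x ends up scaled by 2*sqrt(2),
+ ; the identity16 scale factor, with 16-bit saturation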
+%if ARCH_X86_32
+ mova m7, [o(pw_1697x16)]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+%else
+ pmulhrsw m8, m7, m0
+ pmulhrsw m9, m7, m1
+ pmulhrsw m10, m7, m2
+ pmulhrsw m11, m7, m3
+%endif
+ REPX {paddsw x, x}, m0, m1, m2, m3
+%if ARCH_X86_64
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m2, m10
+ paddsw m3, m11
+%else
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+%endif
+ ret
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16
+%else
+ INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 4
+.dconly:
+ add r5d, 384
+ sar r5d, 9
+.dconly2:
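+ ; pixel offset = (r5*2896 + 34816) >> 16: a rounded multiply by 2896
+ ; (sqrt(2)/2 in .12) fused with a rounded >>4 (34816 = 2048 + 8*4096);
+ ; pshuflw q1111 broadcasts the high word, performing the >>16 for free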
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m3, [o(pixel_10bpc_max)]
+ pxor m4, m4
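+ ; add the broadcast DC value to each row, clipping to the 10bpc range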
+.loop:
+ mova m1, [dstq+ 0]
+ mova m2, [dstq+16]
+ REPX {paddw x, m0}, m1, m2
+ REPX {pminsw x, m3}, m1, m2
+ REPX {pmaxsw x, m4}, m1, m2
+ mova [dstq+ 0], m1
+ mova [dstq+16], m2
+ add dstq, strideq
+ dec r3d
+ jg .loop
+ RET
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, identity
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+
+cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ ; set up the stack pointer
+ lea r3, [rsp+gprsize]
+
+ mova m0, [cq+ 1*16]
+ mova m1, [cq+ 3*16]
+ mova m2, [cq+ 5*16]
+ mova m3, [cq+ 7*16]
+ mova m4, [cq+ 9*16]
+ mova m5, [cq+11*16]
+ mova m6, [cq+13*16]
+ mova m7, [cq+15*16]
+ call .main_oddhalf
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+ 2*16]
+ mova m2, [cq+ 4*16]
+ mova m3, [cq+ 6*16]
+ mova m4, [cq+ 8*16]
+ mova m5, [cq+10*16]
+ mova m6, [cq+12*16]
+ mova m7, [cq+14*16]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ ; t0-7 are in m0-7
+
+ call .round
+
+%if ARCH_X86_64
+.pack_transpose:
+ ; transpose in two parts
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+.transpose:
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call .transpose4x8packed_hi
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m4, [r3+10*16]
+ mova m6, [r3+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ jmp tx2q
+%if ARCH_X86_64
+.transpose4x8packed_hi:
+ punpcklwd m9, m10, m14
+ punpckhwd m10, m14
+ punpckhwd m14, m8, m12
+ punpcklwd m8, m12
+
+ punpckhwd m11, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m12, m14, m10
+ punpcklwd m14, m10
+
+ punpcklwd m10, m11, m12
+ punpckhwd m11, m12
+ punpckhwd m9, m8, m14
+ punpcklwd m8, m14
+ ret
+%endif
+.main_oddhalf_fast: ; lower half zero
+ pmulld m7, m0, [o(pd_4076)]
+ pmulld m0, [o(pd_401)]
+ pmulld m6, m1, [o(pd_m1189)]
+ pmulld m1, [o(pd_3920)]
+%if ARCH_X86_32
+ mova m4, [o(pd_2048)]
+ REPX {paddd x, m4}, m1, m6
+ REPX {psrad x, 12}, m1, m6
+ mova [r3+1*16], m1
+%endif
+ pmulld m5, m2, [o(pd_3612)]
+ pmulld m2, [o(pd_1931)]
+%if ARCH_X86_32
+ pmulld m1, m3, [o(pd_m2598)]
+%else
+ pmulld m4, m3, [o(pd_m2598)]
+%endif
+ pmulld m3, [o(pd_3166)]
+ jmp .main_oddhalf_fast2
+.main_oddhalf:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
+.main_oddhalf_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t9
+ paddd m0, m4 ; t8
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
+ psubd m3, m1, m4 ; t10
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ psubd m2, m8, m6 ; t13
+ paddd m6, m8 ; t14
+ psubd m8, m7, m5 ; t12a
+ paddd m7, m5 ; t15a
+ REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
+ REPX {pmulld x, m14}, m2, m8, m3, m4
+ paddd m2, m11
+ paddd m8, m11
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m8, m4 ; t11
+ paddd m4, m8 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova [r3+4*16], m4
+ mova [r3+5*16], m5
+ mova [r3+6*16], m6
+ mova [r3+7*16], m7
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m4, [o(pd_2048)]
+
+ ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a
+
+ mova m2, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova m1, [r3+2*16]
+ mova m5, [r3+3*16]
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+
+ ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a
+
+ mova m0, [r3+0*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+.main_oddhalf_fast2:
+ REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3
+ REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3
+ psubd m4, m0, m1 ; t9
+ paddd m0, m1 ; t8
+ mova m1, [r3+1*16]
+ mova [r3+0*16], m4
+ psubd m4, m6, m2 ; t10
+ paddd m2, m6 ; t11
+ psubd m6, m1, m5 ; t13
+ paddd m5, m1 ; t12
+ psubd m1, m7, m3 ; t14
+ paddd m7, m3 ; t15
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7
+ pmaxsd m3, [r3+0*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7
+ pminsd m3, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784
+ ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4
+ mova m0, [r3+0*16]
+ mova m2, [r3+1*16]
+ psubd m5, m1, m4 ; t10
+ mova [r3+1*16], m5
+ paddd m1, m4 ; t9
+ psubd m4, m0, m2 ; t11a
+ paddd m0, m2 ; t8a
+ mova m5, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m2, m3, m6 ; t13
+ paddd m6, m3 ; t14
+ paddd m3, m7, m5 ; t15a
+ psubd m7, m5 ; t12a
+ mova [r3+0*16], m3
+ mova m3, [r3+1*16]
+ mova m5, [o(clip_18b_min)]
+ REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6
+ pmaxsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(clip_18b_max)]
+ REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6
+ pminsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(pd_2896)]
+ REPX {pmulld x, m5}, m2, m7, m3, m4
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m2, m7
+ paddd m5, m2, m3 ; t13a
+ psubd m2, m3 ; t10a
+ psubd m3, m7, m4 ; t11
+ paddd m4, m7 ; t12
+ REPX {psrad x, 12}, m5, m2, m3, m4
+ mova m7, [r3+0*16]
+ mova [r3+11*16], m0
+ mova [r3+10*16], m1
+ mova [r3+9*16], m2
+ mova [r3+8*16], m3
+ mova [r3+7*16], m4
+ mova [r3+6*16], m5
+ mova [r3+5*16], m6
+ mova [r3+4*16], m7
+%endif
+ ret
+.round:
+%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
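+ ; pcmpeqd yields -1 in every lane, so the psubd below adds the +1
+ ; rounding bias for the final >>1 of pass 1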
+ pcmpeqd m8, m8
+ REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova m8, [r3+1*16]
+ mova m9, [r3+2*16]
+ mova m10, [r3+3*16]
+ mova m11, [r3+4*16]
+ mova m12, [r3+5*16]
+ mova m13, [r3+6*16]
+ mova m14, [r3+7*16]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r3+0*16] ; out8
+ paddd m7, [r3+0*16] ; out7
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ; and out0-15 are now in m0-15
+%else
+ mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ pcmpeqd m0, m0
+ REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ mova [r3+ 1*16], m1
+ mova [r3+ 2*16], m2
+ mova m1, [r3+ 0*16]
+ psubd m1, m0
+ mova [r3+ 0*16], m1
+ mova m1, [r3+11*16]
+ mova m2, [r3+10*16]
+ psubd m0, m7, m1
+ paddd m7, m1
+ psubd m1, m6, m2
+ paddd m6, m2
+ REPX {psrad x, 1}, m0, m1, m6, m7
+ packssdw m0, m1 ; out8-9
+ packssdw m6, m7 ; out6-7
+ mova [r3+11*16], m6
+ mova m1, [r3+9*16]
+ mova m7, [r3+8*16]
+ psubd m2, m5, m1
+ paddd m5, m1
+ psubd m1, m4, m7
+ paddd m4, m7
+ REPX {psrad x, 1}, m2, m1, m4, m5
+ packssdw m2, m1 ; out10-11
+ packssdw m4, m5 ; out4-5
+ mova m1, [r3+2*16]
+ mova [r3+10*16], m4
+ mova m6, [r3+7*16]
+ mova m7, [r3+6*16]
+ psubd m4, m3, m6
+ paddd m3, m6
+ psubd m6, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 1}, m4, m6, m1, m3
+ packssdw m4, m6 ; out12-13
+ packssdw m1, m3 ; out2-3
+ mova m3, [r3+1*16]
+ mova [r3+9*16], m1
+ mova m1, [r3+0*16]
+ mova m5, [r3+5*16]
+ mova m7, [r3+4*16]
+ psubd m6, m3, m5
+ paddd m3, m5
+ psubd m5, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 1}, m6, m5, m1, m3
+ packssdw m6, m5 ; out14-15
+ packssdw m1, m3 ; out0-1
+ mova [r3+8*16], m1
+%endif
+ ret
+
+.pass2:
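+ ; pass 2 runs the 8bpc 8x4 kernel (pointer in r4) on the left 8 columns,
+ ; then again on the right 8 columns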
+ lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
+.pass2_loop:
+ lea r3, [strideq*3]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call r4
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+ REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%if ARCH_X86_64
+ mova m0, m8
+ mova m1, m9
+ mova m2, m10
+ mova m3, m11
+%else
+ mova m0, [rsp+gprsize+0*16]
+ mova m1, [rsp+gprsize+1*16]
+ mova m2, [rsp+gprsize+2*16]
+ mova m3, [rsp+gprsize+3*16]
+%endif
+ add dstq, 16
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call r4
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+ RET
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ ; set up the stack pointer
+ lea r3, [rsp+gprsize]
+ call .main
+%if ARCH_X86_64
+ jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ jmp tx2q
+%endif
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*16]
+ mova m1, [cq+13*16]
+ mova m2, [cq+ 6*16]
+ mova m3, [cq+ 9*16]
+ mova m4, [cq+10*16]
+ mova m5, [cq+ 5*16]
+ mova m6, [cq+14*16]
+ mova m7, [cq+ 1*16]
+ call .main_part1
+ mova m0, [cq+ 0*16]
+ mova m1, [cq+15*16]
+ mova m2, [cq+ 4*16]
+ mova m3, [cq+11*16]
+ mova m4, [cq+ 8*16]
+ mova m5, [cq+ 7*16]
+ mova m6, [cq+12*16]
+ mova m7, [cq+ 3*16]
+ call .main_part2
+.round:
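+ ; pass-1 rounding: out4-11 still carry the 2896 (.12) factor from
+ ; main_part1/main_part2, so they use (+6144) >> 13, where 6144 =
+ ; 2048 + 4096 fuses the >>12 of the multiply with a rounded >>1;
+ ; out0-3 and out12-15 only need (+1) >> 1; odd outputs are negated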
+%if ARCH_X86_64
+ mova m15, [o(pd_6144)]
+ psrld m14, 11 ; pd_1
+ pcmpeqd m8, m8 ; -1
+ psubd m13, m15, m14 ; pd_6143
+ REPX {paddd x, m14}, m0, m2
+ REPX {paddd x, m15}, m4, m6
+ REPX {pxor x, m8 }, m1, m3, m5, m7
+ REPX {psrad x, 1 }, m1, m3
+ REPX {paddd x, m15}, m5, m7
+ REPX {psubd x, m8 }, m1, m3
+ paddd m8, m15, m9
+ psubd m9, m13, m10
+ paddd m10, m15, m11
+ psubd m11, m13, m12
+ paddd m12, m14, [r3+3*16]
+ psubd m13, m14, [r3+2*16]
+ psubd m15, m14, [r3+0*16]
+ paddd m14, [r3+1*16]
+ REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15
+ REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+ mova [r3+8*16], m1
+ mova [r3+9*16], m3
+ mova m3, [o(pd_6144)]
+ pcmpeqd m1, m1
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m5, m6, m7
+ REPX {psrad x, 13}, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {pxor x, m1}, m5, m7
+ REPX {psubd x, m1}, m4, m6
+ REPX {psrad x, 1 }, m4, m5, m6, m7
+ REPX {psubd x, m1}, m5, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova m5, [r3+8*16]
+ mova m7, [r3+9*16]
+ mova [r3+8*16], m4
+ mova [r3+9*16], m6
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m0, m5, m2, m7
+ REPX {psrad x, 13}, m0, m5, m2, m7
+ packssdw m0, m5
+ packssdw m2, m7
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ REPX {psubd x, m1}, m4, m6
+ REPX {pxor x, m1}, m5, m7
+ REPX {psrad x, 1 }, m4, m5, m6, m7
+ REPX {psubd x, m1}, m5, m7
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+
+.main_part2:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380
+ psubd m8, m0, m4 ; t8a
+ paddd m0, m4 ; t0a
+ psubd m4, m1, m5 ; t9a
+ paddd m1, m5 ; t1a
+ psubd m5, m2, m6 ; t12a
+ paddd m2, m6 ; t4a
+ psubd m6, m3, m7 ; t13a
+ paddd m7, m3 ; t5a
+ REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ mova m15, [o(pd_4017)]
+ mova m10, [o(pd_799)]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+ psubd m3, m0, m2 ; t4
+ paddd m0, m2 ; t0
+ psubd m2, m1, m7 ; t5
+ paddd m1, m7 ; t1
+ psubd m7, m4, m6 ; t12a
+ paddd m4, m6 ; t8a
+ psubd m6, m8, m5 ; t13a
+ paddd m5, m8 ; t9a
+ REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15
+ mova m10, [r3+0*16] ; t2
+ mova m8, [r3+1*16] ; t3
+ psubd m9, m0, m10 ; t2a
+ paddd m0, m10 ; out0
+ psubd m10, m1, m8 ; t3a
+ paddd m1, m8 ; -out15
+ mova [r3+0*16], m1
+ mova m15, [r3+3*16] ; t7a
+ mova m1, [r3+2*16] ; t6a
+ psubd m8, m3, m15 ; t7
+ paddd m15, m3 ; out12
+ paddd m3, m2, m1 ; -out3
+ psubd m2, m1 ; t6
+ mova [r3+3*16], m15
+ mova [r3+1*16], m2
+ mova m1, [r3+7*16] ; t15
+ mova m2, [r3+6*16] ; t14
+ paddd m15, m7, m1 ; -out13
+ psubd m7, m1 ; t15a
+ psubd m11, m6, m2 ; t14a
+ paddd m2, m6 ; out2
+ mova [r3+2*16], m15
+ mova m1, [r3+4*16] ; t10a
+ mova m15, [r3+5*16] ; t11a
+ psubd m6, m4, m1 ; t10
+ paddd m1, m4 ; -out1
+ psubd m4, m5, m15 ; t11
+ paddd m5, m15 ; out14
+ REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
+ pmaxsd m12, [r3+1*16] ; t6
+ mova [r3+1*16], m5
+ REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
+ REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
+ paddd m5, m11, m7 ; -out5 (unshifted)
+ psubd m11, m7 ; out10 (unshifted)
+ paddd m7, m9, m10 ; -out7 (unshifted)
+ psubd m9, m10 ; out8 (unshifted)
+ psubd m10, m6, m4 ; -out9 (unshifted)
+ paddd m6, m4 ; out6 (unshifted)
+ paddd m4, m12, m8 ; out4 (unshifted)
+ psubd m12, m8 ; -out11 (unshifted)
+%else
+ mova [r3+8*16], m0
+ mova [r3+9*16], m1
+ mova [r3+10*16], m2
+ mova [r3+11*16], m3
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751
+ ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380
+ mova m0, [r3+8*16]
+ mova m1, [r3+9*16]
+ mova [r3+8*16], m4
+ mova m4, [r3+10*16]
+ mova [r3+9*16], m5
+ mova [r3+10*16], m6
+ mova m5, [r3+11*16]
+ mova [r3+11*16], m7
+ ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091
+ ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703
+ mova m2, [r3+8*16]
+ mova m6, [r3+9*16]
+ psubd m3, m0, m2 ; t8a
+ paddd m0, m2 ; t0a
+ mova [r3+8*16], m3
+ psubd m2, m1, m6 ; t9a
+ paddd m1, m6 ; t1a
+ mova m3, [r3+10*16]
+ psubd m6, m4, m3 ; t12a
+ paddd m4, m3 ; t4a
+ mova m3, [r3+11*16]
+ psubd m7, m5, m3 ; t13a
+ paddd m5, m3 ; t5a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+ pmaxsd m3, [r3+8*16]
+ mova [r3+8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+ pminsd m3, [r3+8*16]
+ mova [r3+8*16], m3
+ psubd m3, m0, m4 ; t4
+ paddd m0, m4 ; t0
+ psubd m4, m1, m5 ; t5
+ paddd m1, m5 ; t1
+ mova m5, [o(pd_2048)]
+ mova [r3+9*16], m1
+ mova [r3+10*16], m4
+ mova [r3+11*16], m3
+ mova m3, [r3+8*16]
+ mova [r3+8*16], m0
+ ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017
+ ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4
+ psubd m5, m2, m7 ; t12a
+ paddd m2, m7 ; t8a
+ psubd m7, m3, m6 ; t13a
+ paddd m6, m3 ; t9a
+ mova m0, [r3+8*16]
+ mova m1, [r3+9*16]
+ mova m4, [r3+10*16]
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+ pmaxsd m3, [r3+11*16]
+ mova [r3+8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+ pminsd m3, [r3+8*16]
+ mova [r3+8*16], m0
+ mova [r3+9*16], m1
+ mova [r3+10*16], m2
+ mova [r3+11*16], m6
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784
+ ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784
+ mova m0, [r3+7*16] ; t7a
+ mova m2, [r3+6*16] ; t6a
+ psubd m1, m3, m0 ; t7
+ paddd m0, m3 ; out12
+ paddd m3, m4, m2 ; -out3
+ psubd m4, m2 ; t6
+ mova [r3+7*16], m3
+ mova m3, [r3+3*16] ; t15
+ mova m2, [r3+2*16] ; t14
+ paddd m6, m5, m3 ; -out13
+ psubd m5, m3 ; t15a
+ psubd m3, m7, m2 ; t14a
+ paddd m2, m7 ; out2
+ mova [r3+6*16], m2
+ mova m7, [r3+0*16] ; t10a
+ mova m2, [r3+1*16] ; t11a
+ mova [r3+0*16], m0
+ mova [r3+1*16], m6
+ mova m6, [r3+11*16]
+ psubd m0, m6, m2 ; t11
+ paddd m6, m2 ; out14
+ mova [r3+2*16], m6
+ mova m2, [r3+10*16]
+ psubd m6, m2, m7 ; t10
+ paddd m2, m7 ; -out1
+ mova m7, [r3+5*16] ; t3
+ mova [r3+5*16], m2
+ mova [r3+10*16], m1
+ mova m1, [r3+9*16]
+ psubd m2, m1, m7 ; t3a
+ paddd m1, m7 ; -out15
+ mova [r3+3*16], m1
+ mova m1, [r3+4*16] ; t2
+ mova m7, [r3+8*16]
+ psubd m7, m1 ; t2a
+ paddd m1, [r3+8*16] ; out0
+ mova [r3+4*16], m1
+ mova m1, [o(clip_18b_min)]
+ REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pmaxsd m1, [r3+10*16]
+ mova [r3+10*16], m1
+ mova m1, [o(clip_18b_max)]
+ REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pminsd m1, [r3+10*16]
+ mova [r3+10*16], m1
+ mova m1, [o(pd_2896)]
+ REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
+ pmulld m1, [r3+10*16]
+ mova [r3+11*16], m3
+ psubd m3, m4, m1 ; -out11 (unshifted)
+ paddd m4, m1 ; out4 (unshifted)
+ psubd m1, m6, m0 ; -out9 (unshifted)
+ paddd m6, m0 ; out6 (unshifted)
+ psubd m0, m7, m2 ; out8 (unshifted)
+ paddd m7, m2 ; -out7 (unshifted)
+ mova m2, [r3+11*16]
+ mova [r3+11*16], m5
+ paddd m5, m2 ; -out5 (unshifted)
+ psubd m2, [r3+11*16] ; out10 (unshifted)
+ ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
+ ; r[-4,3] contain out0-3 and out12-15
+%endif
+ ret
+.main_part1:
+%if ARCH_X86_64
+ ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973
+ ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290
+ ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601
+ psubd m8, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m3, m7 ; t15a
+ paddd m7, m3 ; t7a
+ REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+ REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+ mova m15, [o(pd_2276)]
+ mova m10, [o(pd_3406)]
+ ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m8, m5 ; t15a
+ paddd m5, m8 ; t11a
+ REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+ REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+ mova m15, [o(pd_1567)]
+ mova m10, [o(pd_3784)]
+ ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15
+ ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+4*16], m4
+ mova [r3+5*16], m5
+ mova [r3+2*16], m2
+ mova [r3+3*16], m3
+ mova [r3+6*16], m6
+ mova [r3+7*16], m7
+%else
+ mova [r3+4*16], m0
+ mova [r3+5*16], m1
+ mova [r3+6*16], m2
+ mova [r3+7*16], m3
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106
+ ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601
+ mova [r3+0*16], m4
+ mova [r3+1*16], m5
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m0, [r3+4*16]
+ mova m1, [r3+5*16]
+ mova m2, [r3+6*16]
+ mova m7, [r3+7*16]
+ ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973
+ ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ psubd m6, m0, m4 ; t10a
+ paddd m0, m4 ; t2a
+ mova [r3+4*16], m6
+ mova m6, [r3+2*16]
+ mova m3, [r3+3*16]
+ psubd m4, m1, m5 ; t11a
+ paddd m1, m5 ; t3a
+ psubd m5, m2, m6 ; t14a
+ paddd m2, m6 ; t6a
+ psubd m6, m7, m3 ; t15a
+ paddd m7, m3 ; t7a
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+ pmaxsd m3, [r3+4*16]
+ mova [r3+4*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+ pminsd m3, [r3+4*16]
+ mova [r3+4*16], m3
+ psubd m3, m0, m2 ; t6
+ paddd m0, m2 ; t2
+ psubd m2, m1, m7 ; t7
+ paddd m1, m7 ; t3
+ mova [r3+5*16], m1
+ mova [r3+6*16], m3
+ mova [r3+7*16], m2
+ mova m1, [r3+4*16]
+ mova [r3+4*16], m0
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276
+ ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2
+ psubd m7, m4, m6 ; t14a
+ paddd m4, m6 ; t10a
+ psubd m6, m1, m5 ; t15a
+ paddd m5, m1 ; t11a
+ mova m1, [r3+5*16]
+ mova m3, [r3+6*16]
+ mova m2, [r3+7*16]
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+ pmaxsd m0, [r3+4*16]
+ mova [r3+4*16], m0
+ mova m0, [o(clip_18b_max)]
+ REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+ pminsd m0, [r3+4*16]
+ mova [r3+4*16], m0
+ mova [r3+5*16], m1
+ mova [r3+0*16], m4
+ mova [r3+1*16], m5
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567
+ ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567
+ mova [r3+6*16], m2
+ mova [r3+7*16], m3
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+%endif
+ ret
+
+.pass2:
+ lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+ lea r3, [rsp+gprsize]
+ call m(iadst_16x4_internal_16bpc).main
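+ ; flipadst is adst with the 16 outputs reversed: the pack operand order
+ ; swaps each output pair, and the register order is reversed afterwards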
+%if ARCH_X86_64
+ packssdw m1, m0
+ packssdw m3, m2
+ packssdw m5, m4
+ packssdw m7, m6
+ packssdw m9, m8
+ packssdw m11, m10
+ packssdw m13, m12
+ packssdw m15, m14
+ mova m0, m15
+ mova m2, m13
+ mova m4, m11
+ mova m6, m9
+ mova m8, m7
+ mova m10, m5
+ mova m12, m3
+ mova m14, m1
+ jmp m(idct_16x4_internal_16bpc).transpose
+%else
+ mova [rsp+gprsize+4*16], m0
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m4
+ mova [rsp+gprsize+7*16], m6
+ pshufd m6, [rsp+gprsize+ 8*16], q1032
+ pshufd m4, [rsp+gprsize+ 9*16], q1032
+ pshufd m2, [rsp+gprsize+10*16], q1032
+ pshufd m0, [rsp+gprsize+11*16], q1032
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ pshufd m6, [rsp+gprsize+ 4*16], q1032
+ pshufd m4, [rsp+gprsize+ 5*16], q1032
+ pshufd m2, [rsp+gprsize+ 6*16], q1032
+ pshufd m0, [rsp+gprsize+ 7*16], q1032
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ jmp tx2q
+%endif
+
+.pass2:
+ lea r3, [strideq*3]
+ lea dstq, [dstq+r3]
+ neg strideq
+ lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
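+ ; identity16 scales by 2*sqrt(2): multiply by 11586 (= 2*5793, i.e.
+ ; 2*sqrt(2) in .12), then (+6144) >> 13 fuses the >>12 of the multiply
+ ; with the rounded pass-1 >>1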
+%if ARCH_X86_64
+ mova m15, [o(pd_11586)]
+ pmulld m0, m15, [cq+ 0*16]
+ pmulld m1, m15, [cq+ 1*16]
+ pmulld m2, m15, [cq+ 2*16]
+ pmulld m3, m15, [cq+ 3*16]
+ pmulld m4, m15, [cq+ 4*16]
+ pmulld m5, m15, [cq+ 5*16]
+ pmulld m6, m15, [cq+ 6*16]
+ pmulld m7, m15, [cq+ 7*16]
+ pmulld m8, m15, [cq+ 8*16]
+ pmulld m9, m15, [cq+ 9*16]
+ pmulld m10, m15, [cq+10*16]
+ pmulld m11, m15, [cq+11*16]
+ pmulld m12, m15, [cq+12*16]
+ pmulld m13, m15, [cq+13*16]
+ pmulld m14, m15, [cq+14*16]
+ pmulld m15, [cq+15*16]
+ mova [cq+ 0*16], m15
+ mova m15, [o(pd_6144)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [cq+ 0*16]
+ REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+ add cq, 8*16
+ mov r5d, 2
+.loop_pass1:
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+0*16]
+ pmulld m1, m7, [cq+1*16]
+ pmulld m2, m7, [cq+2*16]
+ pmulld m3, m7, [cq+3*16]
+ pmulld m4, m7, [cq+4*16]
+ pmulld m5, m7, [cq+5*16]
+ pmulld m6, m7, [cq+6*16]
+ pmulld m7, [cq+7*16]
+ mova [cq+7*16], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [cq+7*16]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ dec r5d
+ jz .end_pass1
+ mova [rsp+gprsize+0*16], m0
+ mova [rsp+gprsize+1*16], m1
+ mova [rsp+gprsize+2*16], m2
+ mova [rsp+gprsize+3*16], m3
+ sub cq, 8*16
+ jmp .loop_pass1
+.end_pass1:
+ jmp tx2q
+%endif
+
+.pass2:
+%if ARCH_X86_64
+ mova m12, [o(pw_1697x8)]
+%endif
+ lea r4, [o(.main)]
+ jmp m(idct_16x4_internal_16bpc).pass2_loop
+.main:
+%if ARCH_X86_64
+ pmulhrsw m4, m0, m12
+ pmulhrsw m5, m1, m12
+ pmulhrsw m6, m2, m12
+ pmulhrsw m7, m3, m12
+%else
+ mova m7, [o(pw_1697x8)]
+ pmulhrsw m4, m0, m7
+ pmulhrsw m5, m1, m7
+ pmulhrsw m6, m2, m7
+ pmulhrsw m7, m3
+%endif
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ ret
+
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16
+%else
+ INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+%if ARCH_X86_32
+ add rsp, 1*16
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, 6
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6, 4, 6
+%else
+ mov [rsp+gprsize+12*16], r1
+ DECLARE_REG_TMP 1, 4, 3
+%endif
+ lea t0, [o(.main)]
+.loop_main:
+%undef cmp
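+ ; r5d = 16 if eob >= 10, else 0: the offset of the last 4-wide group of
+ ; input vectors with nonzero coefficients; pass 1 runs once per group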
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ shl r5d, 4
+
+ lea r3, [rsp+gprsize]
+.loop_pass1:
+ call t0
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+4*32+r5], m8
+ mova [cq+5*32+r5], m9
+ mova [cq+6*32+r5], m10
+ mova [cq+7*32+r5], m11
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+4*32+r5], m0
+ mova [cq+5*32+r5], m1
+ mova [cq+6*32+r5], m2
+ mova [cq+7*32+r5], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ pxor m7, m7
+ REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
+ test r5d, r5d
+ jz .end
+ mova [cq+0*32+r5], m0
+ mova [cq+1*32+r5], m1
+ mova [cq+2*32+r5], m2
+ mova [cq+3*32+r5], m3
+ xor r5d, r5d
+ jmp .loop_pass1
+.end:
+
+ jmp tx2q
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*32+r5]
+ mova m1, [cq+ 3*32+r5]
+ mova m2, [cq+ 5*32+r5]
+ mova m3, [cq+ 7*32+r5]
+ mova m4, [cq+ 9*32+r5]
+ mova m5, [cq+11*32+r5]
+ mova m6, [cq+13*32+r5]
+ mova m7, [cq+15*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+ 2*32+r5]
+ mova m2, [cq+ 4*32+r5]
+ mova m3, [cq+ 6*32+r5]
+ mova m4, [cq+ 8*32+r5]
+ mova m5, [cq+10*32+r5]
+ mova m6, [cq+12*32+r5]
+ mova m7, [cq+14*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+.pass2_main:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity, 6
+
+cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*32+r5]
+ mova m1, [cq+13*32+r5]
+ mova m2, [cq+ 6*32+r5]
+ mova m3, [cq+ 9*32+r5]
+ mova m4, [cq+10*32+r5]
+ mova m5, [cq+ 5*32+r5]
+ mova m6, [cq+14*32+r5]
+ mova m7, [cq+ 1*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(iadst_16x4_internal_16bpc).main_part1
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+15*32+r5]
+ mova m2, [cq+ 4*32+r5]
+ mova m3, [cq+11*32+r5]
+ mova m4, [cq+ 8*32+r5]
+ mova m5, [cq+ 7*32+r5]
+ mova m6, [cq+12*32+r5]
+ mova m7, [cq+ 3*32+r5]
+%if ARCH_X86_32
+ add r3, 8*16
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 8*16
+%endif
+ call m(iadst_16x4_internal_16bpc).main_part2
+ call m(iadst_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mova m11, [o(pw_m2048)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity, 6
+
+cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
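+ ; flip the packed adst output: pshufd q1032 swaps the two output rows
+ ; within each register, and the register order is reversed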
+ call m(iadst_16x8_internal_16bpc).main
+%if ARCH_X86_64
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, m14, q1032
+ pshufd m2, m12, q1032
+ pshufd m4, m10, q1032
+ pshufd m6, m8, q1032
+ mova m14, m1
+ mova m12, m3
+ mova m10, m5
+ mova m8, m7
+%else
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, [r3+11*16], q1032
+ pshufd m2, [r3+10*16], q1032
+ pshufd m4, [r3+9*16], q1032
+ pshufd m6, [r3+8*16], q1032
+ mova [r3+8*16], m7
+ mova [r3+9*16], m5
+ mova [r3+10*16], m3
+ mova [r3+11*16], m1
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ lea dstq, [dstq+strideq*8]
+ neg strideq
+ add dstq, strideq
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], strideq
+%endif
+ jmp m(iadst_16x8_internal_16bpc).pass2
+
+INV_TXFM_16X8_FN identity, dct, -54
+INV_TXFM_16X8_FN identity, adst, -54
+INV_TXFM_16X8_FN identity, flipadst, -54
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+ mov [rsp+gprsize+12*16], r1
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
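+ ; rect2 scale (x*2896 >> 12, rounded) followed by the identity16 scale:
+ ; multiply by 11586 with (+6144) >> 13, as in the 16x4 version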
+%if ARCH_X86_64
+ mova m15, [o(pd_2896)]
+ pmulld m0, m15, [cq+ 0*32+r5]
+ pmulld m1, m15, [cq+ 1*32+r5]
+ pmulld m2, m15, [cq+ 2*32+r5]
+ pmulld m3, m15, [cq+ 3*32+r5]
+ pmulld m4, m15, [cq+ 4*32+r5]
+ pmulld m5, m15, [cq+ 5*32+r5]
+ pmulld m6, m15, [cq+ 6*32+r5]
+ pmulld m7, m15, [cq+ 7*32+r5]
+ pmulld m8, m15, [cq+ 8*32+r5]
+ pmulld m9, m15, [cq+ 9*32+r5]
+ pmulld m10, m15, [cq+10*32+r5]
+ pmulld m11, m15, [cq+11*32+r5]
+ pmulld m12, m15, [cq+12*32+r5]
+ pmulld m13, m15, [cq+13*32+r5]
+ pmulld m14, m15, [cq+14*32+r5]
+ pmulld m15, [cq+15*32+r5]
+ mova [r3], m15
+ mova m15, [o(pd_2048)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ mova [r3], m15
+ mova m15, [o(pd_11586)]
+ REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ pmulld m15, [r3]
+ mova [r3], m15
+ mova m15, [o(pd_6144)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%else
+ mova m0, [cq+ 0*32+r5]
+ mova m1, [cq+ 1*32+r5]
+ mova m2, [cq+ 2*32+r5]
+ mova m3, [cq+ 3*32+r5]
+ mova m4, [cq+ 4*32+r5]
+ mova m5, [cq+ 5*32+r5]
+ mova m6, [cq+ 6*32+r5]
+ mova m7, [cq+ 7*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ mova [r3], m7
+ mova m7, [o(pd_11586)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m2
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m0, [cq+ 8*32+r5]
+ mova m1, [cq+ 9*32+r5]
+ mova m2, [cq+10*32+r5]
+ mova m3, [cq+11*32+r5]
+ mova m4, [cq+12*32+r5]
+ mova m5, [cq+13*32+r5]
+ mova m6, [cq+14*32+r5]
+ mova m7, [cq+15*32+r5]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ mova [r3], m7
+ mova m7, [o(pd_11586)]
+ REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pmulld m7, [r3]
+ mova [r3], m7
+ mova m7, [o(pd_6144)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+.pass2:
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize+12*16]
+%endif
+ mov r4d, 2
+%if ARCH_X86_64
+ mova m8, [o(pw_4096)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ jmp .loop_pass2_entry
+.loop_pass2:
+ mova m0, [cq+0*32+ 0]
+ mova m1, [cq+1*32+ 0]
+ mova m2, [cq+2*32+ 0]
+ mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+ mova m4, [cq+0*32+16]
+ mova m5, [cq+1*32+16]
+ mova m6, [cq+2*32+16]
+ mova m7, [cq+3*32+16]
+%if ARCH_X86_64
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%else
+ mova [rsp+gprsize], m7
+ mova m7, [o(pw_4096)]
+ call m(idct_8x8_internal_16bpc).round4_and_write_8x8
+%endif
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add dstq, 16
+ add cq, 4*32
+ dec r4d
+ jg .loop_pass2
+ RET
+
+%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+%if ARCH_X86_64
+ INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
+%else
+ INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
+%endif
+%ifidn %1_%2, dct_dct
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (5+ARCH_X86_64*3+WIN64)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, v
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6, 7
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+%elif ARCH_X86_32
+ DECLARE_REG_TMP 1, 6
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+.pass1_full:
+%undef cmp
+ mov t1d, 4
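+ ; walk the eob threshold table (r5) backwards to find the last 4-wide
+ ; group of input vectors with nonzero coefficients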
+.zero_loop:
+ dec t1d
+ cmp eobb, byte [r5+t1]
+ jb .zero_loop
+ mov r5d, t1d
+ shl r5d, 4
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r6, [rsp+16*16+2*gprsize]
+%endif
+ ; set up the stack pointer
+ lea r3, [rsp+gprsize]
+.loop_pass1:
+ call t0
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+4*64+r5], m8
+ mova [cq+5*64+r5], m9
+ mova [cq+6*64+r5], m10
+ mova [cq+7*64+r5], m11
+%else
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+4*64+r5], m0
+ mova [cq+5*64+r5], m1
+ mova [cq+6*64+r5], m2
+ mova [cq+7*64+r5], m3
+ mova m0, [rsp+gprsize+ 8*16]
+ mova m2, [rsp+gprsize+ 9*16]
+ mova m4, [rsp+gprsize+10*16]
+ mova m6, [rsp+gprsize+11*16]
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+0*64+r5], m0
+ mova [cq+1*64+r5], m1
+ mova [cq+2*64+r5], m2
+ mova [cq+3*64+r5], m3
+ pxor m0, m0
+ REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 16
+ jge .loop_pass1
+
+%if ARCH_X86_32
+ ; restore pic-ptr
+ mov r1, [rsp+16*16+1*gprsize]
+%endif
+ jmp tx2q
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mova m0, [cq+ 1*64+r5]
+ mova m1, [cq+ 3*64+r5]
+ mova m2, [cq+ 5*64+r5]
+ mova m3, [cq+ 7*64+r5]
+ mova m4, [cq+ 9*64+r5]
+ mova m5, [cq+11*64+r5]
+ mova m6, [cq+13*64+r5]
+ mova m7, [cq+15*64+r5]
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*64+r5]
+ mova m1, [cq+ 2*64+r5]
+ mova m2, [cq+ 4*64+r5]
+ mova m3, [cq+ 6*64+r5]
+ mova m4, [cq+ 8*64+r5]
+ mova m5, [cq+10*64+r5]
+ mova m6, [cq+12*64+r5]
+ mova m7, [cq+14*64+r5]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call .round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+.round:
+%if ARCH_X86_64
+ REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+ psrld m8, m11, 10 ; 2
+ REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ mova m8, [r3+1*16]
+ mova m9, [r3+2*16]
+ mova m10, [r3+3*16]
+ mova m11, [r3+4*16]
+ mova m12, [r3+5*16]
+ mova m13, [r3+6*16]
+ mova m14, [r3+7*16]
+ psubd m15, m0, m14 ; out15
+ paddd m0, m14 ; out0
+ psubd m14, m1, m13 ; out14
+ paddd m1, m13 ; out1
+ psubd m13, m2, m12 ; out13
+ paddd m2, m12 ; out2
+ psubd m12, m3, m11 ; out12
+ paddd m3, m11 ; out3
+ psubd m11, m4, m10 ; out11
+ paddd m4, m10 ; out4
+ psubd m10, m5, m9 ; out10
+ paddd m5, m9 ; out5
+ psubd m9, m6, m8 ; out9
+ paddd m6, m8 ; out6
+ psubd m8, m7, [r3+0*16] ; out8
+ paddd m7, [r3+0*16] ; out7
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ ; and out0-15 are now in m0-15
+%else
+ mova [r3+ 0*16], m0
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ pmaxsd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m7
+ mova m7, [o(clip_18b_max)]
+ REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ pminsd m7, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ mova m0, [o(pd_2)]
+ REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
+ paddd m0, [r3+ 0*16]
+ mova [r3+ 0*16], m0
+ mova [r3+ 1*16], m1
+ mova [r3+ 2*16], m2
+ mova m1, [r3+11*16]
+ mova m2, [r3+10*16]
+ psubd m0, m7, m1
+ paddd m7, m1
+ psubd m1, m6, m2
+ paddd m6, m2
+ REPX {psrad x, 2}, m0, m1, m6, m7
+ packssdw m0, m1 ; out8-9
+ packssdw m6, m7 ; out6-7
+ mova [r3+11*16], m6
+ mova m1, [r3+9*16]
+ mova m7, [r3+8*16]
+ psubd m2, m5, m1
+ paddd m5, m1
+ psubd m1, m4, m7
+ paddd m4, m7
+ REPX {psrad x, 2}, m2, m1, m4, m5
+ packssdw m2, m1 ; out10-11
+ packssdw m4, m5 ; out4-5
+ mova m1, [r3+2*16]
+ mova [r3+10*16], m4
+ mova m6, [r3+7*16]
+ mova m7, [r3+6*16]
+ psubd m4, m3, m6
+ paddd m3, m6
+ psubd m6, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 2}, m4, m6, m1, m3
+ packssdw m4, m6 ; out12-13
+ packssdw m1, m3 ; out2-3
+ mova m3, [r3+1*16]
+ mova [r3+9*16], m1
+ mova m1, [r3+0*16]
+ mova m5, [r3+5*16]
+ mova m7, [r3+4*16]
+ psubd m6, m3, m5
+ paddd m3, m5
+ psubd m5, m1, m7
+ paddd m1, m7
+ REPX {psrad x, 2}, m6, m5, m1, m3
+ packssdw m6, m5 ; out14-15
+ packssdw m1, m3 ; out0-1
+ mova [r3+8*16], m1
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [cq+0*64+ 0]
+ mova m1, [cq+2*64+ 0]
+ mova m2, [cq+0*64+16]
+ mova m3, [cq+2*64+16]
+ mova m4, [cq+0*64+32]
+ mova m5, [cq+2*64+32]
+ mova m6, [cq+0*64+48]
+ mova m7, [cq+2*64+48]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ ; m7 is already stored in [rsp+gprsize+0*16]
+ mova m0, [cq+1*64+ 0]
+ mova m1, [cq+3*64+ 0]
+ mova m2, [cq+1*64+16]
+ mova m3, [cq+3*64+16]
+ mova m4, [cq+1*64+32]
+ mova m5, [cq+3*64+32]
+ mova m6, [cq+1*64+48]
+ mova m7, [cq+3*64+48]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+
+ ; out0-7 are in rsp+gprsize+(3..10)*mmsize
+ ; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ lea dstq, [r7+strideq*8]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ lea dstq, [dstq+strideq*8]
+%endif
+ call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+ mov dstq, r7
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+%endif
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+ add r7, 16
+%define mzero m9
+%else
+ add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 64*4
+ REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+ dec r4d
+ jg .loop_pass2
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 2*64+r5]
+ mova m1, [cq+13*64+r5]
+ mova m2, [cq+ 6*64+r5]
+ mova m3, [cq+ 9*64+r5]
+ mova m4, [cq+10*64+r5]
+ mova m5, [cq+ 5*64+r5]
+ mova m6, [cq+14*64+r5]
+ mova m7, [cq+ 1*64+r5]
+ call m(iadst_16x4_internal_16bpc).main_part1
+ mova m0, [cq+ 0*64+r5]
+ mova m1, [cq+15*64+r5]
+ mova m2, [cq+ 4*64+r5]
+ mova m3, [cq+11*64+r5]
+ mova m4, [cq+ 8*64+r5]
+ mova m5, [cq+ 7*64+r5]
+ mova m6, [cq+12*64+r5]
+ mova m7, [cq+ 3*64+r5]
+ call m(iadst_16x4_internal_16bpc).main_part2
+ call .round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ ret
+.round:
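+ ; like the 16x4 adst rounding, but with a pass-1 shift of 2: out4-11
+ ; use (+10240) >> 14 (10240 = 2048 + 2*4096), out0-3/12-15 use (+2) >> 2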
+%if ARCH_X86_64
+ pcmpeqd m8, m8 ; -1
+ mova m15, [o(pd_10240)]
+ psrld m14, 10 ; +2
+ psubd m13, m14, m8 ; +3
+ REPX {pxor x, m8 }, m1, m3, m5, m7
+ REPX {paddd x, m14}, m0, m2
+ REPX {paddd x, m13}, m1, m3
+ REPX {paddd x, m15}, m4, m5, m6, m7
+ paddd m13, m15, m8 ; +10239
+ paddd m8, m15, m9
+ psubd m9, m13, m10
+ paddd m10, m15, m11
+ psubd m11, m13, m12
+ paddd m12, m14, [r3+3*16]
+ psubd m13, m14, [r3+2*16]
+ psubd m15, m14, [r3+0*16]
+ paddd m14, [r3+1*16]
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
+ REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+ mova [r3+8*16], m1
+ mova [r3+9*16], m3
+ mova m3, [o(pd_10240)]
+ pcmpeqd m1, m1
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m5, m6, m7
+ REPX {psrad x, 14}, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ mova m3, [o(pd_2)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m6
+ psubd m3, m1
+ REPX {paddd x, m3}, m5, m7
+ REPX {psrad x, 2 }, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+ mova m5, [r3+8*16]
+ mova m7, [r3+9*16]
+ mova [r3+8*16], m4
+ mova [r3+9*16], m6
+ mova m3, [o(pd_10240)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m0, m5, m2, m7
+ REPX {psrad x, 14}, m0, m5, m2, m7
+ packssdw m0, m5
+ packssdw m2, m7
+ mova m4, [r3+0*16]
+ mova m5, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ mova m3, [o(pd_2)]
+ REPX {pxor x, m1}, m5, m7
+ REPX {paddd x, m3}, m4, m6
+ psubd m3, m1
+ REPX {paddd x, m3}, m5, m7
+ REPX {psrad x, 2 }, m4, m5, m6, m7
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ mova m11, [o(pw_m2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [cq+0*64+32]
+ mova m1, [cq+1*64+32]
+ mova m2, [cq+2*64+16]
+ mova m3, [cq+3*64+16]
+ mova m4, [cq+0*64+ 0]
+ mova m5, [cq+1*64+ 0]
+ mova m6, [cq+2*64+48]
+ mova m7, [cq+3*64+48]
+ mova [rsp+gprsize+3*16], m0
+ mova [rsp+gprsize+4*16], m1
+ mova [rsp+gprsize+5*16], m2
+ mova [rsp+gprsize+6*16], m3
+ mova [rsp+gprsize+7*16], m4
+ mova [rsp+gprsize+8*16], m5
+ mova [rsp+gprsize+9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m0, [cq+2*64+ 0]
+ mova m1, [cq+3*64+ 0]
+ mova m2, [cq+0*64+16]
+ mova m3, [cq+1*64+16]
+ mova m4, [cq+2*64+32]
+ mova m5, [cq+3*64+32]
+ mova m6, [cq+0*64+48]
+ mova m7, [cq+1*64+48]
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main
+ call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end
+
+ ; out0-7 are in rsp+gprsize+(3..10)*mmsize
+ ; out8-14 are in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+ lea dstq, [r7+strideq*8]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ lea dstq, [dstq+strideq*8]
+%endif
+ call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+ mov dstq, r7
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+%endif
+ mova m0, [rsp+gprsize+ 3*16]
+ mova m1, [rsp+gprsize+ 4*16]
+ mova m2, [rsp+gprsize+ 5*16]
+ mova m3, [rsp+gprsize+ 6*16]
+ mova m4, [rsp+gprsize+ 7*16]
+ mova m5, [rsp+gprsize+ 8*16]
+ mova m6, [rsp+gprsize+ 9*16]
+ mova m7, [rsp+gprsize+10*16]
+ call m(iadst_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+ add r7, 16
+%define mzero m9
+%else
+ add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+ pxor m7, m7
+%endif
+ REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+ add cq, 64*4
+ REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+ dec r4d
+ jg .loop_pass2
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+ call m(iadst_16x16_internal_16bpc).main
+%if ARCH_X86_64
+ mova m1, m0
+ mova m3, m2
+ mova m5, m4
+ mova m7, m6
+ pshufd m0, m14, q1032
+ pshufd m2, m12, q1032
+ pshufd m4, m10, q1032
+ pshufd m6, m8, q1032
+ pshufd m8, m7, q1032
+ pshufd m10, m5, q1032
+ pshufd m12, m3, q1032
+ pshufd m14, m1, q1032
+%else
+ pshufd m1, m0, q1032
+ pshufd m3, m2, q1032
+ pshufd m5, m4, q1032
+ pshufd m7, m6, q1032
+ pshufd m0, [r3+11*16], q1032
+ pshufd m2, [r3+10*16], q1032
+ pshufd m4, [r3+9*16], q1032
+ pshufd m6, [r3+8*16], q1032
+ mova [r3+11*16], m1
+ mova [r3+10*16], m3
+ mova [r3+ 9*16], m5
+ mova [r3+ 8*16], m7
+%endif
+ ret
+
+.pass2:
+ lea r3, [strideq*3]
+ lea r3, [r3*5]
+ add dstq, r3
+ neg strideq
+ jmp m(iadst_16x16_internal_16bpc).pass2
+
+INV_TXFM_16X16_FN identity, dct, h
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+ mov [rsp+16*16+gprsize*1], r1
+ mov [rsp+16*16+gprsize*2], r6
+%endif
+ lea t0, [o(.main)]
+ jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
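+ ; identity16 scales by 2*sqrt(2): multiply by 11586, then (+10240) >> 14
+ ; fuses the >>12 of the multiply with the rounded pass-1 >>2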
+%if ARCH_X86_64
+ mova m15, [o(pd_11586)]
+ pmulld m0, m15, [cq+ 0*64+r5]
+ pmulld m1, m15, [cq+ 1*64+r5]
+ pmulld m2, m15, [cq+ 2*64+r5]
+ pmulld m3, m15, [cq+ 3*64+r5]
+ pmulld m4, m15, [cq+ 4*64+r5]
+ pmulld m5, m15, [cq+ 5*64+r5]
+ pmulld m6, m15, [cq+ 6*64+r5]
+ pmulld m7, m15, [cq+ 7*64+r5]
+ pmulld m8, m15, [cq+ 8*64+r5]
+ pmulld m9, m15, [cq+ 9*64+r5]
+ pmulld m10, m15, [cq+10*64+r5]
+ pmulld m11, m15, [cq+11*64+r5]
+ pmulld m12, m15, [cq+12*64+r5]
+ pmulld m13, m15, [cq+13*64+r5]
+ pmulld m14, m15, [cq+14*64+r5]
+ pmulld m15, [cq+15*64+r5]
+ mova [r3], m15
+ mova m15, [o(pd_10240)]
+ REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14
+ paddd m15, [r3]
+ REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m8, m9, m10, m11, m12, m13, m14, m15
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%else
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+ 0*64+r5]
+ pmulld m1, m7, [cq+ 1*64+r5]
+ pmulld m2, m7, [cq+ 2*64+r5]
+ pmulld m3, m7, [cq+ 3*64+r5]
+ pmulld m4, m7, [cq+ 4*64+r5]
+ pmulld m5, m7, [cq+ 5*64+r5]
+ pmulld m6, m7, [cq+ 6*64+r5]
+ pmulld m7, [cq+ 7*64+r5]
+ mova [r3], m7
+ mova m7, [o(pd_10240)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ mova [r3+8*16], m0
+ mova [r3+9*16], m2
+ mova [r3+10*16], m4
+ mova [r3+11*16], m6
+ mova m7, [o(pd_11586)]
+ pmulld m0, m7, [cq+ 8*64+r5]
+ pmulld m1, m7, [cq+ 9*64+r5]
+ pmulld m2, m7, [cq+10*64+r5]
+ pmulld m3, m7, [cq+11*64+r5]
+ pmulld m4, m7, [cq+12*64+r5]
+ pmulld m5, m7, [cq+13*64+r5]
+ pmulld m6, m7, [cq+14*64+r5]
+ pmulld m7, [cq+15*64+r5]
+ mova [r3], m7
+ mova m7, [o(pd_10240)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3]
+ REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+%endif
+ ret
+
+.pass2:
+%if ARCH_X86_64
+ mova m4, [o(pw_2048)]
+ mova m5, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mova m7, [o(pw_1697x16)]
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ mov r5d, 4
+ lea r3, [strideq*3]
+.pass2_loop:
+ mova m0, [cq+0*64+0]
+ mova m1, [cq+1*64+0]
+ mova m2, [cq+2*64+0]
+ mova m3, [cq+3*64+0]
+ call m(iidentity_8x16_internal_16bpc).main
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).round1_and_write_8x4
+%else
+ call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+%endif
+ REPX {mova [cq+x*16], m6}, 0, 4, 8, 12
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ dec r5w
+ jg .pass2_loop
+ add cq, 64*3
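+ ; bit 16 of r5d flags the second 8-column half (dec r5w above leaves it
+ ; untouched); btc raises carry the second time through, so the loop
+ ; exits once both halves are done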
+ btc r5d, 16
+ jc .end
+%if ARCH_X86_64
+ lea dstq, [r7+16]
+%else
+ mov dstq, [rsp+2*gprsize+16*16]
+ add dstq, 16
+%endif
+ add r5d, 4
+ jmp .pass2_loop
+.end:
+%if WIN64
+ mov r7, [rsp+16*16+gprsize]
+%endif
+ RET
+
+cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_5)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mov r5d, eobd
+ add eobb, 21
+ cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192
+ lea r4, [strideq*3]
+.loop:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {paddsw x, m5}, m0, m1, m2, m3
+ REPX {psraw x, 3 }, m0, m1, m2, m3
+ call .main_zero
+ add cq, 16
+ lea dstq, [dstq+strideq*4]
+ btc eobd, 16
+ jnc .loop
+ sub eobd, 64
+ jge .loop
+ RET
+ALIGN function_align
+.main_zero:
+ REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main:
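+ ; interleave/transpose the four packed registers so each one maps to a
+ ; single dst row, then add to dst and clamp to the 10 bpc pixel range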
+ punpckhwd m4, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m2, m3
+ punpcklwd m2, m3
+ punpckhwd m3, m0, m4
+ punpcklwd m0, m4
+ punpckhwd m4, m2, m1
+ punpcklwd m2, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ paddw m0, [dstq+strideq*0]
+ paddw m1, [dstq+strideq*1]
+ paddw m2, [dstq+strideq*2]
+ paddw m3, [dstq+r4 ]
+ REPX {pmaxsw x, m6}, m0, m1, m2, m3
+ REPX {pminsw x, m7}, m0, m1, m2, m3
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ mova [dstq+strideq*2], m2
+ mova [dstq+r4 ], m3
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_4096)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ mov r4d, eobd
+ add eobb, 21
+ cmovc eobd, r4d
+ lea r4, [strideq*3]
+ mov r5, dstq
+.loop:
+ mova m0, [cq+32*0]
+ packssdw m0, [cq+32*1]
+ mova m1, [cq+32*2]
+ packssdw m1, [cq+32*3]
+ mova m2, [cq+32*4]
+ packssdw m2, [cq+32*5]
+ mova m3, [cq+32*6]
+ packssdw m3, [cq+32*7]
+ REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .loop
+ add cq, 32*8-32
+ add r5, 16
+ mov dstq, r5
+ sub eobd, 64
+ jge .loop
+ RET
+
+cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%else
+ mova m8, [o(pw_2896x8)]
+ mova m9, [o(pw_1697x16)]
+ mova m11, [o(pw_8192)]
+%endif
+ mova m7, [o(pixel_10bpc_max)]
+ lea r4, [strideq*3]
+ pxor m6, m6
+%if ARCH_X86_64
+ paddw m10, m11, m11 ; pw_16384
+%endif
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+ sub cq, 128*8
+ lea dstq, [r5+strideq*8]
+ mov r5, dstq
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ add cq, 128*8-32
+ lea dstq, [r5+16]
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
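+ ; identity_16 scaling sketch: pmulhrsw by 1697*16 gives x*1697/2048
+ ; ~= 2*(sqrt(2)-1)*x, and the pw_16384 pmulhrsw halves that, so
+ ; x + (sqrt(2)-1)*x = sqrt(2)*x; the pw_8192 pmulhrsw then applies the
+ ; ~1/4 inter-pass downscale and rounding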
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+ pmulhrsw m4, m9, m0
+ pmulhrsw m5, m9, m1
+ REPX {pmulhrsw x, m10}, m4, m5
+%else
+ mova m6, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ mova m5, [o(pw_1697x16)]
+ pmulhrsw m4, m5, m0
+ pmulhrsw m5, m1
+ mova m6, [o(pw_16384)]
+ REPX {pmulhrsw x, m6 }, m4, m5
+%endif
+ paddsw m0, m4
+ paddsw m1, m5
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m2
+ pmulhrsw m5, m9, m3
+ REPX {pmulhrsw x, m10}, m4, m5
+%else
+ mova m5, [o(pw_1697x16)]
+ pmulhrsw m4, m5, m2
+ pmulhrsw m5, m3
+ REPX {pmulhrsw x, m6 }, m4, m5
+%endif
+ paddsw m2, m4
+ paddsw m3, m5
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+%else
+ psrlw m6, 1 ; pw_8192
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ pxor m6, m6
+%endif
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%else
+ mova m8, [o(pw_2896x8)]
+ mova m9, [o(pw_1697x16)]
+ mova m10, [o(pw_2048)]
+%endif
+ mova m7, [o(pixel_10bpc_max)]
+ lea r4, [strideq*3]
+ pxor m6, m6
+ mov r5, dstq
+ call .main
+ sub eobd, 36
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*1]
+ call .main
+ sub eobd, 107 ; eob < 143
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*2]
+ call .main
+ sub eobd, 128 ; eob < 271
+ jl .ret
+ call .main
+ add cq, 64*8-64
+ lea dstq, [r5+16*3]
+ call .main
+ sub eobd, 128 ; eob < 399
+ jl .ret
+ call .main
+.ret:
+ RET
+ALIGN function_align
+.main:
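+ ; here the combined identity gain is built as 4*x + 2*x*1697/2048
+ ; ~= 5.657*x = 4*sqrt(2)*x, with the pw_2048 pmulhrsw supplying the
+ ; final ~1/16 downshift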
+ mova m0, [cq+64*0]
+ packssdw m0, [cq+64*1]
+ mova m1, [cq+64*2]
+ packssdw m1, [cq+64*3]
+ mova m2, [cq+64*4]
+ packssdw m2, [cq+64*5]
+ mova m3, [cq+64*6]
+ packssdw m3, [cq+64*7]
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+%else
+ mova m6, [o(pw_2896x8)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+%endif
+ REPX {paddsw x, x }, m0, m1, m2, m3
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m0
+ pmulhrsw m5, m9, m1
+%else
+ mova m6, [o(pw_1697x16)]
+ pmulhrsw m4, m6, m0
+ pmulhrsw m5, m6, m1
+%endif
+ REPX {paddsw x, x }, m0, m1
+ paddsw m0, m4
+ paddsw m1, m5
+%if ARCH_X86_64
+ pmulhrsw m4, m9, m2
+ pmulhrsw m5, m9, m3
+%else
+ pmulhrsw m4, m6, m2
+ pmulhrsw m6, m3
+%endif
+ REPX {paddsw x, x }, m2, m3
+ paddsw m2, m4
+%if ARCH_X86_64
+ paddsw m3, m5
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3
+%else
+ paddsw m3, m6
+ mova m6, [o(pw_2048)]
+ REPX {pmulhrsw x, m6 }, m0, m1, m2, m3
+ pxor m6, m6
+%endif
+ REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
+cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%undef cmp
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ mova m5, [o(pw_8192)]
+ mova m7, [o(pixel_10bpc_max)]
+ pxor m6, m6
+ lea r4, [strideq*3]
+ mov r5, dstq
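+ ; the trailing digit columns below appear to diagram the diagonal order
+ ; in which the sub-blocks of the 32x32 are consumed as eob grows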
+ call .main ; 0
+ cmp eobd, 36
+ jl .ret
+ add cq, 128*8-32 ; 0 1
+ lea dstq, [r5+16] ; 1
+ call .main
+ call .main2
+ cmp eobd, 136
+ jl .ret
+ add cq, 128*16-64 ; 0 1 2
+ lea dstq, [r5+16*2] ; 1 2
+ call .main ; 2
+ call .main2
+ call .main2
+ cmp eobd, 300
+ jl .ret
+ add cq, 128*24-96 ; 0 1 2 3
+ add r5, 16*3 ; 1 2 3
+ mov dstq, r5 ; 2 3
+ call .main ; 3
+ call .main2
+ call .main2
+ call .main2
+ cmp eobd, 535
+ jl .ret
+ add cq, 128*24-96 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ mov r5, dstq ; 2 3 4
+ call .main ; 3 4
+ call .main2
+ call .main2
+ cmp eobd, 755
+ jl .ret
+ add cq, 128*16-64 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ mov r5, dstq ; 2 3 4 5
+ call .main ; 3 4 5
+ call .main2
+ cmp eobd, 911
+ jl .ret
+ add cq, 128*8-32 ; 0 1 2 3
+ lea dstq, [r5+strideq*8] ; 1 2 3 4
+ call .main ; 2 3 4 5
+.ret: ; 3 4 5 6
+ RET
+ALIGN function_align
+.main2:
+ sub cq, 128*8
+ sub dstq, 16
+.main:
+ mova m0, [cq+128*0]
+ packssdw m0, [cq+128*1]
+ mova m1, [cq+128*2]
+ packssdw m1, [cq+128*3]
+ mova m2, [cq+128*4]
+ packssdw m2, [cq+128*5]
+ mova m3, [cq+128*6]
+ packssdw m3, [cq+128*7]
+ REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+ call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+ lea dstq, [dstq+strideq*4]
+ add cq, 16
+ btc eobd, 16
+ jnc .main
+ ret
+
+cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
+ dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%define base $$
+ DECLARE_REG_TMP 0, 4
+%else
+ lea r6, [tbl_Nx32_odd_offset]
+%define base tbl_Nx32_odd_offset
+ DECLARE_REG_TMP 4, 7
+%if WIN64
+ mov [rsp+gprsize*1+35*16], r7
+%endif
+%endif
+%define o2(x) r6-base+x
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ mov [rsp+gprsize*1+35*16], r0
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
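+ ; tbl_8x32_2d gives the minimum eob at which each strip of 4
+ ; coefficients can carry nonzero data; strips above the current eob are
+ ; skipped in pass 1, but their rows in the scrambled scratch buffer must
+ ; still be zeroed because pass 2 reads all of it. Each tbl_Nx32_odd_offset
+ ; word packs the two byte offsets of a strip's odd output registers (m1/m3).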
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_8x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+ 3*16+r5*8], m0
+ mova [rsp+11*16+r5*8], m0
+ mova [rsp+ 3*16+t0*8], m0
+ mova [rsp+ 3*16+t1*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_8x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+35*16], eobd
+ mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+0*128+r5*8]
+ mova m1, [cq+1*128+r5*8]
+ mova m2, [cq+2*128+r5*8]
+ mova m3, [cq+3*128+r5*8]
+ mova m4, [cq+4*128+r5*8]
+ mova m5, [cq+5*128+r5*8]
+ mova m6, [cq+6*128+r5*8]
+ mova m7, [cq+7*128+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ mova m1, [o(pd_2)]
+ REPX {paddd x, m1}, m0, m6, m5, m3
+ call m(idct_8x4_internal_16bpc).round
+ REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [r3+ 3*16+r5*8], m0
+ mova [r3+11*16+r5*8], m2
+ mova [r3+ 3*16+t1*8], m1
+ mova [r3+ 3*16+t0*8], m3
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass 2 code starts here
+ ; m0 is already loaded from last iteration of first pass
+%if ARCH_X86_32
+ mov r0, [rsp+gprsize*1+35*16]
+%endif
+ mov eobd, [rsp+gprsize*0+35*16]
+ cmp eobd, 43
+ jl .load_veryfast
+ cmp eobd, 107
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+ call .pass2
+%if WIN64
+ mov r7, [rsp+gprsize*1+35*16]
+%endif
+ RET
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m1, [rsp+gprsize+16* 4]
+ mova m2, [rsp+gprsize+16* 5]
+ mova m3, [rsp+gprsize+16* 6]
+ mova m4, [rsp+gprsize+16* 7]
+ mova m5, [rsp+gprsize+16* 8]
+ mova m6, [rsp+gprsize+16* 9]
+ mova m7, [rsp+gprsize+16*10]
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 4*16], m1
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 6*16], m3
+ mova [rsp+gprsize+ 7*16], m4
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova m0, [rsp+gprsize+11*16]
+ mova m1, [rsp+gprsize+12*16]
+ mova m2, [rsp+gprsize+13*16]
+ mova m3, [rsp+gprsize+14*16]
+ mova m4, [rsp+gprsize+15*16]
+ mova m5, [rsp+gprsize+16*16]
+ mova m6, [rsp+gprsize+17*16]
+ mova m7, [rsp+gprsize+18*16]
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+ mova m7, [rsp+gprsize+ 0*16]
+ mova [rsp+gprsize+11*16], m0
+ mova [rsp+gprsize+12*16], m1
+ mova [rsp+gprsize+13*16], m2
+ mova [rsp+gprsize+14*16], m3
+ mova [rsp+gprsize+15*16], m4
+ mova [rsp+gprsize+16*16], m5
+ mova [rsp+gprsize+17*16], m6
+ mova [rsp+gprsize+18*16], m7
+ call r4
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%endif
+ lea r3, [strideq*3]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+11*16]
+ mova m1, [rsp+gprsize+12*16]
+ mova m2, [rsp+gprsize+13*16]
+ mova m3, [rsp+gprsize+14*16]
+ mova m4, [rsp+gprsize+15*16]
+ mova m5, [rsp+gprsize+16*16]
+ mova m6, [rsp+gprsize+17*16]
+ mova m7, [rsp+gprsize+18*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+19*16]
+ mova m1, [rsp+gprsize+20*16]
+ mova m2, [rsp+gprsize+21*16]
+ mova m3, [rsp+gprsize+22*16]
+ mova m4, [rsp+gprsize+23*16]
+ mova m5, [rsp+gprsize+24*16]
+ mova m6, [rsp+gprsize+25*16]
+ mova m7, [rsp+gprsize+26*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ mova m0, [rsp+gprsize+27*16]
+ mova m1, [rsp+gprsize+28*16]
+ mova m2, [rsp+gprsize+29*16]
+ mova m3, [rsp+gprsize+30*16]
+ mova m4, [rsp+gprsize+31*16]
+ mova m5, [rsp+gprsize+32*16]
+ mova m6, [rsp+gprsize+33*16]
+ mova m7, [rsp+gprsize+34*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ ret
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (31+2*ARCH_X86_64)*16
+ jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
+
+cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ mov [rsp+gprsize*1+76*16], r0
+%elif WIN64
+ mov [rsp+gprsize*1+76*16], r7
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m0
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m0
+ mova [rsp+44*16+r5*8], m0
+ mova [rsp+52*16+r5*8], m0
+ mova [rsp+44*16+t0*8], m0
+ mova [rsp+44*16+t1*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+76*16], eobd
+ mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*128+r5*8]
+ mova m1, [cq+ 3*128+r5*8]
+ mova m2, [cq+ 5*128+r5*8]
+ mova m3, [cq+ 7*128+r5*8]
+ mova m4, [cq+ 9*128+r5*8]
+ mova m5, [cq+11*128+r5*8]
+ mova m6, [cq+13*128+r5*8]
+ mova m7, [cq+15*128+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*128+r5*8]
+ mova m1, [cq+ 2*128+r5*8]
+ mova m2, [cq+ 4*128+r5*8]
+ mova m3, [cq+ 6*128+r5*8]
+ mova m4, [cq+ 8*128+r5*8]
+ mova m5, [cq+10*128+r5*8]
+ mova m6, [cq+12*128+r5*8]
+ mova m7, [cq+14*128+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+%if ARCH_X86_64
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m2
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t0*8], m3
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+44*16+r5*8], m8
+ mova [rsp+52*16+r5*8], m10
+ mova [rsp+44*16+t1*8], m9
+ mova [rsp+44*16+t0*8], m11
+%else
+ mova [rsp+44*16+r5*8], m0
+ mova [rsp+52*16+r5*8], m2
+ mova [rsp+44*16+t1*8], m1
+ mova [rsp+44*16+t0*8], m3
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m4, [r3+10*16]
+ mova m6, [r3+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+12*16+r5*8], m0
+ mova [rsp+20*16+r5*8], m2
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t0*8], m3
+%endif
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
+ add rsp, 9*16
+%if ARCH_X86_64
+ mov r6, dstq
+%else
+ mov dstq, [rsp+gprsize*1+67*16]
+%endif
+ mov eobd, [rsp+gprsize*0+67*16]
+ cmp eobd, 44
+ jl .load_veryfast
+ cmp eobd, 151
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+32]
+ mov r7, -4
+%else
+ lea r2, [rsp+67*16]
+ mov dword [r2+0*gprsize], 2
+%endif
+ jmp .loop_pass2_entry
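+ ; the first entry skips the m0 reload below: m0 still holds the last
+ ; pass-1 transpose result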
+.loop_pass2:
+ mova m0, [rsp+16* 3]
+.loop_pass2_entry:
+%if ARCH_X86_32
+ mov dstq, [r2+1*gprsize]
+%endif
+ call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
+ add rsp, 32*16
+%if ARCH_X86_64
+ add r7, 2
+ lea dstq, [r2+r7*8]
+ jl .loop_pass2
+%if WIN64
+ mov r7, [rsp+gprsize*1+3*16]
+%endif
+%else
+ add dword [r2+1*gprsize], 16
+ dec dword [r2+0*gprsize]
+ jg .loop_pass2
+%endif
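+ ; the pass-2 loop released 73*16 bytes of stack (9*16 during setup plus
+ ; 32*16 per iteration); bring x86inc's bookkeeping back in sync so the
+ ; RET below restores the stack correctly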
+%assign stack_size (stack_size-73*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-73*16)
+%assign stack_offset (stack_offset-73*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+ RET
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add rsp, (65+4*ARCH_X86_64)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+
+cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+%if ARCH_X86_32
+ LEA r6, $$
+%endif
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
+%if ARCH_X86_64
+ xor r5d, r5d
+ cmp eobd, 10
+ setge r5b
+%else
+ mov r5d, 1
+ cmp eobd, 10
+ sbb r5d, 0
+%endif
+ add r5d, r5d
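+ ; r5d = 2 if eob >= 10, else 0; pass 1 walks the two strips of 4
+ ; coefficients from second to first (r5d = 2 -> 0)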
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+ mova m0, [cq+32* 1+r5*8]
+ mova m1, [cq+32* 7+r5*8]
+ mova m2, [cq+32* 9+r5*8]
+ mova m3, [cq+32*15+r5*8]
+ mova m4, [cq+32*17+r5*8]
+ mova m5, [cq+32*23+r5*8]
+ mova m6, [cq+32*25+r5*8]
+ mova m7, [cq+32*31+r5*8]
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mov r3, rsp
+ call .main_oddhalf_part1
+ mova m0, [cq+32* 3+r5*8]
+ mova m1, [cq+32* 5+r5*8]
+ mova m2, [cq+32*11+r5*8]
+ mova m3, [cq+32*13+r5*8]
+ mova m4, [cq+32*19+r5*8]
+ mova m5, [cq+32*21+r5*8]
+ mova m6, [cq+32*27+r5*8]
+ mova m7, [cq+32*29+r5*8]
+ call .main_oddhalf_part2
+ mova m0, [cq+32* 2+r5*8]
+ mova m1, [cq+32* 6+r5*8]
+ mova m2, [cq+32*10+r5*8]
+ mova m3, [cq+32*14+r5*8]
+ mova m4, [cq+32*18+r5*8]
+ mova m5, [cq+32*22+r5*8]
+ mova m6, [cq+32*26+r5*8]
+ mova m7, [cq+32*30+r5*8]
+ add r3, 16*(16+4*ARCH_X86_32)
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+ mova m0, [cq+32* 0+r5*8]
+ mova m1, [cq+32* 4+r5*8]
+ mova m2, [cq+32* 8+r5*8]
+ mova m3, [cq+32*12+r5*8]
+ mova m4, [cq+32*16+r5*8]
+ mova m5, [cq+32*20+r5*8]
+ mova m6, [cq+32*24+r5*8]
+ mova m7, [cq+32*28+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call .round_dct32
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32* 8+r5*8], m8
+ mova [cq+32* 9+r5*8], m9
+ mova [cq+32*10+r5*8], m10
+ mova [cq+32*11+r5*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32* 4+r5*8], m8
+ mova [cq+32* 5+r5*8], m9
+ mova [cq+32* 6+r5*8], m10
+ mova [cq+32* 7+r5*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+32*12+r5*8], m8
+ mova [cq+32*13+r5*8], m9
+ mova [cq+32*14+r5*8], m10
+ mova [cq+32*15+r5*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32* 4+r5*8], m0
+ mova [cq+32* 5+r5*8], m1
+ mova [cq+32* 6+r5*8], m2
+ mova [cq+32* 7+r5*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32* 8+r5*8], m0
+ mova [cq+32* 9+r5*8], m1
+ mova [cq+32*10+r5*8], m2
+ mova [cq+32*11+r5*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+32*12+r5*8], m0
+ mova [cq+32*13+r5*8], m1
+ mova [cq+32*14+r5*8], m2
+ mova [cq+32*15+r5*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ pxor m7, m7
+ ; clear lower half of [cq]
+ REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ test r5d, r5d
+ jz .end_pass1
+ mova [cq+32* 0+r5*8], m0
+ mova [cq+32* 1+r5*8], m1
+ mova [cq+32* 2+r5*8], m2
+ mova [cq+32* 3+r5*8], m3
+ sub r5d, 2
+ jmp .loop_pass1
+.end_pass1:
+
+ ; pass=2; this has to be a call rather than a jump, otherwise the
+ ; stack pointer has the wrong offset inside the 8-bit code
+ mov r4d, 4
+ call m(idct_16x8_internal_16bpc).pass2_main
+ RET
+
+.main_oddhalf_part1_fast: ; lower half zero
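+ ; with in17..in31 == 0 the first ITX_MULSUB_2D stage degenerates into
+ ; plain multiplies (e.g. t16a = in1*201, t31a = in1*4091), so this path
+ ; jumps straight to the shared rounding code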
+ pmulld m7, m0, [o(pd_4091)]
+ pmulld m0, [o(pd_201)]
+ pmulld m4, m3, [o(pd_m2751)]
+%if ARCH_X86_32
+ pmulld m3, [o(pd_3035)]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m7
+ REPX {psrad x, 12}, m0, m7
+ mova [r3+3*16], m7
+ mova m7, m3
+ mova m3, m5
+%else
+ pmulld m3, [o(pd_3035)]
+%endif
+ pmulld m6, m1, [o(pd_m1380)]
+ pmulld m1, [o(pd_3857)]
+ pmulld m5, m2, [o(pd_3703)]
+ pmulld m2, [o(pd_1751)]
+ jmp .main_oddhalf_part1_fast2
+.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
+%if ARCH_X86_64
+ ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t17
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m7, m3 ; t30
+ paddd m7, m3 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ mova m15, [o(pd_4017)]
+ mova m10, [o(pd_799)]
+ ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m1 ; t28a
+ paddd m7, m1 ; t31a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m8, m2 ; t29
+ paddd m8, m2 ; t30
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
+ mova [r3+16*0], m0
+ mova [r3+16*1], m5
+ mova [r3+16*2], m4
+ mova [r3+16*3], m6
+ mova [r3+16*4], m3
+ mova [r3+16*5], m1
+ mova [r3+16*6], m8
+ mova [r3+16*7], m7
+%else
+ mova [r3+0*16], m2
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m5
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2D 6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a
+ mova m4, [r3+2*16]
+ mova m5, [r3+3*16]
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m2, [r3+0*16]
+ mova m7, [r3+1*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2D 4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m6, [r3+2*16]
+.main_oddhalf_part1_fast2:
+ REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7
+ REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7
+ psubd m3, m0, m4 ; t17
+ mova [r3+0*16], m3
+ mova m3, [r3+3*16]
+ paddd m0, m4 ; t16
+ psubd m4, m6, m2 ; t18
+ paddd m6, m2 ; t19
+ psubd m2, m1, m5 ; t29
+ paddd m1, m5 ; t28
+ psubd m5, m3, m7 ; t30
+ paddd m7, m3 ; t31
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pmaxsd m3, [r3+0*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pminsd m3, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m1
+ mova [r3+2*16], m6
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017 ; t17a, t30a
+ ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4 ; t29a, t18a
+ psubd m1, m5, m4 ; t18
+ paddd m5, m4 ; t17
+ psubd m4, m3, m2 ; t29
+ paddd m3, m2 ; t30
+ mova m0, [r3+0*16]
+ mova m2, [r3+1*16]
+ mova m6, [r3+2*16]
+ mova m7, [r3+3*16]
+ mova [r3+0*16], m3
+ psubd m3, m0, m6 ; t19a
+ paddd m0, m6 ; t16a
+ psubd m6, m7, m2 ; t28a
+ paddd m7, m2 ; t31a
+ mova m2, [o(clip_18b_min)]
+ REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pmaxsd m2, [r3+0*16]
+ mova [r3+0*16], m2
+ mova m2, [o(clip_18b_max)]
+ REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pminsd m2, [r3+0*16]
+ mova [r3+16*0], m0
+ mova [r3+16*1], m5
+ mova [r3+16*6], m2
+ mova [r3+16*7], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a
+ ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784 ; t19, t28
+ mova [r3+16*2], m4
+ mova [r3+16*3], m6
+ mova [r3+16*4], m3
+ mova [r3+16*5], m1
+%endif
+ ret
+.main_oddhalf_part2_fast: ; lower half zero
+ pmulld m7, m0, [o(pd_m601)]
+ pmulld m0, [o(pd_4052)]
+ pmulld m4, m3, [o(pd_3290)]
+%if ARCH_X86_32
+ pmulld m3, [o(pd_2440)]
+ mova m5, [o(pd_2048)]
+ REPX {paddd x, m5}, m0, m7
+ REPX {psrad x, 12}, m0, m7
+ mova [r3+11*16], m7
+ mova m7, m3
+ mova m3, m5
+%else
+ pmulld m3, [o(pd_2440)]
+%endif
+ pmulld m6, m1, [o(pd_3973)]
+ pmulld m1, [o(pd_995)]
+ pmulld m5, m2, [o(pd_m2106)]
+ pmulld m2, [o(pd_3513)]
+ jmp .main_oddhalf_part2_fast2
+.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
+%if ARCH_X86_64
+ ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+ REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+ psubd m8, m0, m4 ; t25
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m7, m3 ; t22
+ paddd m7, m3 ; t23
+ REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+ REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+ mova m15, [o(pd_2276)]
+ mova m10, [o(pd_3406)]
+ ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
+ ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m1 ; t20a
+ paddd m7, m1 ; t23a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m8, m2 ; t26
+ paddd m8, m2 ; t25
+ REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+ REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+ mova m15, [o(pd_3784)]
+ mova m10, [o(pd_1567)]
+ ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20
+ mova m9, [r3+16*0] ; t16a
+ mova m10, [r3+16*1] ; t17
+ psubd m2, m9, m7 ; t23
+ paddd m9, m7 ; t16
+ psubd m7, m10, m5 ; t22a
+ paddd m10, m5 ; t17a
+ REPX {pmaxsd x, m12}, m9, m10, m2, m7
+ REPX {pminsd x, m13}, m9, m10, m2, m7
+ mova [r3+16*0], m9
+ mova [r3+16*1], m10
+ mova m9, [r3+16*2] ; t18a
+ mova m10, [r3+16*3] ; t19
+ psubd m5, m9, m1 ; t21
+ paddd m9, m1 ; t18
+ psubd m1, m10, m6 ; t20a
+ paddd m10, m6 ; t19a
+ REPX {pmaxsd x, m12}, m9, m10, m5, m1
+ REPX {pminsd x, m13}, m9, m10, m5, m1
+ mova [r3+16*2], m9
+ mova [r3+16*3], m10
+ mova m9, [r3+16*4] ; t28
+ mova m10, [r3+16*5] ; t29a
+ psubd m6, m9, m3 ; t27a
+ paddd m9, m3 ; t28a
+ psubd m3, m10, m4 ; t26
+ paddd m10, m4 ; t29
+ REPX {pmaxsd x, m12}, m9, m10, m6, m3
+ REPX {pminsd x, m13}, m9, m10, m6, m3
+ REPX {pmulld x, m14}, m6, m3, m1, m5
+ paddd m6, m11
+ paddd m3, m11
+ psubd m4, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m5 ; t21a
+ paddd m3, m5 ; t26a
+ REPX {psrad x, 12 }, m4, m1, m3, m6
+ mova [r3+16*4], m4
+ mova [r3+16*5], m1
+ mova m4, [r3+16*6] ; t30
+ mova m1, [r3+16*7] ; t31a
+ psubd m5, m4, m8 ; t25a
+ paddd m4, m8 ; t30a
+ psubd m8, m1, m0 ; t24
+ paddd m1, m0 ; t31
+ REPX {pmaxsd x, m12}, m8, m5, m4, m1
+ REPX {pminsd x, m13}, m8, m5, m4, m1
+ REPX {pmulld x, m14}, m5, m8, m7, m2
+ paddd m5, m11
+ paddd m8, m11
+ psubd m0, m5, m7 ; t22
+ paddd m5, m7 ; t25
+ psubd m7, m8, m2 ; t23a
+ paddd m2, m8 ; t24a
+ REPX {psrad x, 12 }, m0, m7, m2, m5
+ mova [r3+16*6], m0
+ mova [r3+16*7], m7
+ mova [r3+16*8], m2
+ mova [r3+16*9], m5
+ mova [r3+16*10], m3
+ mova [r3+16*11], m6
+ mova [r3+16*12], m9
+ mova [r3+16*13], m10
+ mova [r3+16*14], m4
+ mova [r3+16*15], m1
+%else
+ mova [r3+ 8*16], m2
+ mova [r3+ 9*16], m3
+ mova [r3+10*16], m4
+ mova [r3+11*16], m5
+ mova m3, [o(pd_2048)]
+ ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a
+ ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a
+ mova m2, [r3+ 8*16]
+ mova m4, [r3+10*16]
+ mova m5, [r3+11*16]
+ mova [r3+ 8*16], m0
+ mova [r3+10*16], m6
+ mova [r3+11*16], m7
+ mova m7, [r3+ 9*16]
+ mova [r3+ 9*16], m1
+ ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a
+ mova m0, [r3+ 8*16]
+ mova m1, [r3+ 9*16]
+ mova m6, [r3+10*16]
+.main_oddhalf_part2_fast2:
+ REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6
+ REPX {psrad x, 12}, m1, m2, m7, m4, m5, m6
+ psubd m3, m0, m4 ; t25
+ mova [r3+ 8*16], m3
+ mova m3, [r3+11*16]
+ paddd m0, m4 ; t24
+ psubd m4, m6, m2 ; t26
+ paddd m6, m2 ; t27
+ psubd m2, m1, m5 ; t21
+ paddd m1, m5 ; t20
+ psubd m5, m3, m7 ; t22
+ paddd m7, m3 ; t23
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pmaxsd m3, [r3+ 8*16]
+ mova [r3+ 8*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+ pminsd m3, [r3+ 8*16]
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m1
+ mova [r3+10*16], m6
+ mova [r3+11*16], m7
+ mova m7, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a
+ psubd m1, m5, m4 ; t21
+ paddd m5, m4 ; t22
+ psubd m4, m3, m2 ; t26
+ paddd m3, m2 ; t25
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+ 9*16]
+ mova m6, [r3+10*16]
+ mova m7, [r3+11*16]
+ mova [r3+ 8*16], m3
+ psubd m3, m0, m6 ; t27a
+ paddd m0, m6 ; t24a
+ psubd m6, m7, m2 ; t20a
+ paddd m7, m2 ; t23a
+ mova m2, [o(clip_18b_min)]
+ REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pmaxsd m2, [r3+ 8*16]
+ mova [r3+ 8*16], m2
+ mova m2, [o(clip_18b_max)]
+ REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+ pminsd m2, [r3+ 8*16]
+ mova [r3+ 8*16], m0
+ mova [r3+ 9*16], m2
+ mova [r3+14*16], m5
+ mova [r3+15*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a
+ ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4 ; t27, t20
+ mova [r3+10*16], m3
+ mova m0, [o(clip_18b_min)]
+ mova m2, [o(clip_18b_max)]
+ mova m5, [r3+16*2] ; t18a
+ mova m7, [r3+16*3] ; t19
+ psubd m3, m5, m1 ; t21
+ paddd m5, m1 ; t18
+ psubd m1, m7, m6 ; t20a
+ paddd m7, m6 ; t19a
+ REPX {pmaxsd x, m0}, m5, m7, m3, m1
+ REPX {pminsd x, m2}, m5, m7, m3, m1
+ mova [r3+16*2], m5
+ mova [r3+16*3], m7
+ mova [r3+11*16], m3
+ mova m3, [r3+10*16]
+ mova m5, [r3+16*4] ; t28
+ mova m7, [r3+16*5] ; t29a
+ psubd m6, m5, m3 ; t27a
+ paddd m5, m3 ; t28a
+ psubd m3, m7, m4 ; t26
+ paddd m7, m4 ; t29
+ REPX {pmaxsd x, m0}, m5, m7, m6, m3
+ REPX {pminsd x, m2}, m5, m7, m6, m3
+ mova [r3+16*12], m5
+ mova [r3+16*13], m7
+ mova m5, [o(pd_2048)]
+ mova m7, [o(pd_2896)]
+ mova m4, [r3+11*16]
+ REPX {pmulld x, m7}, m6, m3, m1, m4
+ paddd m6, m5
+ paddd m3, m5
+ psubd m5, m6, m1 ; t20
+ paddd m6, m1 ; t27
+ psubd m1, m3, m4 ; t21a
+ paddd m3, m4 ; t26a
+ REPX {psrad x, 12}, m5, m1, m3, m6
+ mova [r3+16*4], m5
+ mova [r3+16*5], m1
+ mova [r3+16*10], m3
+ mova [r3+16*11], m6
+
+ mova m5, [r3+14*16]
+ mova m6, [r3+15*16]
+ mova m3, [r3+16*0] ; t16a
+ mova m4, [r3+16*1] ; t17
+ psubd m1, m3, m6 ; t23
+ paddd m3, m6 ; t16
+ psubd m6, m4, m5 ; t22a
+ paddd m4, m5 ; t17a
+ REPX {pmaxsd x, m0}, m3, m4, m1, m6
+ REPX {pminsd x, m2}, m3, m4, m1, m6
+ mova [r3+16*0], m3
+ mova [r3+16*1], m4
+ mova m5, [r3+ 8*16]
+ mova m3, [r3+ 9*16]
+ mova [r3+ 8*16], m1
+ mova [r3+ 9*16], m6
+ mova m4, [r3+16*6] ; t30
+ mova m1, [r3+16*7] ; t31a
+ psubd m6, m1, m5 ; t24
+ paddd m1, m5 ; t31
+ psubd m5, m4, m3 ; t25a
+ paddd m4, m3 ; t30a
+ REPX {pmaxsd x, m0}, m6, m5, m4, m1
+ REPX {pminsd x, m2}, m6, m5, m4, m1
+ mova [r3+16*14], m4
+ mova [r3+16*15], m1
+ mova m4, [o(pd_2048)]
+ mova m1, [r3+ 9*16]
+ mova m2, [r3+ 8*16]
+ REPX {pmulld x, m7}, m5, m6, m1, m2
+ paddd m5, m4
+ paddd m6, m4
+ psubd m0, m5, m1 ; t22
+ paddd m5, m1 ; t25
+ psubd m1, m6, m2 ; t23a
+ paddd m2, m6 ; t24a
+ REPX {psrad x, 12}, m0, m1, m2, m5
+ mova [r3+16*6], m0
+ mova [r3+16*7], m1
+ mova [r3+16*8], m2
+ mova [r3+16*9], m5
+%endif
+ ret
+
+ ; final sum/sub stage for both the idct16 and idct32 halves, plus the
+ ; final downshift
+%macro IDCT32_END 6 ; in/out1 (doubles as the row index), out2-4, tmp, shift
+ mova m%4, [r3+16*(23-%1)]
+ pmaxsd m%1, m12
+ pminsd m%1, m13
+ psubd m%3, m%1, m%4 ; idct16 out15 - n
+ paddd m%1, m%4 ; idct16 out0 + n
+ pmaxsd m%1, m12
+ pmaxsd m%3, m12
+ pminsd m%1, m13
+ pminsd m%3, m13
+ paddd m%1, m11
+ paddd m%3, m11
+ mova m%5, [r3+16*( 0+%1)]
+ mova m%2, [r3+16*(15-%1)]
+ psubd m%4, m%1, m%2 ; out31 - n
+ paddd m%1, m%2 ; out0 + n
+ paddd m%2, m%3, m%5 ; out15 - n
+ psubd m%3, m%5 ; out16 + n
+ REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%endmacro
+
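+ ; pass-1 finalization for dct32 columns: sum/sub the even (idct16) half
+ ; against the odd half stored at [r3], clip, add the pd_2 bias
+ ; (m11 = 2048 >> 10) and shift right by 2; x86-64 packs the results to
+ ; words here, x86-32 does the same work in a dword loop and packs later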
+.round_dct32:
+%if ARCH_X86_64
+ psrld m11, 10 ; pd_2
+ IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31
+ mova [r3+ 0*16], m6
+ mova [r3+23*16], m7
+ IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30
+ packssdw m0, m1 ; 0 1
+ packssdw m14, m15 ; 14 15
+ packssdw m8, m6 ; 16 17
+ packssdw m7, m9 ; 30 31
+ mova [r3+16*15], m14
+ mova [r3+16*14], m7
+ IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29
+ IDCT32_END 3, 14, 1, 9, 6, 2 ; 3 12 19 28
+ packssdw m2, m3 ; 2 3
+ packssdw m14, m15 ; 12 13
+ packssdw m10, m1 ; 18 19
+ packssdw m9, m7 ; 28 29
+ mova [r3+16*13], m14
+ mova [r3+16*12], m9
+ IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27
+ IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26
+ packssdw m4, m5 ; 4 5
+ packssdw m14, m15 ; 10 11
+ packssdw m1, m3 ; 20 21
+ packssdw m9, m7 ; 26 27
+ mova [r3+16*11], m14
+ mova [r3+16*10], m9
+ mova m6, [r3+ 0*16]
+ mova m7, [r3+23*16]
+ IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25
+ IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24
+ packssdw m6, m7 ; 6 7
+ packssdw m11, m15 ; 8 9
+ packssdw m14, m3 ; 22 23
+ packssdw m9, m5 ; 24 25
+ mova [r3+16*9], m11
+ mova [r3+16*8], m9
+ mova m12, m1
+ ret
+%else
+ mova [r3+16*16], m0
+ mova [r3+17*16], m1
+ mova [r3+18*16], m2
+ mova [r3+19*16], m3
+ mova [r3+20*16], m4
+ mova [r3+21*16], m5
+ mova [r3+22*16], m6
+ mova [r3+23*16], m7
+ mova m1, [o(pd_2)]
+ mova m2, [o(clip_18b_min)]
+ mova m3, [o(clip_18b_max)]
+
+ mov r4, 15*16
+.loop_dct32_end:
+ mova m0, [r3+16*16]
+ mova m6, [r3+16*24]
+ pmaxsd m0, m2
+ pminsd m0, m3
+ psubd m5, m0, m6 ; idct16 out15 - n
+ paddd m0, m6 ; idct16 out0 + n
+ pmaxsd m0, m2
+ pmaxsd m5, m2
+ pminsd m0, m3
+ pminsd m5, m3
+ paddd m0, m1
+ paddd m5, m1
+ mova m7, [r3]
+ mova m4, [r3+r4]
+ psubd m6, m0, m4 ; out31 - n
+ paddd m0, m4 ; out0 + n
+ paddd m4, m5, m7 ; out15 - n
+ psubd m5, m7 ; out16 + n
+ REPX {psrad x, 2}, m0, m5, m4, m6
+ mova [r3], m0
+ mova [r3+r4], m4
+ mova [r3+16*16], m5
+ mova [r3+24*16], m6
+ add r3, 16
+ sub r4, 32
+ jg .loop_dct32_end
+ ret
+%endif
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 8
+.dconly1:
+ add r5d, 640
+ sar r5d, 10
+.dconly2:
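+ ; broadcast the high word of r5d*2896+34816, i.e. (r5d*2896+34816) >> 16
+ ; (2896 ~= sqrt(2)*2048), to all lanes and add it to every pixel,
+ ; clamping to the 10 bpc range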
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+.dconly_loop:
+ mova m1, [dstq+16*0]
+ mova m2, [dstq+16*1]
+ mova m3, [dstq+16*2]
+ mova m4, [dstq+16*3]
+ REPX {paddw x, m0}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m2
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
+ mov r5d, 8
+.zero_loop:
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x16_2d)+r5]
+ jl .zero_loop
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+64* 1+r5*8]
+ mova m1, [cq+64* 7+r5*8]
+ mova m2, [cq+64* 9+r5*8]
+ mova m3, [cq+64*15+r5*8]
+ mova m4, [cq+64*17+r5*8]
+ mova m5, [cq+64*23+r5*8]
+ mova m6, [cq+64*25+r5*8]
+ mova m7, [cq+64*31+r5*8]
+ mov r3, rsp
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+
+ mova m0, [cq+64* 3+r5*8]
+ mova m1, [cq+64* 5+r5*8]
+ mova m2, [cq+64*11+r5*8]
+ mova m3, [cq+64*13+r5*8]
+ mova m4, [cq+64*19+r5*8]
+ mova m5, [cq+64*21+r5*8]
+ mova m6, [cq+64*27+r5*8]
+ mova m7, [cq+64*29+r5*8]
+%if ARCH_X86_32
+ add r3, 16*8
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 16*8
+%endif
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ add r3, 16*(16+4*ARCH_X86_32)
+
+ mova m0, [cq+64* 2+r5*8]
+ mova m1, [cq+64* 6+r5*8]
+ mova m2, [cq+64*10+r5*8]
+ mova m3, [cq+64*14+r5*8]
+ mova m4, [cq+64*18+r5*8]
+ mova m5, [cq+64*22+r5*8]
+ mova m6, [cq+64*26+r5*8]
+ mova m7, [cq+64*30+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+64* 0+r5*8]
+ mova m1, [cq+64* 4+r5*8]
+ mova m2, [cq+64* 8+r5*8]
+ mova m3, [cq+64*12+r5*8]
+ mova m4, [cq+64*16+r5*8]
+ mova m5, [cq+64*20+r5*8]
+ mova m6, [cq+64*24+r5*8]
+ mova m7, [cq+64*28+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call .round_dct32
+
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64* 8+r5*8], m8
+ mova [cq+64* 9+r5*8], m9
+ mova [cq+64*10+r5*8], m10
+ mova [cq+64*11+r5*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64* 4+r5*8], m8
+ mova [cq+64* 5+r5*8], m9
+ mova [cq+64* 6+r5*8], m10
+ mova [cq+64* 7+r5*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [cq+64*12+r5*8], m8
+ mova [cq+64*13+r5*8], m9
+ mova [cq+64*14+r5*8], m10
+ mova [cq+64*15+r5*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64* 4+r5*8], m0
+ mova [cq+64* 5+r5*8], m1
+ mova [cq+64* 6+r5*8], m2
+ mova [cq+64* 7+r5*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64* 8+r5*8], m0
+ mova [cq+64* 9+r5*8], m1
+ mova [cq+64*10+r5*8], m2
+ mova [cq+64*11+r5*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [cq+64*12+r5*8], m0
+ mova [cq+64*13+r5*8], m1
+ mova [cq+64*14+r5*8], m2
+ mova [cq+64*15+r5*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [cq+64* 0+r5*8], m0
+ mova [cq+64* 1+r5*8], m1
+ mova [cq+64* 2+r5*8], m2
+ mova [cq+64* 3+r5*8], m3
+ pxor m0, m0
+ REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2; this has to be a call rather than a jump, otherwise the
+ ; stack pointer has the wrong offset inside the 8-bit code
+ call .pass2
+ RET
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 4
+ jmp m(idct_16x16_internal_16bpc).loop_pass2
+
+.round_dct32:
+%if ARCH_X86_64
+ psrld m11, 11 ; pd_1
+ IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31
+ mova [r3+ 0*16], m6
+ mova [r3+23*16], m7
+ IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30
+ packssdw m0, m1 ; 0 1
+ packssdw m14, m15 ; 14 15
+ packssdw m8, m6 ; 16 17
+ packssdw m7, m9 ; 30 31
+ mova [r3+16*15], m14
+ mova [r3+16*14], m7
+ IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29
+ IDCT32_END 3, 14, 1, 9, 6, 1 ; 3 12 19 28
+ packssdw m2, m3 ; 2 3
+ packssdw m14, m15 ; 12 13
+ packssdw m10, m1 ; 18 19
+ packssdw m9, m7 ; 28 29
+ mova [r3+16*13], m14
+ mova [r3+16*12], m9
+ IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27
+ IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26
+ packssdw m4, m5 ; 4 5
+ packssdw m14, m15 ; 10 11
+ packssdw m1, m3 ; 20 21
+ packssdw m9, m7 ; 26 27
+ mova [r3+16*11], m14
+ mova [r3+16*10], m9
+ mova m6, [r3+ 0*16]
+ mova m7, [r3+23*16]
+ IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25
+ IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24
+ packssdw m6, m7 ; 6 7
+ packssdw m11, m15 ; 8 9
+ packssdw m14, m3 ; 22 23
+ packssdw m9, m5 ; 24 25
+ mova [r3+16*9], m11
+ mova [r3+16*8], m9
+ mova m12, m1
+ ret
+%else
+ mova [r3+16*16], m0
+ mova [r3+17*16], m1
+ mova [r3+18*16], m2
+ mova [r3+19*16], m3
+ mova [r3+20*16], m4
+ mova [r3+21*16], m5
+ mova [r3+22*16], m6
+ mova [r3+23*16], m7
+ pcmpeqd m1, m1 ; -1
+ mova m2, [o(clip_18b_min)]
+ mova m3, [o(clip_18b_max)]
+
+ mov r4, 15*16
+.loop_dct32_end:
+ mova m0, [r3+16*16]
+ mova m6, [r3+16*24]
+ psubd m5, m0, m6 ; idct16 out15 - n
+ paddd m0, m6 ; idct16 out0 + n
+ pmaxsd m0, m2
+ pmaxsd m5, m2
+ pminsd m0, m3
+ pminsd m5, m3
+ psubd m0, m1
+ psubd m5, m1
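+ ; m1 == -1, so subtracting it adds the +1 rounding bias for the >>1 below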
+ mova m7, [r3]
+ mova m4, [r3+r4]
+ psubd m6, m0, m4 ; out31 - n
+ paddd m0, m4 ; out0 + n
+ paddd m4, m5, m7 ; out15 - n
+ psubd m5, m7 ; out16 + n
+ REPX {psrad x, 1}, m0, m5, m4, m6
+ mova [r3], m0
+ mova [r3+r4], m4
+ mova [r3+16*16], m5
+ mova [r3+24*16], m6
+ add r3, 16
+ sub r4, 32
+ jg .loop_dct32_end
+ ret
+%endif
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%if ARCH_X86_32
+ mov [rsp+5*32*16+1*gprsize], dstq
+%elif WIN64
+ mov [rsp+5*32*16+1*gprsize], r7
+%endif
+%undef cmp
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ mova [rsp+32*16+r5*8+0*32*16], m0
+ mova [rsp+40*16+r5*8+0*32*16], m0
+ mova [rsp+32*16+t0*8+0*32*16], m0
+ mova [rsp+32*16+t1*8+0*32*16], m0
+ mova [rsp+32*16+r5*8+1*32*16], m0
+ mova [rsp+40*16+r5*8+1*32*16], m0
+ mova [rsp+32*16+t0*8+1*32*16], m0
+ mova [rsp+32*16+t1*8+1*32*16], m0
+ mova [rsp+32*16+r5*8+2*32*16], m0
+ mova [rsp+40*16+r5*8+2*32*16], m0
+ mova [rsp+32*16+t0*8+2*32*16], m0
+ mova [rsp+32*16+t1*8+2*32*16], m0
+ mova [rsp+32*16+r5*8+3*32*16], m0
+ mova [rsp+40*16+r5*8+3*32*16], m0
+ mova [rsp+32*16+t0*8+3*32*16], m0
+ mova [rsp+32*16+t1*8+3*32*16], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+5*32*16], eobd
+.loop_pass1:
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128* 7+r5*8]
+ mova m2, [cq+128* 9+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ mova m4, [cq+128*17+r5*8]
+ mova m5, [cq+128*23+r5*8]
+ mova m6, [cq+128*25+r5*8]
+ mova m7, [cq+128*31+r5*8]
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mov r3, rsp
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128* 5+r5*8]
+ mova m2, [cq+128*11+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ mova m4, [cq+128*19+r5*8]
+ mova m5, [cq+128*21+r5*8]
+ mova m6, [cq+128*27+r5*8]
+ mova m7, [cq+128*29+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128* 6+r5*8]
+ mova m2, [cq+128*10+r5*8]
+ mova m3, [cq+128*14+r5*8]
+ mova m4, [cq+128*18+r5*8]
+ mova m5, [cq+128*22+r5*8]
+ mova m6, [cq+128*26+r5*8]
+ mova m7, [cq+128*30+r5*8]
+ add r3, 16*(16+4*ARCH_X86_32)
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 4+r5*8]
+ mova m2, [cq+128* 8+r5*8]
+ mova m3, [cq+128*12+r5*8]
+ mova m4, [cq+128*16+r5*8]
+ mova m5, [cq+128*20+r5*8]
+ mova m6, [cq+128*24+r5*8]
+ mova m7, [cq+128*28+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+2*32*16], m8
+ mova [rsp+40*16+r5*8+2*32*16], m10
+ mova [rsp+32*16+t1*8+2*32*16], m9
+ mova [rsp+32*16+t0*8+2*32*16], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+1*32*16], m8
+ mova [rsp+40*16+r5*8+1*32*16], m10
+ mova [rsp+32*16+t1*8+1*32*16], m9
+ mova [rsp+32*16+t0*8+1*32*16], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+32*16+r5*8+3*32*16], m8
+ mova [rsp+40*16+r5*8+3*32*16], m10
+ mova [rsp+32*16+t1*8+3*32*16], m9
+ mova [rsp+32*16+t0*8+3*32*16], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+1*32*16], m0
+ mova [rsp+40*16+r5*8+1*32*16], m2
+ mova [rsp+32*16+t1*8+1*32*16], m1
+ mova [rsp+32*16+t0*8+1*32*16], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+2*32*16], m0
+ mova [rsp+40*16+r5*8+2*32*16], m2
+ mova [rsp+32*16+t1*8+2*32*16], m1
+ mova [rsp+32*16+t0*8+2*32*16], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+32*16+r5*8+3*32*16], m0
+ mova [rsp+40*16+r5*8+3*32*16], m2
+ mova [rsp+32*16+t1*8+3*32*16], m1
+ mova [rsp+32*16+t0*8+3*32*16], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ pxor m7, m7
+ ; clear lower half of [cq]
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+ mova [rsp+32*16+r5*8+0*32*16], m0
+ mova [rsp+40*16+r5*8+0*32*16], m2
+ mova [rsp+32*16+t1*8+0*32*16], m1
+ mova [rsp+32*16+t0*8+0*32*16], m3
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
+ mov eobd, [rsp+gprsize*0+5*32*16]
+ add rsp, 29*16
+ cmp eobd, 36
+ jl .load_veryfast
+ cmp eobd, 136
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+64]
+ mov r7, -8
+%else
+ lea r2, [rsp+(4*32+3)*16]
+ mov dword [r2+0*gprsize], 4
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1
+
+cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
+ 0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0
+ mov [rsp+gprsize*1+(64*2+12)*16], r0
+ mov [rsp+gprsize*2+(64*2+12)*16], r1
+ mov [rsp+gprsize*3+(64*2+12)*16], r2
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7
+ mov [rsp+gprsize*1+(64*2+12)*16], r9
+%if WIN64
+ mov [rsp+gprsize*2+(64*2+12)*16], r7
+ mov [rsp+gprsize*3+(64*2+12)*16], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
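+ ; for Nx64, each pair of tbl_Nx64_offset words packs four byte offsets:
+ ; where the four output registers of a strip land in the 64-row buffer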
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m0
+ mova [rsp+12*16+t2*8], m0
+ mova [rsp+12*16+t3*8], m0
+ mova [rsp+76*16+t0*8], m0
+ mova [rsp+76*16+t1*8], m0
+ mova [rsp+76*16+t2*8], m0
+ mova [rsp+76*16+t3*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_16x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
+ mov [rsp+gprsize*0+(64*2+12)*16], eobd
+ mov r3, rsp
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 6, 0
+ mov r2, [rsp+gprsize*3+(64*2+12)*16]
+ mov [rsp+gprsize*3+(64*2+12)*16], r6
+%endif
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+ 1*128+r5*8]
+ mova m1, [cq+ 3*128+r5*8]
+ mova m2, [cq+ 5*128+r5*8]
+ mova m3, [cq+ 7*128+r5*8]
+ mova m4, [cq+ 9*128+r5*8]
+ mova m5, [cq+11*128+r5*8]
+ mova m6, [cq+13*128+r5*8]
+ mova m7, [cq+15*128+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+ 0*128+r5*8]
+ mova m1, [cq+ 2*128+r5*8]
+ mova m2, [cq+ 4*128+r5*8]
+ mova m3, [cq+ 6*128+r5*8]
+ mova m4, [cq+ 8*128+r5*8]
+ mova m5, [cq+10*128+r5*8]
+ mova m6, [cq+12*128+r5*8]
+ mova m7, [cq+14*128+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ call m(idct_16x16_internal_16bpc).round
+%if ARCH_X86_64
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m8, m9
+ packssdw m10, m11
+ packssdw m12, m13
+ packssdw m14, m15
+%endif
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+%if ARCH_X86_64
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+76*16+t0*8], m8
+ mova [rsp+76*16+t1*8], m9
+ mova [rsp+76*16+t2*8], m10
+ mova [rsp+76*16+t3*8], m11
+%else
+ mova [rsp+76*16+t0*8], m0
+ mova [rsp+76*16+t1*8], m1
+ mova [rsp+76*16+t2*8], m2
+ mova [rsp+76*16+t3*8], m3
+ mova m0, [rsp+ 8*16]
+ mova m2, [rsp+ 9*16]
+ mova m4, [rsp+10*16]
+ mova m6, [rsp+11*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [rsp+12*16+t0*8], m0
+ mova [rsp+12*16+t1*8], m1
+ mova [rsp+12*16+t2*8], m2
+ mova [rsp+12*16+t3*8], m3
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*3+(64*2+12)*16]
+%endif
+ pxor m7, m7
+ REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
+ mov eobd, [rsp+gprsize*0+(64*2+12)*16]
+ cmp eobd, 151
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+ add rsp, 9*16
+
+%if ARCH_X86_64
+ lea r2, [dstq+32]
+ mov r7, -4
+%else
+ lea r2, [rsp+(64*2+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 2
+%endif
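+ ; each pass-2 iteration handles one 8-column slice of 64 rows; on x86-32
+ ; the two 8bpc kernel pointers and the loop state live in stack slots
+ ; because all registers are taken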
+.loop_pass2:
+%if ARCH_X86_32
+ mov dstq, [r2+1*gprsize]
+%endif
+ call .pass2
+ add rsp, 64*16
+%if ARCH_X86_64
+ add r7, 2
+ lea dstq, [r2+r7*8]
+ jl .loop_pass2
+%else
+ add dword [r2+1*gprsize], 16
+ dec dword [r2+0*gprsize]
+ jg .loop_pass2
+%endif
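+ ; same stack bookkeeping as in 16x32 above, here for (64*2+9)*16 bytes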
+%assign stack_size (stack_size-(64*2+9)*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-(64*2+9)*16)
+%assign stack_offset (stack_offset-(64*2+9)*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+%if ARCH_X86_64
+ mov r9, [rsp+gprsize*1+3*16]
+%if WIN64
+ mov r7, [rsp+gprsize*2+3*16]
+ mov r8, [rsp+gprsize*3+3*16]
+%endif
+%endif
+ RET
+
+.pass2:
+%if ARCH_X86_32
+ lea r5, [o(itx8_start)]
+%endif
+ mova m0, [rsp+gprsize+16* 3]
+ mova m1, [rsp+gprsize+16* 4]
+ mova m2, [rsp+gprsize+16* 5]
+ mova m3, [rsp+gprsize+16* 6]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 4*16], m1
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 6*16], m3
+ mova [rsp+gprsize+ 7*16], m4
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+ 9*16], m6
+ mova [rsp+gprsize+10*16], m7
+ mova m0, [rsp+gprsize+16*11]
+ mova m1, [rsp+gprsize+16*12]
+ mova m2, [rsp+gprsize+16*13]
+ mova m3, [rsp+gprsize+16*14]
+ pxor m4, m4
+ REPX {mova x, m4}, m5, m6, m7
+ call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+ mova m7, [rsp+gprsize+ 0*16]
+ mova [rsp+gprsize+11*16], m0
+ mova [rsp+gprsize+12*16], m1
+ mova [rsp+gprsize+13*16], m2
+ mova [rsp+gprsize+14*16], m3
+ mova [rsp+gprsize+15*16], m4
+ mova [rsp+gprsize+16*16], m5
+ mova [rsp+gprsize+17*16], m6
+ mova [rsp+gprsize+18*16], m7
+%if ARCH_X86_64
+ call r8
+%else
+ call [r2+4*gprsize]
+%endif
+ mova [rsp+gprsize+ 3*16], m0
+ mova [rsp+gprsize+ 5*16], m2
+ mova [rsp+gprsize+ 8*16], m5
+ mova [rsp+gprsize+10*16], m7
+%if ARCH_X86_64
+ call r9
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%else
+ call [r2+5*gprsize]
+%endif
+ lea r3, [strideq*3]
+ lea r4, [rsp+gprsize+ 3*16]
+%if ARCH_X86_64
+ mov r6d, 8
+%else
+ mov dword [r2+2*gprsize], 8
+%endif
+.loop_write:
+ mova m0, [r4+0*16]
+ mova m1, [r4+1*16]
+ mova m2, [r4+2*16]
+ mova m3, [r4+3*16]
+ mova m4, [r4+4*16]
+ mova m5, [r4+5*16]
+ mova m6, [r4+6*16]
+ mova m7, [r4+7*16]
+ call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+ lea dstq, [dstq+strideq*8]
+ add r4, 8*16
+%if ARCH_X86_64
+ dec r6d
+%else
+ dec dword [r2+2*gprsize]
+%endif
+ jg .loop_write
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add r5d, 640
+ sar r5d, 10
+ add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
+ 0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0
+ mov [rsp+gprsize*1+(64*4+32)*16], r0
+ mov [rsp+gprsize*2+(64*4+32)*16], r1
+ mov [rsp+gprsize*3+(64*4+32)*16], r2
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7
+ mov [rsp+gprsize*1+(64*4+32)*16], r9
+%if WIN64
+ mov [rsp+gprsize*2+(64*4+32)*16], r7
+ mov [rsp+gprsize*3+(64*4+32)*16], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 7*2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ mova [rsp+ 32*16+t0*8], m0
+ mova [rsp+ 32*16+t1*8], m0
+ mova [rsp+ 32*16+t2*8], m0
+ mova [rsp+ 32*16+t3*8], m0
+ mova [rsp+ 96*16+t0*8], m0
+ mova [rsp+ 96*16+t1*8], m0
+ mova [rsp+ 96*16+t2*8], m0
+ mova [rsp+ 96*16+t3*8], m0
+ mova [rsp+160*16+t0*8], m0
+ mova [rsp+160*16+t1*8], m0
+ mova [rsp+160*16+t2*8], m0
+ mova [rsp+160*16+t3*8], m0
+ mova [rsp+224*16+t0*8], m0
+ mova [rsp+224*16+t1*8], m0
+ mova [rsp+224*16+t2*8], m0
+ mova [rsp+224*16+t3*8], m0
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ ; actual first pass after skipping all-zero data
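+ ; each iteration transforms a 4-column slice of the top-left 32x32
+ ; coefficients (64-point transforms carry at most 32 nonzero rows): the
+ ; 32-point idct is split into an 8-point even part, a 16-point odd half
+ ; and the two halves of the 32-point odd part, with rect2 prescaling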
+ mov [rsp+gprsize*0+(64*4+32)*16], eobd
+ mov r3, rsp
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 6, 0
+ mov r2, [rsp+gprsize*3+(64*4+32)*16]
+ mov [rsp+gprsize*3+(64*4+32)*16], r6
+%endif
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128* 7+r5*8]
+ mova m2, [cq+128* 9+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ mova m4, [cq+128*17+r5*8]
+ mova m5, [cq+128*23+r5*8]
+ mova m6, [cq+128*25+r5*8]
+ mova m7, [cq+128*31+r5*8]
+ mov r3, rsp
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128* 5+r5*8]
+ mova m2, [cq+128*11+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ mova m4, [cq+128*19+r5*8]
+ mova m5, [cq+128*21+r5*8]
+ mova m6, [cq+128*27+r5*8]
+ mova m7, [cq+128*29+r5*8]
+%if ARCH_X86_32
+ add r3, 16*8
+%endif
+ call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+ sub r3, 16*8
+%endif
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+ add r3, 16*(16+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128* 6+r5*8]
+ mova m2, [cq+128*10+r5*8]
+ mova m3, [cq+128*14+r5*8]
+ mova m4, [cq+128*18+r5*8]
+ mova m5, [cq+128*22+r5*8]
+ mova m6, [cq+128*26+r5*8]
+ mova m7, [cq+128*30+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_16x4_internal_16bpc).main_oddhalf
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 4+r5*8]
+ mova m2, [cq+128* 8+r5*8]
+ mova m3, [cq+128*12+r5*8]
+ mova m4, [cq+128*16+r5*8]
+ mova m5, [cq+128*20+r5*8]
+ mova m6, [cq+128*24+r5*8]
+ mova m7, [cq+128*28+r5*8]
+ call m(idct_8x4_internal_16bpc).rect2_mul
+ call m(idct_8x4_internal_16bpc).main_pass1
+ call m(idct_8x4_internal_16bpc).round
+ sub r3, 16*(16+4*ARCH_X86_32)
+ call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32
+
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+%if ARCH_X86_64
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+160*16+t0*8], m8
+ mova [rsp+160*16+t1*8], m9
+ mova [rsp+160*16+t2*8], m10
+ mova [rsp+160*16+t3*8], m11
+ mova m8, [r3+16* 9] ; 8 9
+ mova m10, [r3+16*11] ; 10 11
+ mova m12, [r3+16*13] ; 12 13
+ mova m14, [r3+16*15] ; 14 15
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+ 96*16+t0*8], m8
+ mova [rsp+ 96*16+t1*8], m9
+ mova [rsp+ 96*16+t2*8], m10
+ mova [rsp+ 96*16+t3*8], m11
+ mova m8, [r3+16* 8] ; 24 25
+ mova m10, [r3+16*10] ; 26 27
+ mova m12, [r3+16*12] ; 28 29
+ mova m14, [r3+16*14] ; 30 31
+ call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+ mova [rsp+224*16+t0*8], m8
+ mova [rsp+224*16+t1*8], m9
+ mova [rsp+224*16+t2*8], m10
+ mova [rsp+224*16+t3*8], m11
+%else
+ sub r3, 8*16
+ mova m0, [r3+ 8*16]
+ mova m2, [r3+10*16]
+ mova m4, [r3+12*16]
+ mova m6, [r3+14*16]
+ packssdw m0, [r3+ 9*16]
+ packssdw m2, [r3+11*16]
+ packssdw m4, [r3+13*16]
+ packssdw m6, [r3+15*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+ 96*16+t0*8], m0
+ mova [rsp+ 96*16+t1*8], m1
+ mova [rsp+ 96*16+t2*8], m2
+ mova [rsp+ 96*16+t3*8], m3
+ mova m0, [r3+16*16]
+ mova m2, [r3+18*16]
+ mova m4, [r3+20*16]
+ mova m6, [r3+22*16]
+ packssdw m0, [r3+17*16]
+ packssdw m2, [r3+19*16]
+ packssdw m4, [r3+21*16]
+ packssdw m6, [r3+23*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+160*16+t0*8], m0
+ mova [rsp+160*16+t1*8], m1
+ mova [rsp+160*16+t2*8], m2
+ mova [rsp+160*16+t3*8], m3
+ mova m0, [r3+31*16]
+ mova m2, [r3+29*16]
+ mova m4, [r3+27*16]
+ mova m6, [r3+25*16]
+ packssdw m0, [r3+30*16]
+ packssdw m2, [r3+28*16]
+ packssdw m4, [r3+26*16]
+ packssdw m6, [r3+24*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [rsp+224*16+t0*8], m0
+ mova [rsp+224*16+t1*8], m1
+ mova [rsp+224*16+t2*8], m2
+ mova [rsp+224*16+t3*8], m3
+ mova m0, [r3+ 0*16]
+ mova m2, [r3+ 2*16]
+ mova m4, [r3+ 4*16]
+ mova m6, [r3+ 6*16]
+ packssdw m0, [r3+ 1*16]
+ packssdw m2, [r3+ 3*16]
+ packssdw m4, [r3+ 5*16]
+ packssdw m6, [r3+ 7*16]
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+ mova [rsp+ 32*16+t0*8], m0
+ mova [rsp+ 32*16+t1*8], m1
+ mova [rsp+ 32*16+t2*8], m2
+ mova [rsp+ 32*16+t3*8], m3
+ pxor m0, m0
+ REPX {mova [cq+x*128+r5*8], m0}, 0, 1, 2, 3, 4, 5, 6, 7, \
+ 8, 9, 10, 11, 12, 13, 14, 15, \
+ 16, 17, 18, 19, 20, 21, 22, 23, \
+ 24, 25, 26, 27, 28, 29, 30, 31
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*3+(64*4+32)*16]
+%endif
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2
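+ ; eob selects how aggressively pruned the 8bpc column kernels can be:
+ ; below 136 the sparser main_veryfast/main_fast pair suffices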
+ mov eobd, [rsp+gprsize*0+(64*4+32)*16]
+ cmp eobd, 136
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+ add rsp, 29*16
+
+%if ARCH_X86_64
+ lea r2, [dstq+64]
+ mov r7, -8
+%else
+ lea r2, [rsp+(64*4+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 4
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+ ; remove entirely-zero iterations
+%undef cmp
+ mov r5d, 8
+.zero_loop:
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x16_2d)+r5]
+ jl .zero_loop
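+ ; no zero-fill stores are needed in this variant: the loop only locates
+ ; the first slice the eob can reach (pass-1 output is written straight
+ ; back over cq, where skipped slices presumably already read as zero)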
+
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+64* 1+r5*8]
+ mova m1, [cq+64*31+r5*8]
+ mova m2, [cq+64*17+r5*8]
+ mova m3, [cq+64*15+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 7+r5*8]
+ mova m1, [cq+64*25+r5*8]
+ mova m2, [cq+64*23+r5*8]
+ mova m3, [cq+64* 9+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 5+r5*8]
+ mova m1, [cq+64*27+r5*8]
+ mova m2, [cq+64*21+r5*8]
+ mova m3, [cq+64*11+r5*8]
+ call .main_part1
+ mova m0, [cq+64* 3+r5*8]
+ mova m1, [cq+64*29+r5*8]
+ mova m2, [cq+64*19+r5*8]
+ mova m3, [cq+64*13+r5*8]
+ call .main_part1
+ call .main_part2
+
+ mova m0, [cq+64* 2+r5*8]
+ mova m1, [cq+64*14+r5*8]
+ mova m2, [cq+64*18+r5*8]
+ mova m3, [cq+64*30+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+64* 6+r5*8]
+ mova m1, [cq+64*10+r5*8]
+ mova m2, [cq+64*22+r5*8]
+ mova m3, [cq+64*26+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+64* 4+r5*8]
+ mova m1, [cq+64*12+r5*8]
+ mova m2, [cq+64*20+r5*8]
+ mova m3, [cq+64*28+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+64* 0+r5*8]
+ mova m1, [cq+64* 8+r5*8]
+ mova m2, [cq+64*16+r5*8]
+ mova m3, [cq+64*24+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 10 ; pd_2
+%else
+ mova m7, [o(pd_2)]
+%endif
+ call .main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ lea r4, [cq+r5*8+64*28]
+ call .shift_transpose
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2: entered via call rather than fall-through, so that the stack
+ ; pointer includes the return-address slot the 8-bit code expects
+ call .pass2
+ RET
+
+.pass2:
+%if ARCH_X86_64
+ mova m8, [o(pw_2048)]
+ pxor m9, m9
+ mova m10, [o(pixel_10bpc_max)]
+%if WIN64
+ mov [rsp+16*16+gprsize], r7
+%endif
+ mov r7, dstq
+%else
+ mov [rsp+2*gprsize+16*16], dstq
+%endif
+ lea r3, [strideq*3]
+ mov r4d, 8
+ jmp m(idct_16x16_internal_16bpc).loop_pass2
+
+.main_part1: ; idct64 steps 1-5
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
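+ ; each call loads one such 4-row group, scales it by coefficients from
+ ; idct64_mul_16bpc (r4 advances 12 dwords per call) and leaves eight of
+ ; the t32..t63 intermediates at [r3], advancing r3 by 8*16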
+%if ARCH_X86_64
+ movd m7, [r4+4*0]
+ movd m8, [r4+4*1]
+ movd m6, [r4+4*2]
+ movd m9, [r4+4*3]
+ movd m5, [r4+4*4]
+ movd m10, [r4+4*5]
+ movd m4, [r4+4*6]
+ movd m15, [r4+4*7]
+ REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15
+ pmulld m7, m0 ; t63a
+ pmulld m0, m8 ; t32a
+ pmulld m6, m1 ; t62a
+ pmulld m1, m9 ; t33a
+ pmulld m5, m2 ; t61a
+ pmulld m2, m10 ; t34a
+ pmulld m4, m3 ; t60a
+ pmulld m3, m15 ; t35a
+ movd m10, [r4+4*8]
+ movd m15, [r4+4*9]
+ REPX {pshufd x, x, q0000}, m10, m15
+ REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
+ REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
+ psubd m8, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m5 ; t61
+ paddd m4, m5 ; t60
+ REPX {pmaxsd x, m12}, m8, m1, m6, m2
+ REPX {pminsd x, m13}, m8, m1, m6, m2
+ ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
+ ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
+ REPX {pmaxsd x, m12}, m0, m3, m7, m4
+ REPX {pminsd x, m13}, m0, m3, m7, m4
+ movd m10, [r4+4*10]
+ movd m15, [r4+4*11]
+ REPX {pshufd x, x, q0000}, m10, m15
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m8, m2 ; t61
+ paddd m8, m2 ; t62
+ REPX {pmaxsd x, m12}, m5, m3, m4, m6
+ REPX {pminsd x, m13}, m5, m3, m4, m6
+ ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
+ ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
+ REPX {pmaxsd x, m12}, m0, m7, m1, m8
+ REPX {pminsd x, m13}, m0, m7, m1, m8
+ add r4, 4*12
+ mova [r3+16*0], m0
+ mova [r3+16*7], m7
+ mova [r3+16*1], m1
+ mova [r3+16*6], m8
+ mova [r3+16*2], m6
+ mova [r3+16*5], m4
+ mova [r3+16*3], m3
+ mova [r3+16*4], m5
+%else
+ movd m7, [r4+4*0]
+ movd m6, [r4+4*2]
+ movd m5, [r4+4*4]
+ movd m4, [r4+4*6]
+ REPX {pshufd x, x, q0000}, m7, m6, m5, m4
+ pmulld m7, m0 ; t63a
+ pmulld m6, m1 ; t62a
+ pmulld m5, m2 ; t61a
+ pmulld m4, m3 ; t60a
+ mova [r3+0*16], m6
+ mova [r3+1*16], m7
+ movd m6, [r4+4*1]
+ movd m7, [r4+4*3]
+ REPX {pshufd x, x, q0000}, m7, m6
+ pmulld m0, m6 ; t32a
+ pmulld m1, m7 ; t33a
+ movd m6, [r4+4*5]
+ movd m7, [r4+4*7]
+ REPX {pshufd x, x, q0000}, m7, m6
+ pmulld m2, m6 ; t34a
+ pmulld m3, m7 ; t35a
+ mova m6, [r3+0*16]
+ mova m7, [o(pd_2048)]
+ REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+ paddd m7, [r3+1*16]
+ REPX {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4
+ mova [r3+0*16], m5
+ psubd m5, m0, m1 ; t33
+ paddd m0, m1 ; t32
+ mova [r3+1*16], m0
+ mova m0, [r3+0*16]
+ psubd m1, m7, m6 ; t62
+ paddd m7, m6 ; t63
+ psubd m6, m3, m2 ; t34
+ paddd m3, m2 ; t35
+ psubd m2, m4, m0 ; t61
+ paddd m4, m0 ; t60
+ mova m0, [o(clip_18b_min)]
+ REPX {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4
+ pmaxsd m0, [r3+1*16]
+ mova [r3+0*16], m0
+ mova m0, [o(clip_18b_max)]
+ REPX {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4
+ pminsd m0, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m3
+ mova [r3+2*16], m4
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ movd m3, [r4+4*8]
+ movd m4, [r4+4*9]
+ REPX {pshufd x, x, q0000}, m3, m4
+ mova [r3+4*16], m2
+ ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a
+ mova m2, [r3+4*16]
+ mova [r3+4*16], m5
+ ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a
+ mova m0, [r3+0*16]
+ mova m3, [r3+1*16]
+ mova m4, [r3+2*16]
+ mova m7, [r3+3*16]
+ psubd m5, m0, m3 ; t35a
+ paddd m0, m3 ; t32a
+ mova [r3+0*16], m5
+ mova m5, [r3+4*16]
+ psubd m3, m7, m4 ; t60a
+ paddd m7, m4 ; t63a
+ psubd m4, m1, m6 ; t34
+ paddd m1, m6 ; t33
+ psubd m6, m5, m2 ; t61
+ paddd m2, m5 ; t62
+ mova m5, [o(clip_18b_min)]
+ REPX {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2
+ pmaxsd m5, [r3+0*16]
+ mova [r3+0*16], m5
+ mova m5, [o(clip_18b_max)]
+ REPX {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2
+ pminsd m5, [r3+0*16]
+ mova [r3+16*0], m0
+ mova [r3+16*7], m7
+ mova [r3+16*1], m1
+ mova [r3+16*6], m2
+ mova [r3+16*2], m4
+ mova m7, [o(pd_2048)]
+ movd m0, [r4+4*10]
+ movd m1, [r4+4*11]
+ REPX {pshufd x, x, q0000}, m0, m1
+ ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60
+ mova [r3+16*3], m3
+ mova [r3+16*4], m5
+ mova m4, [r3+2*16]
+ ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a
+ add r4, 4*12
+ mova [r3+16*2], m6
+ mova [r3+16*5], m4
+%endif
+ add r3, 16*8
+ ret
+
+.main_part2: ; idct64 steps 6-9
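+ ; r3 walks up and r4 walks down over the t32..t63 buffer, so each
+ ; iteration combines a mirrored pair of quartets via butterflies and
+ ; the 1567/3784 and 2896 rotations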
+ lea r4, [r3+16*7]
+%if ARCH_X86_64
+ mova m10, [o(pd_1567)]
+ mova m15, [o(pd_3784)]
+.main_part2_loop:
+ mova m0, [r3-16*32] ; t32a
+ mova m1, [r4-16*24] ; t39a
+ mova m2, [r4-16*32] ; t63a
+ mova m3, [r3-16*24] ; t56a
+ mova m4, [r3-16*16] ; t40a
+ mova m5, [r4-16* 8] ; t47a
+ mova m6, [r4-16*16] ; t55a
+ mova m7, [r3-16* 8] ; t48a
+ psubd m8, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ REPX {pmaxsd x, m12}, m8, m1, m3, m4
+ REPX {pminsd x, m13}, m8, m1, m3, m4
+ ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
+ REPX {pmaxsd x, m12}, m0, m2, m5, m7
+ REPX {pminsd x, m13}, m0, m5, m2, m7
+ psubd m6, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m8, m4 ; t55
+ paddd m8, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ REPX {pmaxsd x, m12}, m6, m7, m5, m4
+ REPX {pminsd x, m13}, m6, m7, m5, m4
+ REPX {pmulld x, m14}, m6, m7, m5, m4
+ REPX {pmaxsd x, m12}, m2, m0, m8, m1
+ REPX {pminsd x, m13}, m2, m0, m8, m1
+ paddd m6, m11
+ paddd m5, m11
+ psubd m3, m6, m7 ; t47
+ paddd m6, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m3, m6, m7, m5
+ mova [r4-16* 8], m2
+ mova [r3-16*32], m0
+ mova [r3-16* 8], m8
+ mova [r4-16*32], m1
+ mova [r4-16*24], m3
+ mova [r3-16*16], m6
+ mova [r3-16*24], m7
+ mova [r4-16*16], m5
+%else
+.main_part2_loop:
+ mova m0, [r3-16*32] ; t32a
+ mova m1, [r4-16*24] ; t39a
+ mova m2, [r4-16*32] ; t63a
+ mova m3, [r3-16*24] ; t56a
+ mova m4, [r3-16*16] ; t40a
+ mova m5, [r4-16* 8] ; t47a
+ mova m6, [r4-16*16] ; t55a
+ psubd m7, m0, m1 ; t39
+ paddd m0, m1 ; t32
+ mova [r3+0*16], m7
+ mova m7, [r3-16* 8] ; t48a
+ psubd m1, m2, m3 ; t56
+ paddd m2, m3 ; t63
+ psubd m3, m5, m4 ; t40
+ paddd m5, m4 ; t47
+ psubd m4, m7, m6 ; t55
+ paddd m7, m6 ; t48
+ mova m6, [o(clip_18b_min)]
+ REPX {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7
+ pmaxsd m6, [r3+0*16]
+ mova [r3+0*16], m6
+ mova m6, [o(clip_18b_max)]
+ REPX {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7
+ pminsd m6, [r3+0*16]
+ mova [r3+0*16], m0
+ mova [r3+1*16], m2
+ mova [r3+2*16], m5
+ mova [r3+3*16], m7
+ mova m0, [o(pd_2048)]
+ ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a
+ ITX_MULSUB_2D 4, 3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a
+ mova m2, [r3+1*16]
+ mova m7, [r3+3*16]
+ psubd m5, m2, m7 ; t48a
+ paddd m2, m7 ; t63a
+ mova [r3+1*16], m5
+ mova m0, [r3+0*16]
+ mova m5, [r3+2*16]
+ psubd m7, m0, m5 ; t47a
+ paddd m0, m5 ; t32a
+ psubd m5, m6, m4 ; t55
+ paddd m6, m4 ; t56
+ psubd m4, m1, m3 ; t40
+ paddd m1, m3 ; t39
+ mova m3, [o(clip_18b_min)]
+ REPX {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1
+ pmaxsd m3, [r3+1*16]
+ mova [r3+0*16], m3
+ mova m3, [o(clip_18b_max)]
+ REPX {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1
+ pminsd m3, [r3+0*16]
+ mova [r4-16* 8], m2
+ mova [r3-16*32], m0
+ mova [r3-16* 8], m6
+ mova [r4-16*32], m1
+ mova m0, [o(pd_2896)]
+ mova m1, [o(pd_2048)]
+ REPX {pmulld x, m0}, m3, m7, m5, m4
+ REPX {paddd x, m1}, m3, m5
+ psubd m6, m3, m7 ; t47
+ paddd m3, m7 ; t48
+ psubd m7, m5, m4 ; t40a
+ paddd m5, m4 ; t55a
+ REPX {psrad x, 12}, m6, m3, m7, m5
+ mova [r4-16*24], m6
+ mova [r3-16*16], m3
+ mova [r3-16*24], m7
+ mova [r4-16*16], m5
+%endif
+ add r3, 16
+ sub r4, 16
+ cmp r3, r4
+ jl .main_part2_loop
+ sub r3, 4*16
+ ret
+
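+ ; final recombination of the idct64 first pass: fold the idct8, idct16,
+ ; idct32 and idct64 partial results together with butterflies, adding
+ ; the caller-supplied rounding bias (pd_2 here, pd_1 for the 64x32
+ ; variant) and leaving the sums unshifted for .shift_transpose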
+.main_end_loop:
+ mova m0, [r3+16*28] ; idct8 0 + n
+.main_end_loop_start:
+ mova m2, [r3+16*12] ; idct32 16 + n
+ mova m3, [r4+16*12] ; idct32 31 - n
+%if ARCH_X86_64
+ mova m1, [r4+16*28] ; idct16 15 - n
+ mova m4, [r4-16* 4] ; idct64 63 - n
+ mova m5, [r3-16* 4] ; idct64 48 + n
+ mova m6, [r4-16*20] ; idct64 47 - n
+ mova m7, [r3-16*20] ; idct64 32 + n
+ pmaxsd m0, m12
+ pminsd m0, m13
+ paddd m8, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m12}, m8, m0
+ REPX {pminsd x, m13}, m8, m0
+ paddd m1, m8, m3 ; idct32 out0 + n
+ psubd m8, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m12}, m1, m8, m3, m0
+ REPX {pminsd x, m13}, m1, m3, m8, m0
+ REPX {paddd x, m15}, m1, m3, m0, m8
+ paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
+ psubd m1, m4 ; idct64 out63 - n (unshifted)
+ paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
+ psubd m3, m5 ; idct64 out48 + n (unshifted)
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
+ psubd m8, m7 ; idct64 out32 + n (unshifted)
+ mova [r3-16*20], m2
+ mova [r4+16*28], m1
+ mova [r4-16*20], m4
+ mova [r3+16*28], m3
+ mova [r3-16* 4], m5
+ mova [r4+16*12], m0
+ mova [r4-16* 4], m6
+ mova [r3+16*12], m8
+%else
+ mova m5, [o(clip_18b_min)]
+ mova m6, [o(clip_18b_max)]
+ mova m1, [r3+16*44] ; idct16 15 - n
+ pmaxsd m0, m5
+ pminsd m0, m6
+ paddd m4, m0, m1 ; idct16 out0 + n
+ psubd m0, m1 ; idct16 out15 - n
+ REPX {pmaxsd x, m5}, m4, m0
+ REPX {pminsd x, m6}, m4, m0
+ paddd m1, m4, m3 ; idct32 out0 + n
+ psubd m4, m3 ; idct32 out31 - n
+ paddd m3, m0, m2 ; idct32 out15 - n
+ psubd m0, m2 ; idct32 out16 + n
+ REPX {pmaxsd x, m5}, m1, m4, m3, m0
+ REPX {pminsd x, m6}, m1, m3, m4, m0
+ REPX {paddd x, m7}, m1, m3, m0, m4
+ mova m5, [r4-16* 4] ; idct64 63 - n
+ mova m6, [r3-16* 4] ; idct64 48 + n
+ paddd m2, m1, m5 ; idct64 out0 + n (unshifted)
+ psubd m1, m5 ; idct64 out63 - n (unshifted)
+ paddd m5, m3, m6 ; idct64 out15 - n (unshifted)
+ psubd m3, m6 ; idct64 out48 + n (unshifted)
+ mova [r4+16*28], m1
+ mova [r3+16*28], m3
+ mova m6, [r4-16*20] ; idct64 47 - n
+ mova m1, [r3-16*20] ; idct64 32 + n
+ mova [r3-16*20], m2
+ mova [r4-16*20], m5
+ paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+ psubd m0, m6 ; idct64 out47 - n (unshifted)
+ paddd m6, m4, m1 ; idct64 out31 - n (unshifted)
+ psubd m4, m1 ; idct64 out32 + n (unshifted)
+ mova [r3-16* 4], m5
+ mova [r4+16*12], m0
+ mova [r4-16* 4], m6
+ mova [r3+16*12], m4
+%endif
+ sub r4, 16
+ add r3, 16
+ cmp r3, r4
+ jl .main_end_loop
+ ret
+
+.shift_transpose:
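+ ; shift the unshifted sums down (>>2 here), pack 32->16 bit, transpose
+ ; in 4x8 blocks and store at a 64-byte stride back over cq, which then
+ ; serves as the pass-2 input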
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [r4+0*64], m0
+ mova [r4+1*64], m1
+ mova [r4+2*64], m2
+ mova [r4+3*64], m3
+ sub r4, 4*64
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 16
+.dconly1:
+ add r5d, 640
+ sar r5d, 10
+.dconly2:
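+ ; broadcast the rounded DC word (high word of r5*2896+34816) to all
+ ; lanes, then add it to the destination with clipping to
+ ; [0, pixel_10bpc_max]; btc toggles bit 16 so each 64-pixel row is
+ ; handled as two 32-pixel halves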
+ imul r5d, 2896
+ add r5d, 34816
+ movd m0, r5d
+ pshuflw m0, m0, q1111
+ punpcklqdq m0, m0
+ mova m6, [o(pixel_10bpc_max)]
+ pxor m5, m5
+.dconly_loop:
+ paddw m1, m0, [dstq+16*0]
+ paddw m2, m0, [dstq+16*1]
+ paddw m3, m0, [dstq+16*2]
+ paddw m4, m0, [dstq+16*3]
+ REPX {pmaxsw x, m5}, m1, m2, m3, m4
+ REPX {pminsw x, m6}, m1, m2, m3, m4
+ mova [dstq+16*0], m1
+ mova [dstq+16*1], m2
+ mova [dstq+16*2], m3
+ mova [dstq+16*3], m4
+ add dstq, 64
+ btc r3d, 16
+ jnc .dconly_loop
+ lea dstq, [dstq+strideq-128]
+ dec r3d
+ jg .dconly_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
+ 0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 0, 4, 1
+ mov [rsp+(8*32+64+8)*16+1*gprsize], dstq
+ mov [rsp+(8*32+64+8)*16+2*gprsize], strideq
+%else
+ DECLARE_REG_TMP 4, 7, 8
+%if WIN64
+ mov [rsp+(8*32+64+1)*16+1*gprsize], r7
+ mov [rsp+64*16+0*gprsize], r8
+%endif
+%endif
+%undef cmp
+ ; remove entirely-zero iterations
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ lea t2, [rsp+7*32*16]
+.zero_loop_inner:
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
+ mova [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0
+ mova [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0
+ sub t2, 32*16
+ cmp t2, rsp
+ jge .zero_loop_inner
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ mov [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128*31+r5*8]
+ mova m2, [cq+128*17+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 7+r5*8]
+ mova m1, [cq+128*25+r5*8]
+ mova m2, [cq+128*23+r5*8]
+ mova m3, [cq+128* 9+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 5+r5*8]
+ mova m1, [cq+128*27+r5*8]
+ mova m2, [cq+128*21+r5*8]
+ mova m3, [cq+128*11+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128*29+r5*8]
+ mova m2, [cq+128*19+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128*14+r5*8]
+ mova m2, [cq+128*18+r5*8]
+ mova m3, [cq+128*30+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+128* 6+r5*8]
+ mova m1, [cq+128*10+r5*8]
+ mova m2, [cq+128*22+r5*8]
+ mova m3, [cq+128*26+r5*8]
+ call .rect2_mul_fast
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 4+r5*8]
+ mova m1, [cq+128*12+r5*8]
+ mova m2, [cq+128*20+r5*8]
+ mova m3, [cq+128*28+r5*8]
+ call .rect2_mul_fast
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 8+r5*8]
+ mova m2, [cq+128*16+r5*8]
+ mova m3, [cq+128*24+r5*8]
+ call .rect2_mul_fast
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 11 ; pd_1
+%else
+ mova m7, [o(pd_1)]
+%endif
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ lea t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16]
+ movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+ movzx t1d, t0b
+ shr t0d, 8
+ call .shift_transpose
+ ; zero cq
+ pxor m7, m7
+ lea r4, [cq+30*128+r5*8]
+.zero_cq_loop:
+ REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
+ sub r4, 4*128
+ cmp r4, cq
+ jg .zero_cq_loop
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
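+ ; select the 8bpc idct_8x32 kernel by coefficient density:
+ ; eob < 36 -> main_veryfast, eob < 136 -> main_fast, else main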
+ mov eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16]
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize*2+(8*32+64+8)*16]
+%elif WIN64
+ mov r8, [rsp+gprsize*0+64*16]
+%endif
+ add rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16
+ cmp eobd, 36
+ jl .load_veryfast
+ cmp eobd, 136
+ jl .load_fast
+ ; load normal
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+ jmp .run
+.load_fast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ jmp .run
+.load_veryfast:
+ lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ ; fall-through
+.run:
+%if ARCH_X86_64
+ lea r2, [dstq+128]
+ mov r7, -16
+%else
+ lea r2, [rsp+(8*32+3)*16]
+ mov dword [r2+0*gprsize], 8
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+
+.rect2_mul_fast:
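+ ; rectangular (2:1) transforms prescale inputs by 2896/4096 ~= 1/sqrt(2):
+ ; x = (x * 2896 + 2048) >> 12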
+%if ARCH_X86_64
+ REPX {pmulld x, m14}, m0, m1, m2, m3
+ REPX {paddd x, m11}, m0, m1, m2, m3
+%else
+ mova m4, [o(pd_2896)]
+ mova m5, [o(pd_2048)]
+ REPX {pmulld x, m4 }, m0, m1, m2, m3
+ REPX {paddd x, m5 }, m0, m1, m2, m3
+%endif
+ REPX {psrad x, 12 }, m0, m1, m2, m3
+ ret
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [t2+0*16+r5*8], m0
+ mova [t2+8*16+r5*8], m2
+ mova [t2+0*16+t0*8], m3
+ mova [t2+0*16+t1*8], m1
+ sub t2, 16*32
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 32
+ add r5d, 128
+ sar r5d, 8
+ imul r5d, 181
+ add r5d, 384
+ sar r5d, 9
+ add rsp, (1+8*32+1*WIN64)*16
+ jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
+
+cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
+ 0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \
+ dst, stride, c, eob
+ LEA r6, base
+ test eobd, eobd
+ jz .dconly
+
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0, 6
+ mov [rsp+gprsize*1+(64*9+8)*16], r0
+ mov [rsp+gprsize*2+(64*9+8)*16], r1
+ mov [rsp+gprsize*3+(64*9+8)*16], r2
+ mov [rsp+gprsize*4+(64*9+8)*16], r6
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7, 0
+ mov [rsp+gprsize*1+(64*9+1)*16], r9
+ mov [rsp+gprsize*0+64*16], r0
+%if WIN64
+ mov [rsp+gprsize*2+(64*9+1)*16], r7
+ mov [rsp+gprsize*3+(64*9+1)*16], r8
+%endif
+%endif
+%undef cmp
+
+ ; remove entirely-zero iterations
+ mov r5d, 14
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jge .end_zero_loop
+ pxor m0, m0
+.zero_loop:
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ lea t4, [rsp+7*64*16]
+.zero_loop_inner:
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0
+ mova [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0
+ sub t4, 64*16
+ cmp t4, rsp
+ jge .zero_loop_inner
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*4+(64*9+8)*16]
+%endif
+ sub r5d, 2
+ cmp eobw, word [o2(tbl_32x32_2d)+r5]
+ jl .zero_loop
+.end_zero_loop:
+ mov [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd
+%if ARCH_X86_32
+ mov cq, [rsp+gprsize*3+(64*9+8)*16]
+%endif
+ ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+ mova m11, [o(pd_2048)]
+ mova m12, [o(clip_18b_min)]
+ mova m13, [o(clip_18b_max)]
+ mova m14, [o(pd_2896)]
+%endif
+
+ mov r3, rsp
+ lea r4, [o(idct64_mul_16bpc)]
+ mova m0, [cq+128* 1+r5*8]
+ mova m1, [cq+128*31+r5*8]
+ mova m2, [cq+128*17+r5*8]
+ mova m3, [cq+128*15+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 7+r5*8]
+ mova m1, [cq+128*25+r5*8]
+ mova m2, [cq+128*23+r5*8]
+ mova m3, [cq+128* 9+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 5+r5*8]
+ mova m1, [cq+128*27+r5*8]
+ mova m2, [cq+128*21+r5*8]
+ mova m3, [cq+128*11+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ mova m0, [cq+128* 3+r5*8]
+ mova m1, [cq+128*29+r5*8]
+ mova m2, [cq+128*19+r5*8]
+ mova m3, [cq+128*13+r5*8]
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
+
+ mova m0, [cq+128* 2+r5*8]
+ mova m1, [cq+128*14+r5*8]
+ mova m2, [cq+128*18+r5*8]
+ mova m3, [cq+128*30+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+ mova m0, [cq+128* 6+r5*8]
+ mova m1, [cq+128*10+r5*8]
+ mova m2, [cq+128*22+r5*8]
+ mova m3, [cq+128*26+r5*8]
+ call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+ add r3, 16*(24+4*ARCH_X86_32)
+
+ mova m0, [cq+128* 4+r5*8]
+ mova m1, [cq+128*12+r5*8]
+ mova m2, [cq+128*20+r5*8]
+ mova m3, [cq+128*28+r5*8]
+ call m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+ mova m0, [cq+128* 0+r5*8]
+ mova m1, [cq+128* 8+r5*8]
+ mova m2, [cq+128*16+r5*8]
+ mova m3, [cq+128*24+r5*8]
+ call m(idct_8x4_internal_16bpc).main_pass1_fast
+ call m(idct_8x4_internal_16bpc).round
+ mova [r3-(7+4*ARCH_X86_32)*16], m1
+ mova [r3-(6+4*ARCH_X86_32)*16], m2
+ mova [r3-(5+4*ARCH_X86_32)*16], m3
+ mova [r3-(4+4*ARCH_X86_32)*16], m4
+ mova [r3-(3+4*ARCH_X86_32)*16], m5
+ mova [r3-(2+4*ARCH_X86_32)*16], m6
+ mova [r3-(1+4*ARCH_X86_32)*16], m7
+ sub r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+ psrld m15, m11, 10 ; pd_2
+%else
+ mova m7, [o(pd_2)]
+%endif
+ call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
+
+ lea r3, [rsp+56*16]
+ movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+ movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+ movzx t0d, t1b
+ movzx t2d, t3b
+ shr t1d, 8
+ shr t3d, 8
+ lea t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
+ call .shift_transpose
+ ; zero cq
+ pxor m7, m7
+%if ARCH_X86_32
+ mov cq, [rsp+gprsize*3+(64*9+8)*16]
+%endif
+ lea r4, [cq+30*128+r5*8]
+.zero_cq_loop:
+ REPX {mova [r4+x*128], m7}, -2, -1, 0, 1
+ sub r4, 4*128
+ cmp r4, cq
+ jg .zero_cq_loop
+%if ARCH_X86_32
+ mov r6, [rsp+gprsize*4+(64*9+8)*16]
+%endif
+ sub r5d, 2
+ jge .loop_pass1
+
+ ; pass=2 code starts here
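+ ; eob >= 151 selects the main_fast/main kernel pair below; sparser
+ ; blocks use main_veryfast/main_fast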
+ mov eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
+%if ARCH_X86_32
+ mov strideq, [rsp+gprsize*2+(9*64+8)*16]
+%else
+ mov r0, [rsp+gprsize*0+64*16]
+%endif
+ add rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16
+ cmp eobd, 151
+ jl .fast
+ ; fall-through
+%if ARCH_X86_64
+ DECLARE_REG_TMP 8, 9
+%else
+ DECLARE_REG_TMP 1, 5
+%endif
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+ jmp .run
+.fast:
+ lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+ lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+
+%if ARCH_X86_64
+ lea r2, [dstq+128]
+ mov r7, -16
+%else
+ lea r2, [rsp+(64*8+3)*16]
+ mov [r2+4*gprsize], t0
+ mov [r2+5*gprsize], t1
+ mov r1, [r2+2*gprsize]
+ mov dword [r2+0*gprsize], 8
+%endif
+ jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
+
+ ; copy of pass=1 tmp-regs
+%if ARCH_X86_32
+ DECLARE_REG_TMP 4, 1, 2, 0, 6
+%else
+ DECLARE_REG_TMP 8, 9, 4, 7, 0
+%endif
+
+.shift_transpose:
+ mova m0, [r3+0*16]
+ mova m1, [r3+1*16]
+ mova m2, [r3+2*16]
+ mova m3, [r3+3*16]
+ mova m4, [r3+4*16]
+ mova m5, [r3+5*16]
+ mova m6, [r3+6*16]
+ mova m7, [r3+7*16]
+ REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ packssdw m4, m5
+ packssdw m6, m7
+ call m(idct_8x4_internal_16bpc).transpose4x8packed
+ mova [t4+t0*8], m0
+ mova [t4+t1*8], m1
+ mova [t4+t2*8], m2
+ mova [t4+t3*8], m3
+ sub t4, 16*64
+ sub r3, 8*16
+ cmp r3, rsp
+ jg .shift_transpose
+ ret
+
+.dconly:
+ imul r5d, [cq], 181
+ mov [cq], eobd ; 0
+ mov r3d, 64
+ add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
+ (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16
+ jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1