author     Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-07 09:22:09 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>   2024-04-07 09:22:09 +0000
commit     43a97878ce14b72f0981164f87f2e35e14151312 (patch)
tree       620249daf56c0258faa40cbdcf9cfba06de2a846 /third_party/dav1d/src/x86/itx16_sse.asm
parent     Initial commit. (diff)
Adding upstream version 110.0.1.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/dav1d/src/x86/itx16_sse.asm')
-rw-r--r--  third_party/dav1d/src/x86/itx16_sse.asm  8135
1 file changed, 8135 insertions, 0 deletions
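
Note on the arithmetic used throughout the new file: the transform stages work
in 12-bit fixed point, i.e. trig coefficients are scaled by 4096 (see the COEF
table) and every multiply is rounded with a +2048 bias before an arithmetic
right-shift by 12, as the comment above the ITX_MULSUB_2D macro states. Below
is a minimal scalar model of that butterfly, for orientation only; it is not
part of the patch and the helper name and sample values are illustrative.

    /* dst1 = (src1*coef1 - src2*coef2 + 2048) >> 12
     * dst2 = (src1*coef2 + src2*coef1 + 2048) >> 12
     * 32-bit products cannot overflow here because the SIMD code clips
     * inputs to 18 bits (clip_18b_min/clip_18b_max) and the butterfly
     * coefficients stay below 4096. */
    #include <stdint.h>
    #include <stdio.h>

    static void mulsub_2d(int32_t src1, int32_t src2,
                          int32_t coef1, int32_t coef2,
                          int32_t *dst1, int32_t *dst2)
    {
        *dst1 = (src1 * coef1 - src2 * coef2 + 2048) >> 12;
        *dst2 = (src1 * coef2 + src2 * coef1 + 2048) >> 12;
    }

    int main(void)
    {
        int32_t t0, t1;
        /* 1567/3784 ~= 4096*cos/sin(6*pi/16), the pair IDCT4_1D uses */
        mulsub_2d(1024, 512, 1567, 3784, &t0, &t1);
        printf("%d %d\n", t0, t1);
        return 0;
    }
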
diff --git a/third_party/dav1d/src/x86/itx16_sse.asm b/third_party/dav1d/src/x86/itx16_sse.asm
new file mode 100644
index 0000000000..3833e17c99
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_sse.asm
@@ -0,0 +1,8135 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright © 2017-2021, The rav1e contributors
+; Copyright © 2020, Nathan Egge
+; Copyright © 2021, Matthias Dressel
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+SECTION_RODATA
+%macro COEF 1-2
+pd_%1: times 4 dd %1
+%if %0 == 2
+pd_m%1: times 4 dd -%1
+%endif
+%endmacro
+
+COEF  201
+COEF  401
+COEF  601, 1
+COEF  799
+COEF  995
+COEF 1189, 1
+COEF 1380, 1
+COEF 1567
+COEF 1751
+COEF 1931
+COEF 2106, 1
+COEF 2276, 1
+COEF 2440
+COEF 2598, 1
+COEF 2751, 1
+COEF 2896
+COEF 3035
+COEF 3166
+COEF 3290
+COEF 3406
+COEF 3513
+COEF 3612
+COEF 3703
+COEF 3784
+COEF 3857
+COEF 3920
+COEF 3973
+COEF 4017
+COEF 4052
+COEF 4076
+COEF 4091
+
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+
+%if ARCH_X86_32
+pd_1: times 4 dd 1
+%endif
+pd_2: times 4 dd 2
+pw_5: times 8 dw 5
+pd_1321: times 4 dd 1321
+pd_2482: times 4 dd 2482
+pd_m3344: times 4 dd -3344
+pd_2048: times 4 dd 2048
+pw_4x2048_4xm2048: times 4 dw 2048
+                   times 4 dw -2048
+pw_4xm2048_4x2048: times 4 dw -2048
+                   times 4 dw 2048
+pw_2048: times 8 dw 2048
+pw_m2048: times 8 dw -2048
+pd_3803: times 4 dd 3803
+pw_4096: times 8 dw 4096
+pd_5793: times 4 dd 5793
+pd_6144: times 4 dd 6144
+pw_8192: times 8 dw 8192
+pd_10240: times 4 dd 10240
+pd_11586: times 4 dd 11586
+pw_1697x8: times 8 dw 1697*8
+pw_2896x8: times 8 dw 2896*8
+pw_1697x16: times 8 dw 1697*16
+pw_16384: times 8 dw 16384
+pixel_10bpc_max: times 8 dw 0x03ff
+
+pw_1567_3784: times 4 dw 1567, 3784
+pw_m3784_1567: times 4 dw -3784, 1567
+pw_2896_2896: times 4 dw 2896, 2896
+pw_m2896_2896: times 4 dw -2896, 2896
+
+clip_18b_min: times 4 dd -0x20000
+clip_18b_max: times 4 dd 0x1ffff
+
+idct64_mul_16bpc:
+dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
+dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
+dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
+dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
+
+cextern inv_txfm_add_dct_dct_4x4_8bpc_ssse3
+cextern iadst_4x4_internal_8bpc_ssse3.main
+cextern idct_4x8_internal_8bpc_ssse3.main
+cextern iadst_4x8_internal_8bpc_ssse3.main
+cextern idct_16x4_internal_8bpc_ssse3.main
+cextern iadst_16x4_internal_8bpc_ssse3.main
+cextern iadst_16x4_internal_8bpc_ssse3.main_pass2_end
+cextern idct_8x4_internal_8bpc_ssse3.main
+cextern iadst_8x4_internal_8bpc_ssse3.main
+cextern idct_8x8_internal_8bpc_ssse3.main
+cextern idct_8x8_internal_8bpc_ssse3.pass1_end3
+cextern iadst_8x8_internal_8bpc_ssse3.main
+cextern iadst_8x8_internal_8bpc_ssse3.main_pass2_end
+cextern idct_16x8_internal_8bpc_ssse3.main
+cextern iadst_16x8_internal_8bpc_ssse3.main
+cextern iadst_16x8_internal_8bpc_ssse3.main_pass2_end
+cextern idct_8x32_internal_8bpc_ssse3.main
+cextern idct_8x32_internal_8bpc_ssse3.main_fast
+cextern idct_8x32_internal_8bpc_ssse3.main_veryfast
+cextern idct_16x64_internal_8bpc_ssse3.main
+cextern idct_16x64_internal_8bpc_ssse3.main_fast
+
+tbl_4x16_2d: db 0, 13, 29, 45
+tbl_4x16_h: db 0, 16, 32, 48
+tbl_4x16_v: db 0, 4, 8, 12
+
+tbl_8x16_2d: db 0, 14, 30, 46
+tbl_8x16_v: db 0, 4, 8, 12
+tbl_8x16_h: db 0, 32, 64, 96
+
+tbl_16x16_2d: db 0, 10, 36, 78
+tbl_16x16_v: db 0, 4, 8, 12
+tbl_16x16_h: db 0, 64, 128, 192
+
+tbl_8x32_2d: dw 0, 14, 43, 75, 107, 139, 171, 203
+
+tbl_16x32_2d: dw 0, 14, 44, 90, 151, 215, 279, 343
+
+tbl_32x16_2d: ; first 4 entries of 32x32 are identical to this one
+tbl_32x32_2d: dw 0, 10, 36, 78, 136, 210, 300, 406
+
+tbl_Nx32_odd_offset: db 2*16, 2*23
+                     db 2*20, 2*19
+                     db 2*18, 2*21
+                     db 2*22, 2*17
+                     db 2*30, 2*25
+                     db 2*26, 2*29
+                     db 2*28, 2*27
+                     db 2*24, 2*31
+
+tbl_Nx64_offset: db 2* 0, 2*32, 2*16, 2*46
+                 db 2* 8, 2*40, 2*23, 2*38
+                 db 2* 1, 2*36, 2*20, 2*42
+                 db 2* 9, 2*44, 2*19, 2*34
+                 db 2* 2, 2*60, 2*18, 2*50
+                 db 2*10, 2*52, 2*21, 2*58
+                 db 2* 3, 2*56, 2*22, 2*54
+                 db 2*11, 2*48, 2*17, 2*62
+
+SECTION .text
+
+%define m_suffix(x, sfx) mangle(private_prefix %+ _ %+ x %+ sfx)
+%define m(x) m_suffix(x, SUFFIX)
+
+; This refers to the first function in itx_sse i.e. the start of the text section
+; which is needed as a base pointer for constants.
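+; On 32-bit PIC builds the pass-2 code loads this address into r5 (via
+; "lea r5, [o(itx8_start)]") before calling into the 8bpc SSSE3 routines,
+; so they can locate their own constants relative to it.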
+%define itx8_start m_suffix(inv_txfm_add_dct_dct_4x4_8bpc, _ssse3) + +%if ARCH_X86_64 +%define o(x) x +%else +%define o(x) r6-$$+x ; PIC +%endif + +%macro IWHT4_1D 0 + ; m0 = in0, m1 = in1, m2 = in2, m3 = in3 + paddd m0, m1 ; in0 += in1 + psubd m4, m2, m3 ; tmp0 = in2 - in3 + psubd m5, m0, m4 ; tmp1 = (in0 - tmp0) >> 1 + psrad m5, 1 + psubd m2, m5, m1 ; in2 = tmp1 - in1 + psubd m5, m3 ; in1 = tmp1 - in3 + psubd m0, m5 ; in0 -= in1 + paddd m4, m2 ; in3 = tmp0 + in2 + ; m0 = out0, m1 = in1, m2 = out2, m3 = in3 + ; m4 = out3, m5 = out1 +%endmacro + +INIT_XMM sse2 +cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 3, 6, dst, stride, c, eob, bdmax + mova m0, [cq+16*0] + mova m1, [cq+16*1] + mova m2, [cq+16*2] + mova m3, [cq+16*3] + REPX {psrad x, 2}, m0, m1, m2, m3 + IWHT4_1D + punpckldq m1, m0, m5 + punpckhdq m3, m0, m5 + punpckldq m5, m2, m4 + punpckhdq m2, m4 + punpcklqdq m0, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m4, m3, m2 + punpckhqdq m3, m2 + mova m2, m4 + IWHT4_1D + packssdw m0, m4 ; low: out3, high: out0 + packssdw m2, m5 ; low: out2, high: out1 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + lea r2, [dstq+strideq*2] + movq m1, [dstq+strideq*0] + movhps m1, [r2 +strideq*1] + movq m3, [r2 +strideq*0] + movhps m3, [dstq+strideq*1] + movd m5, bdmaxm + pshuflw m5, m5, q0000 ; broadcast + punpcklqdq m5, m5 ; broadcast + paddsw m0, m1 + paddsw m2, m3 + pmaxsw m0, m4 + pmaxsw m2, m4 + pminsw m0, m5 + pminsw m2, m5 + movhps [r2 +strideq*1], m0 ; write out0 + movhps [dstq+strideq*1], m2 ; write out1 + movq [r2 +strideq*0], m2 ; write out2 + movq [dstq+strideq*0], m0 ; write out3 + RET + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 2 = inv_dst1, 4 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +; %1 dst/src[1] +; %2 dst/src[2] +; %3 tmp[1] +; %4 tmp[2] +; %5 tmp[3] +; %6 rnd +; %7 coef[1] +; %8 coef[2] +; %9 flags +%ifnidn %7,%8 ; optimize when coef1 == coef2 +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else + mova m%3, [o(pd_%8)] + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else + mova m%5, [o(pd_%7)] + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 4 ; invert dst2 + paddd m%4, m%2 + psubd m%2, m%6, m%4 +%else +%ifnum %6 +%ifnidn %7,%8 + paddd m%4, m%6 +%else + paddd m%1, m%6 +%endif +%endif +%ifnidn %7,%8 + paddd m%2, m%4 +%else + mova m%3, m%2 + paddd m%2, m%1 +%endif +%endif +%if %9 & 2 ; invert dst1 + psubd m%3, m%1 + paddd m%1, m%3, m%6 +%else +%ifnum %6 +%ifnidn %7,%8 + paddd m%1, m%6 +%endif +%endif + psubd m%1, m%3 +%endif +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4-5+ 8 ; type1, type2, eob_offset, size, mmsize/stack +cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 7, %5, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_16bpc) +%if ARCH_X86_32 + LEA r6, $$ +%endif +%if has_epilogue +%ifidn %1_%2, dct_dct + test eobd, eobd + jz %%end +%endif + lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] +%ifnum %3 +%if %3 + add eobd, %3 +%endif +%else + lea r5, [o(%3)] +%endif + call %%p1 + RET +%%end: +%else + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
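+ ; tx2q carries the pass-2 entry point; the pass-1 functions finish with
+ ; a "jmp tx2q" rather than a return.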
+ lea tx2q, [o(m(i%2_%4_internal_16bpc).pass2)] +%ifnum %3 +%if %3 + add eobd, %3 +%endif +%else + lea r5, [o(%3)] +%endif +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 0, 4x4 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 4 +.dconly: + add r5d, 128 + sar r5d, 8 +.dconly2: + imul r5d, 2896 + mova m2, [o(pixel_10bpc_max)] + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + pxor m3, m3 + punpcklqdq m0, m0 +.dconly_loop: + movq m1, [dstq+strideq*0] + movhps m1, [dstq+strideq*1] + paddw m1, m0 + pminsw m1, m2 + pmaxsw m1, m3 + movq [dstq+strideq*0], m1 + movhps [dstq+strideq*1], m1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +%endif +%endmacro + +%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd + ; butterfly rotation + ITX_MULSUB_2D %1, %3, %5, %6, %7, %8, 2896, 2896 ; %1 out1 %3 out0 + ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; %2 out2 %4 out3 + ; Hadamard rotation + psubd m%5, m%1, m%2 + paddd m%2, m%1 + paddd m%1, m%3, m%4 + psubd m%3, m%4 + ; %1 (src1) = out0 + ; %2 (src2) = out1 + ; %3 (src3) = out3 + ; $5 (tmp1) = out2 +%endmacro + +INIT_XMM sse4 + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, identity +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst + +cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + mova m2, [cq+16*2] + mova m3, [cq+16*3] + mova m5, [o(pd_2048)] + call .pass1_main + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + ; transpose + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass1_main: + IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5 + ret +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m5 = pd_2048 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + pmaddwd m4, m2, [o(pw_m3784_1567)] + pmaddwd m2, [o(pw_1567_3784)] + pmaddwd m0, m1, [o(pw_m2896_2896)] + pmaddwd m1, [o(pw_2896_2896)] + REPX {paddd x, m5}, m4, m2, m0, m1 + packssdw m5, m5 ; pw_2048 + REPX {psrad x, 12}, m4, m2, m0, m1 + packssdw m2, m4 ; t3 t2 + packssdw m1, m0 ; t0 t1 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + movq m3, [r5 +strideq*1] + movhps m3, [r5 +strideq*0] + mova m5, [o(pixel_10bpc_max)] + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movhps [r5 +strideq*0], m1 + movq [r5 +strideq*1], m1 + RET + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .main + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + ; transpose + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main +.end: + mova m4, [o(pw_2048)] 
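+ ; scale with pmulhrsw(pw_2048), add the four dst rows loaded below, then
+ ; clamp each lane to [0, pixel_10bpc_max]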
+ movq m2, [dstq+strideq*0] + movhps m2, [dstq+strideq*1] + lea r5, [dstq+strideq*2] + movq m3, [r5 +strideq*0] + movhps m3, [r5 +strideq*1] + mova m5, [o(pixel_10bpc_max)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [r5 +strideq*0], m1 + movhps [r5 +strideq*1], m1 + RET +ALIGN function_align +.main: + mova m1, [cq+16*2] + mova m3, [cq+16*3] + mova m5, [cq+16*0] + lea r3, [cq+16*1] +.main2: + mova m0, [o(pd_1321)] ; SINPI_1_9 + mova m2, [o(pd_2482)] ; SINPI_2_9 + mova m6, [o(pd_3803)] ; SINPI_4_9 + pmulld m4, m0, m1 ; s[4] = SINPI_1_9 * T[2] + pmulld m7, m3, m6 ; s[6] = SINPI_4_9 * T[3] + pmulld m6, m1 ; s[3] = SINPI_4_9 * T[2] + pmulld m0, m5 ; s[0] = SINPI_1_9 * T[0] + psubd m1, m3 ; T[2] - T[3] + pmulld m3, m2 ; s[5] = SINPI_2_9 * T[3] + pmulld m2, m5 ; s[1] = SINPI_2_9 * T[0] + paddd m0, m6 ; s[0] += s[3] + paddd m0, m3 ; s[0] += s[5] + mova m3, [o(pd_m3344)] ; -SINPI_3_9 + psubd m2, m4 ; s[1] -= s[4] + psubd m2, m7 ; s[1] -= s[6] + psubd m1, m5 ; -b7 = (T[2] -T[3]) - T[0] + pmulld m1, m3 ; s[2] = -SINPI_3_9 * -b7 + pmulld m3, [r3] ; -s[3] = -SINPI_3_9 * T[1] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m1 ; {s[0], s[2]} + 2048 + paddd m4, m0, m2 ; x[3] = s[0] + s[1] + psubd m2, m3 ; x[1] = s[1] + s[3] + psubd m0, m3 ; x[0] = s[0] + s[3] + paddd m4, m3 ; x[3] -= s[3] + paddd m2, m5 ; x[1] + 2048 + REPX {psrad x, 12}, m0, m2, m1, m4 + ret + + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_16bpc).main + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + ; transpose + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x4_internal_8bpc, _ssse3).main + mova m4, [o(pw_2048)] + movq m3, [dstq+strideq*1] + movhps m3, [dstq+strideq*0] + lea r5, [dstq+strideq*2] + movq m2, [r5 +strideq*1] + movhps m2, [r5 +strideq*0] + mova m5, [o(pixel_10bpc_max)] + pmulhrsw m0, m4 + pmulhrsw m1, m4 + pxor m4, m4 + mova [cq+16*0], m4 + mova [cq+16*1], m4 + mova [cq+16*2], m4 + mova [cq+16*3], m4 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + movhps [dstq+strideq*0], m1 + movq [dstq+strideq*1], m1 + movhps [r5 +strideq*0], m0 + movq [r5 +strideq*1], m0 + RET + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m3, [o(pd_5793)] + pmulld m0, m3, [cq+16*0] + pmulld m1, m3, [cq+16*1] + pmulld m2, m3, [cq+16*2] + pmulld m3, [cq+16*3] + mova m5, [o(pd_2048)] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + ; transpose + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + ; m0 = out0 out1 + ; m1 = out2 out3 + ; m5 = pd_2048 + jmp tx2q +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m5 = pd_2048 + mova m4, [o(pw_1697x8)] + movq m2, [dstq+strideq*0] + movhps m2, 
[dstq+strideq*1] + lea r5, [dstq+strideq*2] + pmulhrsw m3, m4, m0 + pmulhrsw m4, m1 + paddsw m0, m3 + paddsw m1, m4 + movq m3, [r5 +strideq*0] + movhps m3, [r5 +strideq*1] + mova m4, [o(pixel_10bpc_max)] + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + pxor m5, m5 + mova [cq+16*0], m5 + mova [cq+16*1], m5 + mova [cq+16*2], m5 + mova [cq+16*3], m5 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m5 + pmaxsw m1, m5 + pminsw m0, m4 + pminsw m1, m4 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [r5 +strideq*0], m1 + movhps [r5 +strideq*1], m1 + RET + +%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset + INV_TXFM_FN %1, %2, %3, 4x8 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 8 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly +%endif +%endmacro + +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, identity, 9 +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp + mova m5, [o(pd_2048)] +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 13 + setge r5b +%else + mov r5d, 1 + cmp eobd, 13 + sbb r5d, 0 +%endif + shl r5d, 4 +.loop_pass1: + mova m3, [o(pd_2896)] + pmulld m0, m3, [cq+32*0+r5] + pmulld m1, m3, [cq+32*1+r5] + pmulld m2, m3, [cq+32*2+r5] + pmulld m3, [cq+32*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + call m(idct_4x4_internal_16bpc).pass1_main + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*0+16], m0 + mova [cq+32*1+16], m4 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova m2, [cq+32*0+16] + mova m6, [cq+32*1+16] + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_4x8_internal_8bpc, _ssse3).main + ; m0-3 is now out0/1,3/2,4/5,7/6 + mova m4, [o(pw_2048)] + shufps m1, m1, q1032 + shufps m3, m3, q1032 +.end: + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 + mova m7, [o(pixel_10bpc_max)] + lea r2, [strideq*3] + movq m5, [dstq+strideq*0] + movq m6, [dstq+strideq*2] + movhps m5, [dstq+strideq*1] + movhps m6, [dstq+r2] + lea r4, [dstq+strideq*4] + paddw m0, m5 + paddw m1, m6 + movq m5, [r4+strideq*0] + movq m6, [r4+strideq*2] + movhps m5, [r4+strideq*1] + movhps m6, [r4+r2] + paddw m2, m5 + paddw m3, m6 + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r2 ], m1 + movq [r4 +strideq*0], m2 + movhps [r4 +strideq*1], m2 + movq [r4 +strideq*2], m3 + movhps [r4 +r2 ], m3 + RET + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity, 9 + +cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call .pass1_main + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova m2, [cq+32*2+16] + mova m6, [cq+32*3+16] + punpckhwd m4, m2, m6 + punpcklwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass1_main: +%undef cmp +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 13 + setge r5b +%else + mov r5d, 1 + cmp eobd, 13 + sbb r5d, 0 +%endif + shl r5d, 4 + lea 
r3, [cq+32*1+16] +.loop_pass1: + mova m0, [o(pd_2048)] + mova m3, [o(pd_2896)] + pmulld m5, m3, [cq+32*0+r5] + pmulld m2, m3, [cq+32*1+r5] + pmulld m1, m3, [cq+32*2+r5] + pmulld m3, [cq+32*3+r5] + REPX {paddd x, m0}, m5, m2, m1, m3 + REPX {psrad x, 12}, m5, m2, m1, m3 + mova [r3], m2 + call m(iadst_4x4_internal_16bpc).main2 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*2+16], m0 + mova [cq+32*3+16], m1 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + ret +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main + mova m4, [o(pw_4x2048_4xm2048)] + jmp m(idct_4x8_internal_16bpc).end + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity, 9 + +cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_16bpc).pass1_main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + mova m6, [cq+32*2+16] + mova m2, [cq+32*3+16] + punpcklwd m4, m2, m6 + punpckhwd m2, m6 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: + shufps m0, m0, q1032 + shufps m1, m1, q1032 +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_4x8_internal_8bpc, _ssse3).main + mova m4, m0 + mova m5, m1 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + pshufd m2, m5, q1032 + pshufd m3, m4, q1032 + mova m4, [o(pw_4xm2048_4x2048)] + jmp m(idct_4x8_internal_16bpc).end + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity, 3 + +cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp + mova m5, [o(pd_2048)] + mova m4, [o(pd_2896)] + mova m6, [o(pd_5793)] + ; clear m7 in case we skip the bottom square + pxor m7, m7 +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 16 + setge r5b +%else + mov r5d, 1 + cmp eobd, 16 + sbb r5d, 0 +%endif + shl r5d, 4 +.loop_pass1: + pmulld m0, m4, [cq+32*0+r5] + pmulld m1, m4, [cq+32*1+r5] + pmulld m2, m4, [cq+32*2+r5] + pmulld m3, m4, [cq+32*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + test r5d, r5d + jz .end_pass1 + mova [cq+32*0+16], m0 + mova m7, m2 + xor r5d, r5d + jmp .loop_pass1 +.end_pass1: + punpckhwd m4, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m4 + punpcklwd m0, m4 + mova m2, [cq+32*0+16] + punpckhwd m4, m2, m7 + punpcklwd m2, m7 + punpckhwd m3, m2, m4 + punpcklwd m2, m4 + ; m0-3 = packed & transposed output + jmp tx2q +.pass2: + mova m4, [o(pw_4096)] + jmp m(idct_4x8_internal_16bpc).end + +%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix + INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16 +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 16 + add r5d, 384 + sar r5d, 9 + jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2 +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity, v +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; 
restore pic-ptr + mov r6, r5m +%endif + mova m5, [o(pd_2048)] +.loop_pass1: + mova m0, [cq+64*0+r5] + mova m1, [cq+64*1+r5] + mova m2, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(idct_4x4_internal_16bpc).pass1_main + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m1, m4, m2 + REPX {psrad x, 1}, m0, m1, m4, m2 + packssdw m0, m1 ; out0 out1 + packssdw m4, m2 ; out2 out3 + punpckhwd m2, m0, m4 + punpcklwd m0, m4 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + test r5d, r5d + jz .end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.end_pass1: + mova m2, [cq+64*0+16] + mova m3, [cq+64*1+16] + mova m4, [cq+64*0+32] + mova m5, [cq+64*1+32] + mova m6, [cq+64*0+48] + mova m7, [cq+64*1+48] + ; m0-7 = packed & transposed output + jmp tx2q +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_16x4_internal_8bpc, _ssse3).main + ; m0-6 is out0-13 [with odd registers having inversed output] + ; [coeffq+16*7] has out15/14 + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [cq+16*7] + REPX {shufps x, x, q1032}, m1, m3, m5, m7 + mova [cq+16*0], m4 + mova [cq+16*1], m5 + mova [cq+16*2], m6 + mova [cq+16*3], m7 +.end: + pxor m4, m4 + REPX {mova [cq+16*x], m4}, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + mova m7, [o(pixel_10bpc_max)] + mov r5d, 2 + lea r3, [strideq*3] +.loop: + movq m5, [dstq+strideq*0] + movq m6, [dstq+strideq*2] + movhps m5, [dstq+strideq*1] + movhps m6, [dstq+r3] + lea r4, [dstq+strideq*4] + paddw m0, m5 + paddw m1, m6 + movq m5, [r4+strideq*0] + movq m6, [r4+strideq*2] + movhps m5, [r4+strideq*1] + movhps m6, [r4+r3] + paddw m2, m5 + paddw m3, m6 + REPX {pminsw x, m7}, m0, m1, m2, m3 + REPX {pmaxsw x, m4}, m0, m1, m2, m3 + movq [dstq+strideq*0], m0 + movhps [dstq+strideq*1], m0 + movq [dstq+strideq*2], m1 + movhps [dstq+r3 ], m1 + movq [r4 +strideq*0], m2 + movhps [r4 +strideq*1], m2 + movq [r4 +strideq*2], m3 + movhps [r4 +r3 ], m3 + dec r5d + jz .end2 + lea dstq, [dstq+strideq*8] + mova m0, [cq+0*16] + mova m1, [cq+1*16] + mova m2, [cq+2*16] + mova m3, [cq+3*16] + REPX {mova [cq+x*16], m4}, 0, 1, 2, 3 + jmp .loop +.end2: + RET + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity, v + +cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r6+r5] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif +.loop_pass1: + mova m5, [cq+64*0+r5] + lea r3, [cq+64*1+r5] + mova m1, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(iadst_4x4_internal_16bpc).main2 + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m2, m1, m4 + REPX {psrad x, 1}, m0, m2, m1, m4 + packssdw m0, m2 ; out0 out1 + packssdw m1, m4 ; out2 out3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end + ; m7/5/2/4 = out4/-11,-5/10,6/-9,-7/8 + ; m0/3 & cq6/7 = out0/-15,-3/12,-1/14,2/-13 + mova m1, [o(pw_4x2048_4xm2048)] + REPX {pmulhrsw x, m1}, m7, m2, m0 + pshufd m6, m1, q1032 ; 4x-2048,4x2048 + pmulhrsw m1, [cq+16*7] + REPX {pmulhrsw x, m6}, m5, m4, m3 + pmulhrsw m6, 
[cq+16*6] + ; m7/5/2/4 = out4/11,5/10,6/9,7/8 + ; m0/3/6/1 = out0/15,3/12,1/14,2/13 + ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 + movhps [cq+0*8], m4 + movhps [cq+1*8], m2 + movhps [cq+2*8], m5 + movhps [cq+3*8], m7 + movhps [cq+4*8], m3 + movhps [cq+5*8], m1 + movhps [cq+6*8], m6 + movhps [cq+7*8], m0 + punpcklqdq m0, m6 + punpcklqdq m1, m3 + punpcklqdq m3, m2, m4 + punpcklqdq m2, m7, m5 + jmp m(idct_4x16_internal_16bpc).end + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity, v + +cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif +.loop_pass1: + mova m5, [cq+64*0+r5] + lea r3, [cq+64*1+r5] + mova m1, [cq+64*2+r5] + mova m3, [cq+64*3+r5] + call m(iadst_4x4_internal_16bpc).main2 + pcmpeqd m3, m3 + REPX {psubd x, m3}, m0, m2, m1, m4 + REPX {psrad x, 1}, m0, m2, m1, m4 + packssdw m0, m2 ; out3 out2 + packssdw m1, m4 ; out1 out0 + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x4_internal_8bpc, _ssse3).main_pass2_end + ; m7/5/2/4 = out11/-4,-10/5,9/-6,-8/7 + ; m0/3 & cq6/7 = out15/-0,-12/3,-14/1,13/-2 + mova m1, [o(pw_4x2048_4xm2048)] + REPX {pmulhrsw x, m1}, m7, m2, m0 + pshufd m6, m1, q1032 ; 4x-2048,4x2048 + pmulhrsw m1, [cq+16*7] + REPX {pmulhrsw x, m6}, m5, m4, m3 + pmulhrsw m6, [cq+16*6] + ; m7/5/2/4 = out11/4,10/5,9/6,8/7 + ; m0/3/6/1 = out15/0,12/3,14/1,13/2 + ; output should be as 0-3 for out0-7, and cq+0-3*16 for out8-15 + movq [cq+0*8], m4 + movq [cq+1*8], m2 + movq [cq+2*8], m5 + movq [cq+3*8], m7 + movq [cq+4*8], m3 + movq [cq+5*8], m1 + movq [cq+6*8], m6 + movq [cq+7*8], m0 + punpckhqdq m0, m6 + punpckhqdq m1, m3 + punpckhqdq m3, m2, m4 + punpckhqdq m2, m7, m5 + jmp m(idct_4x16_internal_16bpc).end + +INV_TXFM_4X16_FN identity, dct, h +INV_TXFM_4X16_FN identity, adst, h +INV_TXFM_4X16_FN identity, flipadst, h +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%undef cmp +%if ARCH_X86_32 + mov r5m, r6d +%endif + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, r5m +%endif + mova m5, [o(pd_6144)] + mova m4, [o(pd_5793)] +.loop_pass1: + pmulld m0, m4, [cq+64*0+r5] + pmulld m1, m4, [cq+64*1+r5] + pmulld m2, m4, [cq+64*2+r5] + pmulld m3, m4, [cq+64*3+r5] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + punpckhwd m3, m0, m2 + punpcklwd m0, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + test r5d, r5d + jz m(idct_4x16_internal_16bpc).end_pass1 + mova [cq+64*0+r5], m0 + mova [cq+64*1+r5], m1 + sub r5d, 16 + jmp .loop_pass1 +.pass2: + mova [cq+16*4], m0 + mova [cq+16*5], m1 + mova [cq+16*6], m2 + mova [cq+16*7], m7 + mova m0, [o(pw_1697x16)] + mova m7, [o(pw_2048)] + pmulhrsw m1, m0, m4 + pmulhrsw m2, m0, m5 + REPX {paddsw x, x}, m4, m5 + paddsw m4, m1 + paddsw m5, m2 + REPX {pmulhrsw x, m7}, m4, m5 + mova [cq+16*0], 
m4 + mova [cq+16*1], m5 + mova m4, [cq+16*7] + pmulhrsw m1, m0, m6 + pmulhrsw m2, m0, m4 + REPX {paddsw x, x}, m6, m4 + paddsw m6, m1 + paddsw m4, m2 + REPX {pmulhrsw x, m7}, m6, m4 + mova [cq+16*2], m6 + mova [cq+16*3], m4 + mova m4, [cq+16*4] + mova m1, [cq+16*5] + mova m2, [cq+16*6] + pmulhrsw m5, m0, m2 + pmulhrsw m6, m0, m3 + REPX {paddsw x, x}, m2, m3 + paddsw m2, m5 + paddsw m3, m6 + pmulhrsw m6, m0, m1 + pmulhrsw m0, m4 + REPX {paddsw x, x}, m1, m4 + paddsw m1, m6 + paddsw m0, m4 + REPX {pmulhrsw x, m7}, m2, m3, m1, m0 + jmp m(idct_4x16_internal_16bpc).end + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, 0, 8x4, 15 +%else + INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + add r5d, 128 + sar r5d, 8 + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 + lea r2, [strideq*3] + mova m1, [dstq+strideq*0] + mova m2, [dstq+strideq*1] + mova m3, [dstq+strideq*2] + mova m4, [dstq+r2] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + mova [dstq+strideq*2], m3 + mova [dstq+r2 ], m4 + RET +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, identity +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst + +cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + lea r5, [o(.main)] +.pass1_entry: +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%else + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + mova m0, [cq+0*16] + mova m1, [cq+1*16] + mova m2, [cq+2*16] + mova m3, [cq+3*16] + mova m4, [cq+4*16] + mova m5, [cq+5*16] + mova m6, [cq+6*16] + mova m7, [cq+7*16] + call .rect2_mul + call r5 + call .transpose4x8packed + ; m0-3 = packed & transposed output + jmp tx2q +.transpose4x8packed: + ; transpose + punpcklwd m1, m2, m6 + punpckhwd m2, m6 + punpckhwd m6, m0, m4 + punpcklwd m0, m4 + + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m4, m6, m2 + punpcklwd m6, m2 + + punpcklwd m2, m3, m4 + punpckhwd m3, m4 + punpckhwd m1, m0, m6 + punpcklwd m0, m6 + ret +.main: + call .main_pass1 + call .round + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + ret +.rect2_mul: +%if ARCH_X86_64 + REPX {pmulld x, m14}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [r3], m7 + mova m7, [o(pd_2896)] + REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulld m7, [r3] + mova [r3], m7 + mova m7, [o(pd_2048)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [r3] +%endif + REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 + ret +%if ARCH_X86_64 +.main_pass1_fast: + pmulld m5, m3, [o(pd_m2276)] + pmulld m3, [o(pd_3406)] + pmulld m7, m1, [o(pd_4017)] + pmulld m1, [o(pd_799)] + pmulld m6, m2, [o(pd_3784)] + pmulld m2, [o(pd_1567)] + pmulld m0, m14 + pxor m4, m4 + jmp .main_pass1_fast2 +.main_pass1: + ITX_MULSUB_2D 5, 3, 8, 9, 10, _, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 8, 9, 10, _, 799, 4017 ; t4a t7a + ITX_MULSUB_2D 2, 6, 8, 9, 10, _, 1567, 3784 ; t2 t3 + REPX {pmulld x, m14}, m0, m4 +.main_pass1_fast2: + REPX {paddd x, m11}, m1, m2, m3, m5, m6, m7 + REPX {psrad x, 12 }, m1, m2, m3, m5, m6, m7 + paddd m8, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m9, m7, m3 ; t7 + psubd m7, m3 ; t6a + REPX 
{pmaxsd x, m12}, m1, m8, m7, m9 + REPX {pminsd x, m13}, m1, m8, m7, m9 + REPX {pmulld x, m14}, m7, m1 + paddd m0, m11 + paddd m7, m11 + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + REPX {pmaxsd x, m12}, m0, m6, m5, m3 + REPX {pminsd x, m13}, m0, m6, m5, m3 + ret +.round: + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 +%else +.main_pass1_fast: + pmulld m5, m3, [o(pd_m2276)] + pmulld m3, [o(pd_3406)] + pmulld m7, m1, [o(pd_4017)] + pmulld m1, [o(pd_799)] + pmulld m6, m2, [o(pd_3784)] + pmulld m2, [o(pd_1567)] + mova m4, [o(pd_2048)] + mova [r3+0*16], m2 + REPX {paddd x, m4}, m5, m3, m7, m1 + REPX {psrad x, 12}, m5, m3, m7, m1 + paddd m2, m1, m5 ; t4 + psubd m1, m5 ; t5a + pmulld m5, m0, [o(pd_2896)] + mova m0, m4 + paddd m4, m7, m3 ; t7 + psubd m7, m3 ; t6a + mova m3, [o(clip_18b_min)] + REPX {pmaxsd x, m3 }, m1, m2, m7, m4 + mova m3, [o(clip_18b_max)] + REPX {pminsd x, m3 }, m1, m2, m7, m4 + mova [r3+3*16], m2 + mova [r3+1*16], m4 + pxor m4, m4 + mova m2, [r3+0*16] + mova m3, [o(pd_2896)] + jmp .main_pass1_fast2 +.main_pass1: + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m4 + mova [r3+3*16], m6 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 5, 3, 2, 4, 6, 0, 3406, 2276 ; t5a t6a + ITX_MULSUB_2D 1, 7, 2, 4, 6, 0, 799, 4017 ; t4a t7a + paddd m2, m1, m5 ; t4 + psubd m1, m5 ; t5a + paddd m4, m7, m3 ; t7 + psubd m7, m3 ; t6a + mova m6, [o(clip_18b_min)] + REPX {pmaxsd x, m6 }, m1, m2, m7, m4 + mova m6, [o(clip_18b_max)] + REPX {pminsd x, m6 }, m1, m2, m7, m4 + mova m6, [r3+3*16] + mova [r3+3*16], m2 + mova m2, [r3+1*16] + mova [r3+1*16], m4 + + ITX_MULSUB_2D 2, 6, 4, 3, 5, _, 1567, 3784 ; t2 t3 + mova m3, [o(pd_2896)] + mova m5, [r3+0*16] + mova m4, [r3+2*16] + REPX {pmulld x, m3 }, m5, m4 +.main_pass1_fast2: + REPX {paddd x, m0 }, m2, m6 + REPX {psrad x, 12 }, m2, m6 + REPX {pmulld x, m3 }, m7, m1 + paddd m7, m0 + paddd m0, m5 + + psubd m5, m0, m4 + paddd m0, m4 + psubd m4, m7, m1 + paddd m7, m1 + REPX {psrad x, 12 }, m5, m0, m4, m7 + psubd m3, m0, m6 ; dct4 out3 + paddd m0, m6 ; dct4 out0 + paddd m6, m5, m2 ; dct4 out1 + psubd m5, m2 ; dct4 out2 + + mova m1, [o(clip_18b_min)] + REPX {pmaxsd x, m1 }, m0, m6, m5, m3 + mova m1, [o(clip_18b_max)] + REPX {pminsd x, m1 }, m0, m6, m5, m3 + ret +.round: + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + mova [r3+0*16], m6 + mova m6, [r3+1*16] + psubd m7, m0, m6 ; out7 + paddd m0, m6 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + mova m6, [r3+3*16] + psubd m4, m3, m6 ; out4 + paddd m3, m6 ; out3 + mova m6, [r3+0*16] +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_8x4_internal_8bpc, _ssse3).main +.end: + lea r3, [strideq*3] + call .round2_and_write_8x4 + REPX {mova [cq+16*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + RET +.round2_and_write_8x4: + pxor m6, m6 + mova m5, [o(pixel_10bpc_max)] + mova m4, [o(pw_2048)] +.round1_and_write_8x4: + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 +.write_8x4: + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3] + REPX {pminsw x, m5}, m0, m1, m2, m3 + REPX {pmaxsw x, m6}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + ret + 
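+; The .round*_and_write_8x4 helpers above are shared by the remaining 8x4
+; pass-2 implementations: round2 materializes the zero register, the
+; pixel_10bpc_max clamp bound and the pw_2048 factor, then falls through to
+; round1 (the pmulhrsw stage) and write_8x4 (add to dst, clamp, store).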
+INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + lea r5, [o(.main)] + jmp m(idct_8x4_internal_16bpc).pass1_entry +.main: + call .main_pass1 + call .round + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + ret +.main_pass1: +%if ARCH_X86_64 + ITX_MULSUB_2D 7, 0, 8, 9, 10, 11, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 8, 9, 10, 11, 3920, 1189 ; t7a, t6a + ITX_MULSUB_2D 5, 2, 8, 9, 10, 11, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 8, 9, 10, 11, 3166, 2598 ; t5a, t4a + psubd m8, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + psubd m4, m5, m1 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + REPX {pmaxsd x, m12}, m6, m1, m8, m4, m2, m0, m5, m7 + REPX {pminsd x, m13}, m6, m1, m8, m4, m2, m0, m5, m7 + ITX_MULSUB_2D 6, 1, 3, 9, 10, 11, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D 4, 8, 3, 9, 10, 11, 3784, 10 ; t6a, t7a + psubd m9, m6, m8 ; t7 + paddd m6, m8 ; out6 + mova m8, [o(pd_2896)] + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m2 ; t2 + paddd m0, m2 ; out0 + psubd m2, m1, m4 ; t6 + paddd m1, m4 ; -out1 + REPX {pmaxsd x, m12}, m5, m3, m2, m9 + REPX {pminsd x, m13}, m5, m3, m2, m9 + REPX {pmulld x, m14}, m5, m3, m2, m9 + psubd m4, m5, m3 ; (t2 - t3) * 2896 + paddd m3, m5 ; (t2 + t3) * 2896 + psubd m5, m2, m9 ; (t6 - t7) * 2896 + paddd m2, m9 ; (t6 + t7) * 2896 + ret +.round: + + ; m0=out0,m1=-out1,m6=out6,m7=-out7 + + pcmpeqd m8, m8 + REPX {pxor x, m8 }, m1, m7, m3, m5 + REPX {psubd x, m8 }, m1, m7 + REPX {paddd x, m11}, m2, m3, m4, m5 + REPX {psrad x, 12 }, m2, m3, m4, m5 +%else + mova [r3+0*16], m2 + mova [r3+1*16], m3 + mova [r3+2*16], m4 + mova [r3+3*16], m5 + mova m5, [o(pd_2048)] + + ITX_MULSUB_2D 7, 0, 2, 3, 4, 5, 401, 4076 ; t1a, t0a + ITX_MULSUB_2D 1, 6, 2, 3, 4, 5, 3920, 1189 ; t7a, t6a + mova m2, [r3+0*16] + mova m3, [r3+1*16] + mova m4, [r3+2*16] + mova [r3+0*16], m0 + mova [r3+1*16], m1 + mova [r3+2*16], m6 + mova m1, [r3+3*16] + mova [r3+3*16], m7 + ITX_MULSUB_2D 1, 2, 0, 6, 7, 5, 1931, 3612 ; t3a, t2a + ITX_MULSUB_2D 3, 4, 0, 6, 7, 5, 3166, 2598 ; t5a, t4a + mova m0, [r3+0*16] + mova m6, [r3+2*16] + psubd m7, m2, m6 ; t6 + paddd m2, m6 ; t2 + psubd m6, m0, m4 ; t4 + paddd m0, m4 ; t0 + mova [r3+0*16], m7 + mova m5, [r3+1*16] + mova m7, [r3+3*16] + psubd m4, m1, m5 ; t7 + paddd m5, m1 ; t3 + psubd m1, m7, m3 ; t5 + paddd m7, m3 ; t1 + mova m3, [o(clip_18b_min)] + REPX {pmaxsd x, m3 }, m6, m1, m4, m2, m0, m5, m7 + mova [r3+1*16], m7 + mova m7, [o(clip_18b_max)] + pmaxsd m3, [r3+0*16] + REPX {pminsd x, m7 }, m6, m1, m3, m4, m2, m0, m5 + pminsd m7, [r3+1*16] + mova [r3+0*16], m0 + mova [r3+1*16], m2 + mova [r3+2*16], m5 + mova [r3+3*16], m7 + mova m0, [o(pd_2048)] + ITX_MULSUB_2D 6, 1, 2, 5, 7, 0, 1567, 3784 ; t5a, t4a + ITX_MULSUB_2D 4, 3, 2, 5, 7, 0, 3784, 7 ; t6a, t7a + mova m5, [r3+2*16] + mova m7, [r3+3*16] + psubd m2, m6, m3 ; t7 + paddd m6, m3 ; out6 + mova [r3+3*16], m6 + mova m0, [r3+0*16] + mova m6, [r3+1*16] + psubd m3, m7, m5 ; t3 + paddd m7, m5 ; -out7 + psubd m5, m0, m6 ; t2 + paddd m0, m6 ; out0 + psubd m6, m1, m4 ; t6 + paddd m1, m4 ; -out1 + mova m4, [o(clip_18b_min)] + REPX {pmaxsd x, m4 }, m5, m3, m6, m2 + mova m4, [o(clip_18b_max)] + REPX {pminsd x, m4 }, m5, m3, m6, m2 + mova m4, [o(pd_2896)] + REPX {pmulld x, m4 }, m5, m3, m6, m2 + psubd m4, m5, m3 ; (t2 - t3) * 2896 + paddd m3, m5 ; (t2 + t3) * 2896 + psubd m5, m6, m2 ; (t6 - t7) * 
2896 + paddd m2, m6 ; (t6 + t7) * 2896 + ret +.round: + mova [r3+2*16], m0 + + pcmpeqd m0, m0 + mova m6, [o(pd_2048)] + REPX {pxor x, m0 }, m1, m7, m3, m5 + REPX {psubd x, m0 }, m1, m7 + REPX {paddd x, m6 }, m2, m3, m4, m5 + REPX {psrad x, 12 }, m2, m3, m4, m5 + + mova m6, [r3+3*16] + mova m0, [r3+2*16] +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main + jmp m(idct_8x4_internal_16bpc).end + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + lea r5, [o(.main)] + jmp m(idct_8x4_internal_16bpc).pass1_entry +.main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call m(iadst_8x4_internal_16bpc).round + packssdw m7, m6 + packssdw m5, m4 + packssdw m3, m2 + packssdw m1, m0 + mova m0, m7 + mova m2, m5 + mova m4, m3 + mova m6, m1 + ret +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x4_internal_8bpc, _ssse3).main + lea r3, [strideq*3] + add dstq, r3 + neg strideq + jmp m(idct_8x4_internal_16bpc).end + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + lea r5, [o(.main)] + jmp m(idct_8x4_internal_16bpc).pass1_entry +.main: + REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + ret +.pass2: + mova m7, [o(pw_1697x8)] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct_8x4_internal_16bpc).end + +%macro INV_TXFM_8X8_FN 2-3 0 ; type1, type2, eob_offset +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, %3, 8x8, 15, 0-3*16 +%else + INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + mov r3d, 2 +.end: + add r5d, 384 + sar r5d, 9 +.end2: + imul r5d, 2896 + add r5d, 34816 + movd m0, r5d + pshuflw m0, m0, q1111 + punpcklqdq m0, m0 + mova m6, [o(pixel_10bpc_max)] + pxor m5, m5 + lea r2, [strideq*3] +.loop: + mova m1, [dstq+strideq*0] + mova m2, [dstq+strideq*1] + mova m3, [dstq+strideq*2] + mova m4, [dstq+r2] + REPX {paddw x, m0}, m1, m2, m3, m4 + REPX {pmaxsw x, m5}, m1, m2, m3, m4 + REPX {pminsw x, m6}, m1, m2, m3, m4 + mova [dstq+strideq*0], m1 + mova [dstq+strideq*1], m2 + mova [dstq+strideq*2], m3 + mova [dstq+r2 ], m4 + lea dstq, [dstq+strideq*4] + dec r3d + jg .loop + RET +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity, 6 +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + DECLARE_REG_TMP 1 + mov [rsp+4*16+1*gprsize], r1 +%else + DECLARE_REG_TMP 6 +%endif + lea t0, [o(.pass1_main)] + +.pass1_full: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif +%undef cmp +%if ARCH_X86_64 + xor r5d, r5d + cmp eobd, 10 + setge r5b +%else + mov r5d, 1 + cmp eobd, 10 + sbb r5d, 0 +%endif + shl r5d, 4 +%if ARCH_X86_32 + lea r3, [rsp+gprsize] +%endif +.loop_pass1: + mova m0, [cq+0*32+r5] + mova m1, [cq+1*32+r5] + mova m2, [cq+2*32+r5] + mova m3, [cq+3*32+r5] + mova m4, [cq+4*32+r5] + mova m5, [cq+5*32+r5] + mova m6, [cq+6*32+r5] + mova m7, [cq+7*32+r5] + call 
t0 + + test r5d, r5d + jz .end_pass1 + + mova [cq+0*32+16], m0 + mova [cq+1*32+16], m1 + mova [cq+2*32+16], m2 + mova [cq+3*32+16], m3 + + sub r5d, 16 + jmp .loop_pass1 +.end_pass1: + mova m4, [cq+0*32+16] + mova m5, [cq+1*32+16] + mova m6, [cq+2*32+16] + mova m7, [cq+3*32+16] +%if ARCH_X86_32 + mov r1, [rsp+4*16+1*gprsize] +%endif + jmp tx2q +.pass1_main: + call m(idct_8x4_internal_16bpc).main_pass1 + pcmpeqd m1, m1 + REPX {psubd x, m1}, m0, m6, m5, m3 + call m(idct_8x4_internal_16bpc).round + REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 +.pack_and_transpose: + packssdw m2, m3 + packssdw m6, m7 + packssdw m0, m1 + packssdw m4, m5 + jmp m(idct_8x4_internal_16bpc).transpose4x8packed + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + lea r3, [strideq*3] +%if ARCH_X86_64 + mova m10, [o(pixel_10bpc_max)] + pxor m9, m9 +%endif + call .round3_and_write_8x8 +.zero: +%if ARCH_X86_64 +%define mzero m9 +%else +%define mzero m7 + pxor m7, m7 +%endif + REPX {mova [cq+16*x], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +%undef mzero + RET + + ; round (rounded right-shift by 5) before writing + ; data in m0-7 + ; on x86-64, pw_2048 is in m8 + ; .round1 is for m0-7 + ; .round2 is for m0-6 & [rsp+gprsize*2] + ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32) + ; .round4 is x86-32-only, it is similar to .round2 but with constant already in m7 +%if ARCH_X86_32 +.round1_and_write_8x8: + mova [rsp+gprsize*2], m7 +.round2_and_write_8x8: +%endif +.round3_and_write_8x8: + mova m7, [o(pw_2048)] +%if ARCH_X86_32 +.round4_and_write_8x8: +%endif + REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmulhrsw m7, [rsp+gprsize*2] +%if ARCH_X86_64 + jmp .write_8x8 +.round2_and_write_8x8: + mova m7, [rsp+gprsize*2] +.round1_and_write_8x8: + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 +%endif + + ; m0-7 have to-be-written data [pre-rounded] + ; on x86-64, m9-10 contain a zero/pixel_max + ; on x86-32, these are runtime-generated, and [rsp+gprsize*2] is scratch + ; r0,1,3 contain dstq/strideq/stride3q + ; r5 is a scratch register +.write_8x8: + lea r5, [dstq+strideq*4] + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r3] + paddw m4, [r5 +strideq*0] + paddw m5, [r5 +strideq*1] + paddw m6, [r5 +strideq*2] + paddw m7, [r5 +r3] +%if ARCH_X86_64 + REPX {pmaxsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 +%else + mova [rsp+gprsize*2], m7 + pxor m7, m7 + REPX {pmaxsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pmaxsw m7, [rsp+gprsize*2] + mova [rsp+gprsize*2], m7 + mova m7, [o(pixel_10bpc_max)] + REPX {pminsw x, m7}, m0, m1, m2, m3, m4, m5, m6 + pminsw m7, [rsp+gprsize*2] +%endif + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r3 ], m3 + mova [r5 +strideq*0], m4 + mova [r5 +strideq*1], m5 + mova [r5 +strideq*2], m6 + mova [r5 +r3 ], m7 + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity, 6 + +cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + mov [rsp+4*16+1*gprsize], r1 +%endif + lea t0, [o(.pass1_main)] + jmp m(idct_8x8_internal_16bpc).pass1_full +.pass1_main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call .round + jmp m(idct_8x8_internal_16bpc).pack_and_transpose +.round: +%if ARCH_X86_64 + pcmpeqd m8, m8 ; -1 + REPX {psubd x, m8 }, 
m0, m6 + REPX {pxor x, m8 }, m1, m7, m3, m5 + REPX {psrad x, 1 }, m0, m1, m6, m7 + REPX {psubd x, m8 }, m1, m7 + mova m8, [o(pd_6144)] + REPX {paddd x, m8 }, m2, m3, m4, m5 + REPX {psrad x, 13 }, m2, m3, m4, m5 +%else + mova [r3+2*16], m0 + + pcmpeqd m0, m0 ; -1 + mova m6, [o(pd_6144)] + REPX {pxor x, m0 }, m1, m7, m3, m5 + REPX {psrad x, 1 }, m1, m7 + REPX {psubd x, m0 }, m1, m7 + REPX {paddd x, m6 }, m2, m3, m4, m5 + REPX {psrad x, 13 }, m2, m3, m4, m5 + + mova m0, [r3+2*16] + psrld m6, 12 ; +1 + paddd m0, m6 + paddd m6, [r3+3*16] + REPX {psrad x, 1 }, m0, m6 +%endif + ret + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end + lea r3, [strideq*3] +%if ARCH_X86_64 + mova m10, [o(pixel_10bpc_max)] + pxor m9, m9 +%endif + call .round3_and_write_8x8 + jmp m(idct_8x8_internal_16bpc).zero + + ; round (rounded right-shift by 5) before writing; odd registers are negated + ; data in m0-7 + ; on x86-64, pw_2048 is in m8 and pw_m2048 is in m11 + ; .round1 is for m0-7 + ; .round2 is for m0-6 & [rsp+gprsize*2] + ; .round3 is same, but without using m8 on x86-64 (.round2/3 are identical on x86-32) +%if ARCH_X86_64 +.round2_and_write_8x8: + mova m7, [rsp+gprsize*2] +.round1_and_write_8x8: + REPX {pmulhrsw x, m8 }, m0, m2, m4, m6 + REPX {pmulhrsw x, m11}, m1, m3, m5, m7 + jmp m(idct_8x8_internal_16bpc).write_8x8 +%else +.round1_and_write_8x8: + mova [rsp+gprsize*2], m7 +.round2_and_write_8x8: +%endif +.round3_and_write_8x8: + mova m7, [o(pw_2048)] + REPX {pmulhrsw x, m7}, m0, m2, m4, m6 + mova m7, [o(pw_m2048)] + REPX {pmulhrsw x, m7}, m1, m3, m5 + pmulhrsw m7, [rsp+gprsize*2] + jmp m(idct_8x8_internal_16bpc).write_8x8 + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity, 6 + +cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if ARCH_X86_32 + mov [rsp+4*16+1*gprsize], r1 +%endif + lea t0, [o(.pass1_main)] + jmp m(idct_8x8_internal_16bpc).pass1_full +.pass1_main: + call m(iadst_8x4_internal_16bpc).main_pass1 + call m(iadst_8x8_internal_16bpc).round + ; invert registers + packssdw m7, m6 + packssdw m5, m4 + packssdw m3, m2 + packssdw m1, m0 + mova m0, m7 + mova m2, m5 + mova m4, m3 + mova m6, m1 + jmp m(idct_8x4_internal_16bpc).transpose4x8packed + +.pass2: + lea dstq, [dstq+strideq*8] + sub dstq, strideq + neg strideq + jmp m(iadst_8x8_internal_16bpc).pass2 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 + mova m0, [cq+0*32] + mova m1, [cq+1*32] + mova m2, [cq+2*32] + mova m3, [cq+3*32] + mova m4, [cq+4*32] + mova m5, [cq+5*32] + mova m6, [cq+6*32] + mova m7, [cq+7*32] + packssdw m0, [cq+0*32+16] + packssdw m1, [cq+1*32+16] + packssdw m2, [cq+2*32+16] + packssdw m3, [cq+3*32+16] + packssdw m4, [cq+4*32+16] + packssdw m5, [cq+5*32+16] + packssdw m6, [cq+6*32+16] + packssdw m7, [cq+7*32+16] + mova [rsp+gprsize+16*1], m6 + jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3 + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + lea r3, [strideq*3] +%if ARCH_X86_64 + mova m10, [o(pixel_10bpc_max)] + pxor m9, m9 + mova m8, [o(pw_4096)] + call m(idct_8x8_internal_16bpc).round1_and_write_8x8 +%else + mova [rsp+gprsize], m7 + mova m7, [o(pw_4096)] + call m(idct_8x8_internal_16bpc).round4_and_write_8x8 
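+ ; the identity second pass is only a rescale: pmulhrsw with pw_4096 is a
+ ; rounded right-shift by 3, reusing the shared idct_8x8 write-out helpers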
+%endif + jmp m(idct_8x8_internal_16bpc).zero + +%macro INV_TXFM_8X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix +%if ARCH_X86_64 + INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 15, 0-16*16 +%else + INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16 +%endif +%ifidn %1_%2, dct_dct + imul r5d, [cq], 181 + mov [cq], eobd ; 0 + add r5d, 128 + sar r5d, 8 + imul r5d, 181 + mov r3d, 4 +%if stack_size_padded > 0 + ; adjust to caller's stack allocation + add rsp, (12+ARCH_X86_64)*16 +%endif + jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end +%endif +%endmacro + +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity, v +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst + +%if ARCH_X86_64 +DECLARE_REG_TMP 7 +%endif + +cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(idct_8x8_internal_16bpc).pass1_main)] +.pass1_full: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif +%undef cmp + mov r6d, 4 +.zero_loop: + dec r6d + cmp eobb, byte [r5+r6] + jl .zero_loop + mov r5d, r6d + shl r5d, 4 +%if ARCH_X86_32 + ; restore pic-ptr + mov r6, [rsp+16*16+2*gprsize] + ; setup stack pointer + lea r3, [rsp+gprsize] +%endif +.loop_pass1: + mova m0, [cq+0*64+r5] + mova m1, [cq+1*64+r5] + mova m2, [cq+2*64+r5] + mova m3, [cq+3*64+r5] + mova m4, [cq+4*64+r5] + mova m5, [cq+5*64+r5] + mova m6, [cq+6*64+r5] + mova m7, [cq+7*64+r5] + call m(idct_8x4_internal_16bpc).rect2_mul + call t0 + + mova [cq+0*64+r5], m0 + mova [cq+1*64+r5], m1 + mova [cq+2*64+r5], m2 + mova [cq+3*64+r5], m3 + sub r5d, 16 + jge .loop_pass1 +%if WIN64 + POP r7 +%elif ARCH_X86_32 + mov r1, [rsp+16*16+1*gprsize] +%endif + jmp tx2q + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + + ; input is in cqN*16, where N=0/4/8/12/1/5/9/13/2/6/10/14/3/7/11/15 + ; some are still pre-loaded from the final loop iteration in pass=1 + + mova m1, m2 + mova m2, [cq+ 1*16] + mova m3, [cq+ 9*16] + mova m4, [cq+ 2*16] + mova m5, [cq+10*16] + mova m6, [cq+ 3*16] + mova m7, [cq+11*16] + call m_suffix(idct_8x8_internal_8bpc, _ssse3).main + mova [rsp+gprsize+3*16], m0 + mova [rsp+gprsize+4*16], m1 + mova [rsp+gprsize+5*16], m2 + mova [rsp+gprsize+6*16], m3 + mova [rsp+gprsize+7*16], m4 + mova [rsp+gprsize+8*16], m5 + mova [rsp+gprsize+9*16], m6 + ; m7 is already stored in [rsp+gprsize+0*16] + mova m0, [cq+ 4*16] + mova m1, [cq+12*16] + mova m2, [cq+ 5*16] + mova m3, [cq+13*16] + mova m4, [cq+ 6*16] + mova m5, [cq+14*16] + mova m6, [cq+ 7*16] + mova m7, [cq+15*16] + call m_suffix(idct_16x8_internal_8bpc, _ssse3).main + + ; out0-7 is in rsp+gprsize+3-10*mmsize + ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize + +%if ARCH_X86_64 + mova m8, [o(pw_2048)] + mova m10, [o(pixel_10bpc_max)] + pxor m9, m9 + mov r6, dstq +%else + mov [rsp+16*16+gprsize*1], dstq +%endif + lea r3, [strideq*3] + lea dstq, [dstq+strideq*8] + call m(idct_8x8_internal_16bpc).round2_and_write_8x8 +%if ARCH_X86_64 +%define mzero m9 +%else +%define mzero m7 + pxor m7, m7 +%endif + REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +%undef mzero + mova m0, [rsp+gprsize+ 3*16] + mova m1, [rsp+gprsize+ 4*16] + mova m2, [rsp+gprsize+ 5*16] + mova m3, [rsp+gprsize+ 6*16] + mova m4, [rsp+gprsize+ 7*16] + mova m5, [rsp+gprsize+ 8*16] + mova m6, [rsp+gprsize+ 9*16] + 
mova m7, [rsp+gprsize+10*16] +%if ARCH_X86_64 + mov dstq, r6 +%else + mov dstq, [rsp+16*16+gprsize*1] +%endif + call m(idct_8x8_internal_16bpc).round1_and_write_8x8 + RET + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity, v + +cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(iadst_8x8_internal_16bpc).pass1_main)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + mova m4, [cq+ 9*16] + mova m5, [cq+13*16] + mova [rsp+gprsize+7*16], m0 + mova [rsp+gprsize+8*16], m1 + mova [rsp+gprsize+5*16], m4 + mova [rsp+gprsize+6*16], m5 + mova m0, m2 + mova m1, m3 + mova m2, [cq+ 1*16] + mova m3, [cq+ 5*16] + mova m4, [cq+ 2*16] + mova m5, [cq+ 6*16] + mova m6, [cq+11*16] + mova m7, [cq+15*16] + mova [rsp+gprsize+ 3*16], m4 + mova [rsp+gprsize+ 4*16], m5 + mova [rsp+gprsize+ 9*16], m6 + mova [rsp+gprsize+10*16], m7 + mova m4, [cq+10*16] + mova m5, [cq+14*16] + mova m6, [cq+ 3*16] + mova m7, [cq+ 7*16] + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end + +%if ARCH_X86_64 + mova m11, [o(pw_m2048)] + mova m8, [o(pw_2048)] + mova m10, [o(pixel_10bpc_max)] + pxor m9, m9 + mov r6, dstq +%else + mov [rsp+16*16+gprsize*1], dstq +%endif + lea r3, [strideq*3] + lea dstq, [dstq+strideq*8] + call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 +%if ARCH_X86_64 +%define mzero m9 +%else +%define mzero m7 + pxor m7, m7 +%endif + REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +%undef mzero + mova m0, [rsp+gprsize+ 3*16] + mova m1, [rsp+gprsize+ 4*16] + mova m2, [rsp+gprsize+ 5*16] + mova m3, [rsp+gprsize+ 6*16] + mova m4, [rsp+gprsize+ 7*16] + mova m5, [rsp+gprsize+ 8*16] + mova m6, [rsp+gprsize+ 9*16] + mova m7, [rsp+gprsize+10*16] +%if ARCH_X86_64 + mov dstq, r6 +%else + mov dstq, [rsp+16*16+gprsize*1] +%endif + call m(iadst_8x8_internal_16bpc).round1_and_write_8x8 + RET + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity, v + +cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(iflipadst_8x8_internal_16bpc).pass1_main)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: + lea r3, [strideq*3] + lea r3, [r3*5] + add dstq, r3 + neg strideq + jmp m(iadst_8x16_internal_16bpc).pass2 + +INV_TXFM_8X16_FN identity, dct, h +INV_TXFM_8X16_FN identity, adst, h +INV_TXFM_8X16_FN identity, flipadst, h +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + PUSH r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(m(idct_8x8_internal_16bpc).pack_and_transpose)] + jmp m(idct_8x16_internal_16bpc).pass1_full + +.pass2: +%if ARCH_X86_64 + mova m4, [o(pw_2048)] + mova m5, [o(pixel_10bpc_max)] + pxor m6, m6 + mova m7, [o(pw_1697x16)] +%endif + mov r5d, 4 + lea r3, [strideq*3] +.pass2_loop: + call .main +%if ARCH_X86_64 + call m(idct_8x4_internal_16bpc).round1_and_write_8x4 +%else + call m(idct_8x4_internal_16bpc).round2_and_write_8x4 
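+ ; m6 is zero here on both paths, so the REPX below clears the coefficient
+ ; rows that were just consumed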
+%endif
+    REPX {mova [cq+x*16], m6}, 0, 4, 8, 12, 16, 20, 24, 28
+    dec r5d
+    jle .end
+    add cq, 16
+    lea dstq, [dstq+strideq*4]
+    mova m0, [cq+ 0*16]
+    mova m1, [cq+ 4*16]
+    mova m2, [cq+ 8*16]
+    mova m3, [cq+12*16]
+    jmp .pass2_loop
+.end:
+    RET
+.main:
+    ; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
+%if ARCH_X86_32
+    mova m7, [o(pw_1697x16)]
+    pmulhrsw m4, m7, m0
+    pmulhrsw m5, m7, m1
+    pmulhrsw m6, m7, m2
+    pmulhrsw m7, m3
+%else
+    pmulhrsw m8, m7, m0
+    pmulhrsw m9, m7, m1
+    pmulhrsw m10, m7, m2
+    pmulhrsw m11, m7, m3
+%endif
+    REPX {paddsw x, x}, m0, m1, m2, m3
+%if ARCH_X86_64
+    paddsw m0, m8
+    paddsw m1, m9
+    paddsw m2, m10
+    paddsw m3, m11
+%else
+    paddsw m0, m4
+    paddsw m1, m5
+    paddsw m2, m6
+    paddsw m3, m7
+%endif
+    ret
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+%if ARCH_X86_64
+    INV_TXFM_FN %1, %2, 0, 16x4, 16, 0-8*16
+%else
+    INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
+%endif
+%ifidn %1_%2, dct_dct
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 4
+.dconly:
+    add r5d, 384
+    sar r5d, 9
+.dconly2:
+    imul r5d, 2896
+    add r5d, 34816
+    movd m0, r5d
+    pshuflw m0, m0, q1111
+    punpcklqdq m0, m0
+    mova m3, [o(pixel_10bpc_max)]
+    pxor m4, m4
+.loop:
+    mova m1, [dstq+ 0]
+    mova m2, [dstq+16]
+    REPX {paddw x, m0}, m1, m2
+    REPX {pminsw x, m3}, m1, m2
+    REPX {pmaxsw x, m4}, m1, m2
+    mova [dstq+ 0], m1
+    mova [dstq+16], m2
+    add dstq, strideq
+    dec r3d
+    jg .loop
+    RET
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, identity
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+
+cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    ; setup stack pointer
+    lea r3, [rsp+gprsize]
+
+    mova m0, [cq+ 1*16]
+    mova m1, [cq+ 3*16]
+    mova m2, [cq+ 5*16]
+    mova m3, [cq+ 7*16]
+    mova m4, [cq+ 9*16]
+    mova m5, [cq+11*16]
+    mova m6, [cq+13*16]
+    mova m7, [cq+15*16]
+    call .main_oddhalf
+    mova m0, [cq+ 0*16]
+    mova m1, [cq+ 2*16]
+    mova m2, [cq+ 4*16]
+    mova m3, [cq+ 6*16]
+    mova m4, [cq+ 8*16]
+    mova m5, [cq+10*16]
+    mova m6, [cq+12*16]
+    mova m7, [cq+14*16]
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    ; t0-7 is in m0-7
+
+    call .round
+
+%if ARCH_X86_64
+.pack_transpose:
+    ; transpose in two parts
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+.transpose:
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    call .transpose4x8packed_hi
+%else
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [r3+0*16], m0
+    mova [r3+1*16], m1
+    mova [r3+2*16], m2
+    mova [r3+3*16], m3
+    mova m0, [r3+ 8*16]
+    mova m2, [r3+ 9*16]
+    mova m4, [r3+10*16]
+    mova m6, [r3+11*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+    jmp tx2q
+%if ARCH_X86_64
+.transpose4x8packed_hi:
+    punpcklwd m9, m10, m14
+    punpckhwd m10, m14
+    punpckhwd m14, m8, m12
+    punpcklwd m8, m12
+
+    punpckhwd m11, m8, m9
+    punpcklwd m8, m9
+    punpckhwd m12, m14, m10
+    punpcklwd m14, m10
+
+    punpcklwd m10, m11, m12
+    punpckhwd m11, m12
+    punpckhwd m9, m8, m14
+    punpcklwd m8, m14
+    ret
+%endif
+.main_oddhalf_fast: ; lower half zero
+    pmulld m7, m0, [o(pd_4076)]
+    pmulld m0, [o(pd_401)]
+    pmulld m6, m1, [o(pd_m1189)]
+    pmulld m1, [o(pd_3920)]
+%if ARCH_X86_32
+    mova m4, [o(pd_2048)]
+    REPX {paddd x, m4}, m1, m6
+    REPX {psrad x, 12}, m1, m6
+    mova [r3+1*16], m1
+%endif
+    pmulld m5, m2, [o(pd_3612)]
+    pmulld m2, [o(pd_1931)]
+%if ARCH_X86_32
+    pmulld m1, m3, [o(pd_m2598)]
+%else
+    pmulld m4, m3, [o(pd_m2598)]
+%endif
+    pmulld m3, [o(pd_3166)]
+    jmp .main_oddhalf_fast2
+.main_oddhalf:
+%if ARCH_X86_64
+    ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
+    ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
+    ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
+    ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
+.main_oddhalf_fast2:
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+    psubd m8, m0, m4 ; t9
+    paddd m0, m4 ; t8
+    psubd m4, m6, m2 ; t10
+    paddd m2, m6 ; t11
+    psubd m6, m1, m5 ; t13
+    paddd m5, m1 ; t12
+    psubd m1, m7, m3 ; t14
+    paddd m7, m3 ; t15
+    REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
+    REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
+    mova m15, [o(pd_3784)]
+    mova m10, [o(pd_1567)]
+    ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
+    psubd m3, m1, m4 ; t10
+    paddd m1, m4 ; t9
+    psubd m4, m0, m2 ; t11a
+    paddd m0, m2 ; t8a
+    psubd m2, m8, m6 ; t13
+    paddd m6, m8 ; t14
+    psubd m8, m7, m5 ; t12a
+    paddd m7, m5 ; t15a
+    REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
+    REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
+    REPX {pmulld x, m14}, m2, m8, m3, m4
+    paddd m2, m11
+    paddd m8, m11
+    paddd m5, m2, m3 ; t13a
+    psubd m2, m3 ; t10a
+    psubd m3, m8, m4 ; t11
+    paddd m4, m8 ; t12
+    REPX {psrad x, 12}, m5, m2, m3, m4
+    mova [r3+0*16], m0
+    mova [r3+1*16], m1
+    mova [r3+2*16], m2
+    mova [r3+3*16], m3
+    mova [r3+4*16], m4
+    mova [r3+5*16], m5
+    mova [r3+6*16], m6
+    mova [r3+7*16], m7
+%else
+    mova [r3+0*16], m2
+    mova [r3+1*16], m3
+    mova [r3+2*16], m4
+    mova [r3+3*16], m5
+    mova m4, [o(pd_2048)]
+
+    ITX_MULSUB_2D 0, 7, 2, 3, 5, _, 401, 4076 ; t8a, t15a
+    ITX_MULSUB_2D 6, 1, 2, 3, 5, 4, 3920, 1189 ; t11a, t12a
+
+    mova m2, [r3+0*16]
+    mova m3, [r3+1*16]
+    mova [r3+0*16], m0
+    mova [r3+1*16], m1
+    mova m1, [r3+2*16]
+    mova m5, [r3+3*16]
+    mova [r3+2*16], m6
+    mova [r3+3*16], m7
+
+    ITX_MULSUB_2D 2, 5, 0, 6, 7, _, 1931, 3612 ; t10a, t13a
+    ITX_MULSUB_2D 1, 3, 0, 6, 7, _, 3166, 2598 ; t9a, t14a
+
+    mova m0, [r3+0*16]
+    mova m6, [r3+2*16]
+    mova m7, [r3+3*16]
+.main_oddhalf_fast2:
+    REPX {paddd x, m4}, m0, m7, m2, m5, m1, m3
+    REPX {psrad x, 12}, m0, m7, m2, m5, m1, m3
+    psubd m4, m0, m1 ; t9
+    paddd m0, m1 ; t8
+    mova m1, [r3+1*16]
+    mova [r3+0*16], m4
+    psubd m4, m6, m2 ; t10
+    paddd m2, m6 ; t11
+    psubd m6, m1, m5 ; t13
+    paddd m5, m1 ; t12
+    psubd m1, m7, m3 ; t14
+    paddd m7, m3 ; t15
+    mova m3, [o(clip_18b_min)]
+    REPX {pmaxsd x, m3}, m1, m4, m6, m0, m2, m5, m7
+    pmaxsd m3, [r3+0*16]
+    mova [r3+0*16], m3
+    mova m3, [o(clip_18b_max)]
+    REPX {pminsd x, m3}, m1, m4, m6, m0, m2, m5, m7
+    pminsd m3, [r3+0*16]
+    mova [r3+0*16], m0
+    mova [r3+1*16], m2
+    mova [r3+2*16], m5
+    mova [r3+3*16], m7
+    mova m7, [o(pd_2048)]
+    ITX_MULSUB_2D 1, 3, 0, 2, 5, 7, 1567, 3784
+    ITX_MULSUB_2D 6, 4, 0, 2, _, 7, 5, 3784, 4
+    mova m0, [r3+0*16]
+    mova m2, [r3+1*16]
+    psubd m5, m1, m4 ; t10
+    mova [r3+1*16], m5
+    paddd m1, m4 ; t9
+    psubd m4, m0, m2 ; t11a
+    paddd m0, m2 ; t8a
+    mova m5, [r3+2*16]
+    mova m7, [r3+3*16]
+    psubd m2, m3, m6 ; t13
+    paddd m6, m3 ; t14
+    paddd m3, m7, m5 ; t15a
+    psubd m7, m5 ; t12a
+    mova [r3+0*16], m3
+    mova m3, [r3+1*16]
+    mova m5, [o(clip_18b_min)]
+    REPX {pmaxsd x, m5}, m2, m7, m3, m4, m0, m1, m6
+    pmaxsd m5, [r3+0*16]
+    mova [r3+0*16], m5
+    mova m5, [o(clip_18b_max)]
+    REPX {pminsd x, m5}, m2, m7, m3, m4, m0, m1, m6
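+    ; note: intermediates are clamped to the 18-bit signed range
+    ; (clip_18b_min/max = -0x20000/0x1ffff); with only eight XMM registers
+    ; available here, the eighth row appears to be clipped through the
+    ; spill slot at [r3+0*16]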
+    pminsd m5, [r3+0*16]
+    mova [r3+0*16], m5
+    mova m5, [o(pd_2896)]
+    REPX {pmulld x, m5}, m2, m7, m3, m4
+    mova m5, [o(pd_2048)]
+    REPX {paddd x, m5}, m2, m7
+    paddd m5, m2, m3 ; t13a
+    psubd m2, m3 ; t10a
+    psubd m3, m7, m4 ; t11
+    paddd m4, m7 ; t12
+    REPX {psrad x, 12}, m5, m2, m3, m4
+    mova m7, [r3+0*16]
+    mova [r3+11*16], m0
+    mova [r3+10*16], m1
+    mova [r3+9*16], m2
+    mova [r3+8*16], m3
+    mova [r3+7*16], m4
+    mova [r3+6*16], m5
+    mova [r3+5*16], m6
+    mova [r3+4*16], m7
+%endif
+    ret
+.round:
+%if ARCH_X86_64
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    pcmpeqd m8, m8
+    REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    mova m8, [r3+1*16]
+    mova m9, [r3+2*16]
+    mova m10, [r3+3*16]
+    mova m11, [r3+4*16]
+    mova m12, [r3+5*16]
+    mova m13, [r3+6*16]
+    mova m14, [r3+7*16]
+    psubd m15, m0, m14 ; out15
+    paddd m0, m14 ; out0
+    psubd m14, m1, m13 ; out14
+    paddd m1, m13 ; out1
+    psubd m13, m2, m12 ; out13
+    paddd m2, m12 ; out2
+    psubd m12, m3, m11 ; out12
+    paddd m3, m11 ; out3
+    psubd m11, m4, m10 ; out11
+    paddd m4, m10 ; out4
+    psubd m10, m5, m9 ; out10
+    paddd m5, m9 ; out5
+    psubd m9, m6, m8 ; out9
+    paddd m6, m8 ; out6
+    psubd m8, m7, [r3+0*16] ; out8
+    paddd m7, [r3+0*16] ; out7
+    REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
+    m8, m9, m10, m11, m12, m13, m14, m15
+    ; and out0-15 is now in m0-15
+%else
+    mova [r3+ 0*16], m0
+    mova m0, [o(clip_18b_min)]
+    REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+    pmaxsd m0, [r3+ 0*16]
+    mova [r3+ 0*16], m7
+    mova m7, [o(clip_18b_max)]
+    REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pminsd m7, [r3+ 0*16]
+    mova [r3+ 0*16], m0
+    pcmpeqd m0, m0
+    REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
+    mova [r3+ 1*16], m1
+    mova [r3+ 2*16], m2
+    mova m1, [r3+ 0*16]
+    psubd m1, m0
+    mova [r3+ 0*16], m1
+    mova m1, [r3+11*16]
+    mova m2, [r3+10*16]
+    psubd m0, m7, m1
+    paddd m7, m1
+    psubd m1, m6, m2
+    paddd m6, m2
+    REPX {psrad x, 1}, m0, m1, m6, m7
+    packssdw m0, m1 ; out8-9
+    packssdw m6, m7 ; out6-7
+    mova [r3+11*16], m6
+    mova m1, [r3+9*16]
+    mova m7, [r3+8*16]
+    psubd m2, m5, m1
+    paddd m5, m1
+    psubd m1, m4, m7
+    paddd m4, m7
+    REPX {psrad x, 1}, m2, m1, m4, m5
+    packssdw m2, m1 ; out10-11
+    packssdw m4, m5 ; out4-5
+    mova m1, [r3+2*16]
+    mova [r3+10*16], m4
+    mova m6, [r3+7*16]
+    mova m7, [r3+6*16]
+    psubd m4, m3, m6
+    paddd m3, m6
+    psubd m6, m1, m7
+    paddd m1, m7
+    REPX {psrad x, 1}, m4, m6, m1, m3
+    packssdw m4, m6 ; out12-13
+    packssdw m1, m3 ; out2-3
+    mova m3, [r3+1*16]
+    mova [r3+9*16], m1
+    mova m1, [r3+0*16]
+    mova m5, [r3+5*16]
+    mova m7, [r3+4*16]
+    psubd m6, m3, m5
+    paddd m3, m5
+    psubd m5, m1, m7
+    paddd m1, m7
+    REPX {psrad x, 1}, m6, m5, m1, m3
+    packssdw m6, m5 ; out14-15
+    packssdw m1, m3 ; out0-1
+    mova [r3+8*16], m1
+%endif
+    ret
+
+.pass2:
+    lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
+.pass2_loop:
+    lea r3, [strideq*3]
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    call r4
+    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+    REPX {mova [cq+x*16], m6}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+%if ARCH_X86_64
+    mova m0, m8
+    mova m1, m9
+    mova m2, m10
+    mova m3, m11
+%else
+    mova m0, [rsp+gprsize+0*16]
+    mova m1, [rsp+gprsize+1*16]
+    mova m2, [rsp+gprsize+2*16]
+    mova m3, [rsp+gprsize+3*16]
+%endif
+    add dstq, 16
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    call r4
+    call m(idct_8x4_internal_16bpc).round2_and_write_8x4
+    RET
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+    ; setup stack pointer
+    lea r3, [rsp+gprsize]
+    call .main
+%if ARCH_X86_64
+    jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+gprsize+0*16], m0
+    mova [rsp+gprsize+1*16], m1
+    mova [rsp+gprsize+2*16], m2
+    mova [rsp+gprsize+3*16], m3
+    mova m0, [rsp+gprsize+ 8*16]
+    mova m2, [rsp+gprsize+ 9*16]
+    mova m4, [rsp+gprsize+10*16]
+    mova m6, [rsp+gprsize+11*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    jmp tx2q
+%endif
+
+.main:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 2*16]
+    mova m1, [cq+13*16]
+    mova m2, [cq+ 6*16]
+    mova m3, [cq+ 9*16]
+    mova m4, [cq+10*16]
+    mova m5, [cq+ 5*16]
+    mova m6, [cq+14*16]
+    mova m7, [cq+ 1*16]
+    call .main_part1
+    mova m0, [cq+ 0*16]
+    mova m1, [cq+15*16]
+    mova m2, [cq+ 4*16]
+    mova m3, [cq+11*16]
+    mova m4, [cq+ 8*16]
+    mova m5, [cq+ 7*16]
+    mova m6, [cq+12*16]
+    mova m7, [cq+ 3*16]
+    call .main_part2
+.round:
+%if ARCH_X86_64
+    mova m15, [o(pd_6144)]
+    psrld m14, 11 ; pd_1
+    pcmpeqd m8, m8 ; -1
+    psubd m13, m15, m14 ; pd_6143
+    REPX {paddd x, m14}, m0, m2
+    REPX {paddd x, m15}, m4, m6
+    REPX {pxor x, m8 }, m1, m3, m5, m7
+    REPX {psrad x, 1 }, m1, m3
+    REPX {paddd x, m15}, m5, m7
+    REPX {psubd x, m8 }, m1, m3
+    paddd m8, m15, m9
+    psubd m9, m13, m10
+    paddd m10, m15, m11
+    psubd m11, m13, m12
+    paddd m12, m14, [r3+3*16]
+    psubd m13, m14, [r3+2*16]
+    psubd m15, m14, [r3+0*16]
+    paddd m14, [r3+1*16]
+    REPX {psrad x, 1 }, m0, m2, m12, m13, m14, m15
+    REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
+%else
+    mova [r3+8*16], m1
+    mova [r3+9*16], m3
+    mova m3, [o(pd_6144)]
+    pcmpeqd m1, m1
+    REPX {pxor x, m1}, m5, m7
+    REPX {paddd x, m3}, m4, m5, m6, m7
+    REPX {psrad x, 13}, m4, m5, m6, m7
+    packssdw m4, m5
+    packssdw m6, m7
+    mova [r3+10*16], m4
+    mova [r3+11*16], m6
+    mova m4, [r3+4*16]
+    mova m5, [r3+5*16]
+    mova m6, [r3+6*16]
+    mova m7, [r3+7*16]
+    REPX {pxor x, m1}, m5, m7
+    REPX {psubd x, m1}, m4, m6
+    REPX {psrad x, 1 }, m4, m5, m6, m7
+    REPX {psubd x, m1}, m5, m7
+    packssdw m4, m5
+    packssdw m6, m7
+    mova m5, [r3+8*16]
+    mova m7, [r3+9*16]
+    mova [r3+8*16], m4
+    mova [r3+9*16], m6
+    REPX {pxor x, m1}, m5, m7
+    REPX {paddd x, m3}, m0, m5, m2, m7
+    REPX {psrad x, 13}, m0, m5, m2, m7
+    packssdw m0, m5
+    packssdw m2, m7
+    mova m4, [r3+0*16]
+    mova m5, [r3+1*16]
+    mova m6, [r3+2*16]
+    mova m7, [r3+3*16]
+    REPX {psubd x, m1}, m4, m6
+    REPX {pxor x, m1}, m5, m7
+    REPX {psrad x, 1 }, m4, m5, m6, m7
+    REPX {psubd x, m1}, m5, m7
+    packssdw m4, m5
+    packssdw m6, m7
+%endif
+    ret
+
+.main_part2:
+%if ARCH_X86_64
+    ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201, 4091
+    ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751, 3703
+    ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035, 2751
+    ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857, 1380
+    psubd m8, m0, m4 ; t8a
+    paddd m0, m4 ; t0a
+    psubd m4, m1, m5 ; t9a
+    paddd m1, m5 ; t1a
+    psubd m5, m2, m6 ; t12a
+    paddd m2, m6 ; t4a
+    psubd m6, m3, m7 ; t13a
+    paddd m7, m3 ; t5a
+    REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+    REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+    mova m15, [o(pd_4017)]
+    mova m10, [o(pd_799)]
+    ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+    psubd m3, m0, m2 ; t4
+    paddd m0, m2 ; t0
+    psubd m2, m1, m7 ; t5
+    paddd m1, m7 ; t1
+    psubd m7, m4, m6 ; t12a
+    paddd m4, m6 ; t8a
+    psubd m6, m8, m5 ; t13a
+    paddd m5, m8 ; t9a
+    REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+    REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+    mova m15, [o(pd_3784)]
+    mova m10, [o(pd_1567)]
+    ITX_MULSUB_2D 3, 2, 8, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 7, 6, 8, 9, _, 11, 10, 15
+    mova m10, [r3+0*16] ; t2
+    mova m8, [r3+1*16] ; t3
+    psubd m9, m0, m10 ; t2a
+    paddd m0, m10 ; out0
+    psubd m10, m1, m8 ; t3a
+    paddd m1, m8 ; -out15
+    mova [r3+0*16], m1
+    mova m15, [r3+3*16] ; t7a
+    mova m1, [r3+2*16] ; t6a
+    psubd m8, m3, m15 ; t7
+    paddd m15, m3 ; out12
+    paddd m3, m2, m1 ; -out3
+    psubd m2, m1 ; t6
+    mova [r3+3*16], m15
+    mova [r3+1*16], m2
+    mova m1, [r3+7*16] ; t15
+    mova m2, [r3+6*16] ; t14
+    paddd m15, m7, m1 ; -out13
+    psubd m7, m1 ; t15a
+    psubd m11, m6, m2 ; t14a
+    paddd m2, m6 ; out2
+    mova [r3+2*16], m15
+    mova m1, [r3+4*16] ; t10a
+    mova m15, [r3+5*16] ; t11a
+    psubd m6, m4, m1 ; t10
+    paddd m1, m4 ; -out1
+    psubd m4, m5, m15 ; t11
+    paddd m5, m15 ; out14
+    REPX {pmaxsd x, m12}, m11, m7, m9, m10, m6, m4, m8
+    pmaxsd m12, [r3+1*16] ; t6
+    mova [r3+1*16], m5
+    REPX {pminsd x, m13}, m11, m7, m9, m10, m6, m4, m12, m8
+    REPX {pmulld x, m14}, m11, m7, m9, m10, m6, m4, m12, m8
+    paddd m5, m11, m7 ; -out5 (unshifted)
+    psubd m11, m7 ; out10 (unshifted)
+    paddd m7, m9, m10 ; -out7 (unshifted)
+    psubd m9, m10 ; out8 (unshifted)
+    psubd m10, m6, m4 ; -out9 (unshifted)
+    paddd m6, m4 ; out6 (unshifted)
+    paddd m4, m12, m8 ; out4 (unshifted)
+    psubd m12, m8 ; -out11 (unshifted)
+%else
+    mova [r3+8*16], m0
+    mova [r3+9*16], m1
+    mova [r3+10*16], m2
+    mova [r3+11*16], m3
+    mova m3, [o(pd_2048)]
+    ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3035, 2751
+    ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 3857, 1380
+    mova m0, [r3+8*16]
+    mova m1, [r3+9*16]
+    mova [r3+8*16], m4
+    mova m4, [r3+10*16]
+    mova [r3+9*16], m5
+    mova [r3+10*16], m6
+    mova m5, [r3+11*16]
+    mova [r3+11*16], m7
+    ITX_MULSUB_2D 1, 0, 2, 6, 7, 3, 201, 4091
+    ITX_MULSUB_2D 5, 4, 2, 6, 7, 3, 1751, 3703
+    mova m2, [r3+8*16]
+    mova m6, [r3+9*16]
+    psubd m3, m0, m2 ; t8a
+    paddd m0, m2 ; t0a
+    mova [r3+8*16], m3
+    psubd m2, m1, m6 ; t9a
+    paddd m1, m6 ; t1a
+    mova m3, [r3+10*16]
+    psubd m6, m4, m3 ; t12a
+    paddd m4, m3 ; t4a
+    mova m3, [r3+11*16]
+    psubd m7, m5, m3 ; t13a
+    paddd m5, m3 ; t5a
+    mova m3, [o(clip_18b_min)]
+    REPX {pmaxsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+    pmaxsd m3, [r3+8*16]
+    mova [r3+8*16], m3
+    mova m3, [o(clip_18b_max)]
+    REPX {pminsd x, m3}, m2, m6, m7, m0, m1, m4, m5
+    pminsd m3, [r3+8*16]
+    mova [r3+8*16], m3
+    psubd m3, m0, m4 ; t4
+    paddd m0, m4 ; t0
+    psubd m4, m1, m5 ; t5
+    paddd m1, m5 ; t1
+    mova m5, [o(pd_2048)]
+    mova [r3+9*16], m1
+    mova [r3+10*16], m4
+    mova [r3+11*16], m3
+    mova m3, [r3+8*16]
+    mova [r3+8*16], m0
+    ITX_MULSUB_2D 3, 2, 0, 1, 4, 5, 799, 4017
+    ITX_MULSUB_2D 7, 6, 0, 1, 4, 5, 4017, 4
+    psubd m5, m2, m7 ; t12a
+    paddd m2, m7 ; t8a
+    psubd m7, m3, m6 ; t13a
+    paddd m6, m3 ; t9a
+    mova m0, [r3+8*16]
+    mova m1, [r3+9*16]
+    mova m4, [r3+10*16]
+    mova m3, [o(clip_18b_min)]
+    REPX {pmaxsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+    pmaxsd m3, [r3+11*16]
+    mova [r3+8*16], m3
+    mova m3, [o(clip_18b_max)]
+    REPX {pminsd x, m3}, m4, m5, m7, m0, m1, m2, m6
+    pminsd m3, [r3+8*16]
+    mova [r3+8*16], m0
+    mova [r3+9*16], m1
+    mova [r3+10*16], m2
+    mova [r3+11*16], m6
+    mova m0, [o(pd_2048)]
+    ITX_MULSUB_2D 3, 4, 1, 2, 6, 0, 1567, 3784
+    ITX_MULSUB_2D 5, 7, 1, 2, 6, 0, 6, 3784
+    mova m0, [r3+7*16] ; t7a
+    mova m2, [r3+6*16] ; t6a
+    psubd m1, m3, m0 ; t7
+    paddd m0, m3 ; out12
+    paddd m3, m4, m2 ; -out3
+    psubd m4, m2 ; t6
+    mova [r3+7*16], m3
+    mova m3, [r3+3*16] ; t15
+    mova m2, [r3+2*16] ; t14
+    paddd m6, m5, m3 ; -out13
+    psubd m5, m3 ; t15a
+    psubd m3, m7, m2 ; t14a
+    paddd m2, m7 ; out2
+    mova [r3+6*16], m2
+    mova m7, [r3+0*16] ; t10a
+    mova m2, [r3+1*16] ; t11a
+    mova [r3+0*16], m0
+    mova [r3+1*16], m6
+    mova m6, [r3+11*16]
+    psubd m0, m6, m2 ; t11
+    paddd m6, m2 ; out14
+    mova [r3+2*16], m6
+    mova m2, [r3+10*16]
+    psubd m6, m2, m7 ; t10
+    paddd m2, m7 ; -out1
+    mova m7, [r3+5*16] ; t3
+    mova [r3+5*16], m2
+    mova [r3+10*16], m1
+    mova m1, [r3+9*16]
+    psubd m2, m1, m7 ; t3a
+    paddd m1, m7 ; -out15
+    mova [r3+3*16], m1
+    mova m1, [r3+4*16] ; t2
+    mova m7, [r3+8*16]
+    psubd m7, m1 ; t2a
+    paddd m1, [r3+8*16] ; out0
+    mova [r3+4*16], m1
+    mova m1, [o(clip_18b_min)]
+    REPX {pmaxsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+    pmaxsd m1, [r3+10*16]
+    mova [r3+10*16], m1
+    mova m1, [o(clip_18b_max)]
+    REPX {pminsd x, m1}, m0, m2, m3, m4, m5, m6, m7
+    pminsd m1, [r3+10*16]
+    mova [r3+10*16], m1
+    mova m1, [o(pd_2896)]
+    REPX {pmulld x, m1}, m0, m2, m3, m4, m5, m6, m7
+    pmulld m1, [r3+10*16]
+    mova [r3+11*16], m3
+    psubd m3, m4, m1 ; -out11 (unshifted)
+    paddd m4, m1 ; out4 (unshifted)
+    psubd m1, m6, m0 ; -out9 (unshifted)
+    paddd m6, m0 ; out6 (unshifted)
+    psubd m0, m7, m2 ; out8 (unshifted)
+    paddd m7, m2 ; -out7 (unshifted)
+    mova m2, [r3+11*16]
+    mova [r3+11*16], m5
+    paddd m5, m2 ; -out5 (unshifted)
+    psubd m2, [r3+11*16] ; out10 (unshifted)
+    ; m0-3 contain out8-11 (unshifted), m4-7 contain out4-7 (unshifted)
+    ; r[-4,3] contain out0-3 and out12-15
+%endif
+    ret
+.main_part1:
+%if ARCH_X86_64
+    ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 995, 3973
+    ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 2440, 3290
+    ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3513, 2106
+    ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 4052, 601
+    psubd m8, m0, m4 ; t10a
+    paddd m0, m4 ; t2a
+    psubd m4, m1, m5 ; t11a
+    paddd m1, m5 ; t3a
+    psubd m5, m2, m6 ; t14a
+    paddd m2, m6 ; t6a
+    psubd m6, m3, m7 ; t15a
+    paddd m7, m3 ; t7a
+    REPX {pmaxsd x, m12}, m8, m4, m5, m6, m0, m1, m2, m7
+    REPX {pminsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+    mova m15, [o(pd_2276)]
+    mova m10, [o(pd_3406)]
+    ITX_MULSUB_2D 8, 4, 3, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 6, 5, 3, 9, _, 11, 15, 10
+    psubd m3, m0, m2 ; t6
+    paddd m0, m2 ; t2
+    psubd m2, m1, m7 ; t7
+    paddd m1, m7 ; t3
+    psubd m7, m4, m6 ; t14a
+    paddd m4, m6 ; t10a
+    psubd m6, m8, m5 ; t15a
+    paddd m5, m8 ; t11a
+    REPX {pmaxsd x, m12}, m3, m2, m7, m6, m0, m1, m4, m5
+    REPX {pminsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+    mova m15, [o(pd_1567)]
+    mova m10, [o(pd_3784)]
+    ITX_MULSUB_2D 2, 3, 8, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 6, 7, 8, 9, _, 11, 10, 15
+    mova [r3+0*16], m0
+    mova [r3+1*16], m1
+    mova [r3+4*16], m4
+    mova [r3+5*16], m5
+    mova [r3+2*16], m2
+    mova [r3+3*16], m3
+    mova [r3+6*16], m6
+    mova [r3+7*16], m7
+%else
+    mova [r3+4*16], m0
+    mova [r3+5*16], m1
+    mova [r3+6*16], m2
+    mova [r3+7*16], m3
+    mova m3, [o(pd_2048)]
+    ITX_MULSUB_2D 5, 4, 0, 1, 2, 3, 3513, 2106
+    ITX_MULSUB_2D 7, 6, 0, 1, 2, 3, 4052, 601
+    mova [r3+0*16], m4
+    mova [r3+1*16], m5
+    mova [r3+2*16], m6
+    mova [r3+3*16], m7
+    mova m0, [r3+4*16]
+    mova m1, [r3+5*16]
+    mova m2, [r3+6*16]
+    mova m7, [r3+7*16]
+    ITX_MULSUB_2D 1, 0, 4, 5, 6, 3, 995, 3973
+    ITX_MULSUB_2D 7, 2, 4, 5, 6, 3, 2440, 3290
+    mova m4, [r3+0*16]
+    mova m5, [r3+1*16]
+    psubd m6, m0, m4 ; t10a
+    paddd m0, m4 ; t2a
+    mova [r3+4*16], m6
+    mova m6, [r3+2*16]
+    mova m3, [r3+3*16]
+    psubd m4, m1, m5 ; t11a
+    paddd m1, m5 ; t3a
+    psubd m5, m2, m6 ; t14a
+    paddd m2, m6 ; t6a
+    psubd m6, m7, m3 ; t15a
+    paddd m7, m3 ; t7a
+    mova m3, [o(clip_18b_min)]
+    REPX {pmaxsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+    pmaxsd m3, [r3+4*16]
+    mova [r3+4*16], m3
+    mova m3, [o(clip_18b_max)]
+    REPX {pminsd x, m3}, m4, m5, m6, m0, m1, m2, m7
+    pminsd m3, [r3+4*16]
+    mova [r3+4*16], m3
+    psubd m3, m0, m2 ; t6
+    paddd m0, m2 ; t2
+    psubd m2, m1, m7 ; t7
+    paddd m1, m7 ; t3
+    mova [r3+5*16], m1
+    mova [r3+6*16], m3
+    mova [r3+7*16], m2
+    mova m1, [r3+4*16]
+    mova [r3+4*16], m0
+    mova m3, [o(pd_2048)]
+    ITX_MULSUB_2D 1, 4, 0, 7, 2, 3, 3406, 2276
+    ITX_MULSUB_2D 6, 5, 0, 7, 2, 3, 2276, 2
+    psubd m7, m4, m6 ; t14a
+    paddd m4, m6 ; t10a
+    psubd m6, m1, m5 ; t15a
+    paddd m5, m1 ; t11a
+    mova m1, [r3+5*16]
+    mova m3, [r3+6*16]
+    mova m2, [r3+7*16]
+    mova m0, [o(clip_18b_min)]
+    REPX {pmaxsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+    pmaxsd m0, [r3+4*16]
+    mova [r3+4*16], m0
+    mova m0, [o(clip_18b_max)]
+    REPX {pminsd x, m0}, m3, m2, m7, m6, m1, m4, m5
+    pminsd m0, [r3+4*16]
+    mova [r3+4*16], m0
+    mova [r3+5*16], m1
+    mova [r3+0*16], m4
+    mova [r3+1*16], m5
+    mova m0, [o(pd_2048)]
+    ITX_MULSUB_2D 2, 3, 1, 4, 5, 0, 3784, 1567
+    ITX_MULSUB_2D 6, 7, 1, 4, 5, 0, 5, 1567
+    mova [r3+6*16], m2
+    mova [r3+7*16], m3
+    mova [r3+2*16], m6
+    mova [r3+3*16], m7
+%endif
+    ret
+
+.pass2:
+    lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+    jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+    lea r3, [rsp+gprsize]
+    call m(iadst_16x4_internal_16bpc).main
+%if ARCH_X86_64
+    packssdw m1, m0
+    packssdw m3, m2
+    packssdw m5, m4
+    packssdw m7, m6
+    packssdw m9, m8
+    packssdw m11, m10
+    packssdw m13, m12
+    packssdw m15, m14
+    mova m0, m15
+    mova m2, m13
+    mova m4, m11
+    mova m6, m9
+    mova m8, m7
+    mova m10, m5
+    mova m12, m3
+    mova m14, m1
+    jmp m(idct_16x4_internal_16bpc).transpose
+%else
+    mova [rsp+gprsize+4*16], m0
+    mova [rsp+gprsize+5*16], m2
+    mova [rsp+gprsize+6*16], m4
+    mova [rsp+gprsize+7*16], m6
+    pshufd m6, [rsp+gprsize+ 8*16], q1032
+    pshufd m4, [rsp+gprsize+ 9*16], q1032
+    pshufd m2, [rsp+gprsize+10*16], q1032
+    pshufd m0, [rsp+gprsize+11*16], q1032
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+gprsize+0*16], m0
+    mova [rsp+gprsize+1*16], m1
+    mova [rsp+gprsize+2*16], m2
+    mova [rsp+gprsize+3*16], m3
+    pshufd m6, [rsp+gprsize+ 4*16], q1032
+    pshufd m4, [rsp+gprsize+ 5*16], q1032
+    pshufd m2, [rsp+gprsize+ 6*16], q1032
+    pshufd m0, [rsp+gprsize+ 7*16], q1032
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    jmp tx2q
+%endif
+
+.pass2:
+    lea r3, [strideq*3]
+    lea dstq, [dstq+r3]
+    neg strideq
+    lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
+    jmp m(idct_16x4_internal_16bpc).pass2_loop
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+    mova m15, [o(pd_11586)]
+    pmulld m0, m15, [cq+ 0*16]
+    pmulld m1, m15, [cq+ 1*16]
+    pmulld m2, m15, [cq+ 2*16]
+    pmulld m3, m15, [cq+ 3*16]
+    pmulld m4, m15, [cq+ 4*16]
+    pmulld m5, m15, [cq+ 5*16]
+    pmulld m6, m15, [cq+ 6*16]
+    pmulld m7, m15, [cq+ 7*16]
+    pmulld m8, m15, [cq+ 8*16]
+    pmulld m9, m15, [cq+ 9*16]
+    pmulld m10, m15, [cq+10*16]
+    pmulld m11, m15, [cq+11*16]
+    pmulld m12, m15, [cq+12*16]
+    pmulld m13, m15, [cq+13*16]
+    pmulld m14, m15, [cq+14*16]
+    pmulld m15, [cq+15*16]
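+    ; note: all 16 XMM registers hold live data at this point, so m15 is
+    ; parked in the coefficient buffer while pd_6144 is loaded in its place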
+    mova [cq+ 0*16], m15
+    mova m15, [o(pd_6144)]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+    m8, m9, m10, m11, m12, m13, m14
+    paddd m15, [cq+ 0*16]
+    REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+    m8, m9, m10, m11, m12, m13, m14, m15
+    jmp m(idct_16x4_internal_16bpc).pack_transpose
+%else
+    add cq, 8*16
+    mov r5d, 2
+.loop_pass1:
+    mova m7, [o(pd_11586)]
+    pmulld m0, m7, [cq+0*16]
+    pmulld m1, m7, [cq+1*16]
+    pmulld m2, m7, [cq+2*16]
+    pmulld m3, m7, [cq+3*16]
+    pmulld m4, m7, [cq+4*16]
+    pmulld m5, m7, [cq+5*16]
+    pmulld m6, m7, [cq+6*16]
+    pmulld m7, [cq+7*16]
+    mova [cq+7*16], m7
+    mova m7, [o(pd_6144)]
+    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd m7, [cq+7*16]
+    REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    dec r5d
+    jz .end_pass1
+    mova [rsp+gprsize+0*16], m0
+    mova [rsp+gprsize+1*16], m1
+    mova [rsp+gprsize+2*16], m2
+    mova [rsp+gprsize+3*16], m3
+    sub cq, 8*16
+    jmp .loop_pass1
+.end_pass1:
+    jmp tx2q
+%endif
+
+.pass2:
+%if ARCH_X86_64
+    mova m12, [o(pw_1697x8)]
+%endif
+    lea r4, [o(.main)]
+    jmp m(idct_16x4_internal_16bpc).pass2_loop
+.main:
+%if ARCH_X86_64
+    pmulhrsw m4, m0, m12
+    pmulhrsw m5, m1, m12
+    pmulhrsw m6, m2, m12
+    pmulhrsw m7, m3, m12
+%else
+    mova m7, [o(pw_1697x8)]
+    pmulhrsw m4, m0, m7
+    pmulhrsw m5, m1, m7
+    pmulhrsw m6, m2, m7
+    pmulhrsw m7, m3
+%endif
+    paddsw m0, m4
+    paddsw m1, m5
+    paddsw m2, m6
+    paddsw m3, m7
+    ret
+
+%macro INV_TXFM_16X8_FN 2-3 0 ; type1, type2, eob_offset
+%if ARCH_X86_64
+    INV_TXFM_FN %1, %2, %3, 16x8, 16, 0-8*16
+%else
+    INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
+%endif
+%ifidn %1_%2, dct_dct
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 8
+    add r5d, 128
+    sar r5d, 8
+    imul r5d, 181
+%if ARCH_X86_32
+    add rsp, 1*16
+%endif
+    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity, 6
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+    DECLARE_REG_TMP 6, 4, 6
+%else
+    mov [rsp+gprsize+12*16], r1
+    DECLARE_REG_TMP 1, 4, 3
+%endif
+    lea t0, [o(.main)]
+.loop_main:
+%undef cmp
+%if ARCH_X86_64
+    xor r5d, r5d
+    cmp eobd, 10
+    setge r5b
+%else
+    mov r5d, 1
+    cmp eobd, 10
+    sbb r5d, 0
+%endif
+    shl r5d, 4
+
+    lea r3, [rsp+gprsize]
+.loop_pass1:
+    call t0
+%if ARCH_X86_64
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [cq+4*32+r5], m8
+    mova [cq+5*32+r5], m9
+    mova [cq+6*32+r5], m10
+    mova [cq+7*32+r5], m11
+%else
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+4*32+r5], m0
+    mova [cq+5*32+r5], m1
+    mova [cq+6*32+r5], m2
+    mova [cq+7*32+r5], m3
+    mova m0, [rsp+gprsize+ 8*16]
+    mova m2, [rsp+gprsize+ 9*16]
+    mova m4, [rsp+gprsize+10*16]
+    mova m6, [rsp+gprsize+11*16]
+%endif
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    pxor m7, m7
+    REPX {mova [cq+x*32+r5], m7}, 8, 9, 10, 11, 12, 13, 14, 15
+    test r5d, r5d
+    jz .end
+    mova [cq+0*32+r5], m0
+    mova [cq+1*32+r5], m1
+    mova [cq+2*32+r5], m2
+    mova [cq+3*32+r5], m3
+    xor r5d, r5d
+    jmp .loop_pass1
+.end:
+
+    jmp tx2q
+.main:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 1*32+r5]
+    mova m1, [cq+ 3*32+r5]
+    mova m2, [cq+ 5*32+r5]
+    mova m3, [cq+ 7*32+r5]
+    mova m4, [cq+ 9*32+r5]
+    mova m5, [cq+11*32+r5]
+    mova m6, [cq+13*32+r5]
+    mova m7, [cq+15*32+r5]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+
+    mova m0, [cq+ 0*32+r5]
+    mova m1, [cq+ 2*32+r5]
+    mova m2, [cq+ 4*32+r5]
+    mova m3, [cq+ 6*32+r5]
+    mova m4, [cq+ 8*32+r5]
+    mova m5, [cq+10*32+r5]
+    mova m6, [cq+12*32+r5]
+    mova m7, [cq+14*32+r5]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    ret
+
+.pass2:
+%if ARCH_X86_32
+    mov strideq, [rsp+gprsize+12*16]
+%endif
+    mov r4d, 2
+.pass2_main:
+%if ARCH_X86_64
+    mova m8, [o(pw_2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+%endif
+    lea r3, [strideq*3]
+    jmp .loop_pass2_entry
+.loop_pass2:
+    mova m0, [cq+0*32+ 0]
+    mova m1, [cq+1*32+ 0]
+    mova m2, [cq+2*32+ 0]
+    mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+    mova m4, [cq+0*32+16]
+    mova m5, [cq+1*32+16]
+    mova m6, [cq+2*32+16]
+    mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+    pxor m7, m7
+%endif
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+    add dstq, 16
+    add cq, 4*32
+    dec r4d
+    jg .loop_pass2
+    RET
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity, 6
+
+cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+    mov [rsp+gprsize+12*16], r1
+%endif
+    lea t0, [o(.main)]
+    jmp m(idct_16x8_internal_16bpc).loop_main
+
+.main:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 2*32+r5]
+    mova m1, [cq+13*32+r5]
+    mova m2, [cq+ 6*32+r5]
+    mova m3, [cq+ 9*32+r5]
+    mova m4, [cq+10*32+r5]
+    mova m5, [cq+ 5*32+r5]
+    mova m6, [cq+14*32+r5]
+    mova m7, [cq+ 1*32+r5]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(iadst_16x4_internal_16bpc).main_part1
+    mova m0, [cq+ 0*32+r5]
+    mova m1, [cq+15*32+r5]
+    mova m2, [cq+ 4*32+r5]
+    mova m3, [cq+11*32+r5]
+    mova m4, [cq+ 8*32+r5]
+    mova m5, [cq+ 7*32+r5]
+    mova m6, [cq+12*32+r5]
+    mova m7, [cq+ 3*32+r5]
+%if ARCH_X86_32
+    add r3, 8*16
+%endif
+    call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+    sub r3, 8*16
+%endif
+    call m(iadst_16x4_internal_16bpc).main_part2
+    call m(iadst_16x4_internal_16bpc).round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    ret
+
+.pass2:
+%if ARCH_X86_32
+    mov strideq, [rsp+gprsize+12*16]
+%endif
+    mov r4d, 2
+%if ARCH_X86_64
+    mova m8, [o(pw_2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+    mova m11, [o(pw_m2048)]
+%endif
+    lea r3, [strideq*3]
+    jmp .loop_pass2_entry
+.loop_pass2:
+    mova m0, [cq+0*32+ 0]
+    mova m1, [cq+1*32+ 0]
+    mova m2, [cq+2*32+ 0]
+    mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+    mova m4, [cq+0*32+16]
+    mova m5, [cq+1*32+16]
+    mova m6, [cq+2*32+16]
+    mova m7, [cq+3*32+16]
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main
+    call m_suffix(iadst_8x8_internal_8bpc, _ssse3).main_pass2_end
+    call m(iadst_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+    pxor m7, m7
+%endif
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+    add dstq, 16
+    add cq, 4*32
+    dec r4d
+    jg .loop_pass2
+    RET
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity, 6
+
+cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+    mov [rsp+gprsize+12*16], r1
+%endif
+    lea t0, [o(.main)]
+    jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
+    call m(iadst_16x8_internal_16bpc).main
+%if ARCH_X86_64
+    pshufd m1, m0, q1032
+    pshufd m3, m2, q1032
+    pshufd m5, m4, q1032
+    pshufd m7, m6, q1032
+    pshufd m0, m14, q1032
+    pshufd m2, m12, q1032
+    pshufd m4, m10, q1032
+    pshufd m6, m8, q1032
+    mova m14, m1
+    mova m12, m3
+    mova m10, m5
+    mova m8, m7
+%else
+    pshufd m1, m0, q1032
+    pshufd m3, m2, q1032
+    pshufd m5, m4, q1032
+    pshufd m7, m6, q1032
+    pshufd m0, [r3+11*16], q1032
+    pshufd m2, [r3+10*16], q1032
+    pshufd m4, [r3+9*16], q1032
+    pshufd m6, [r3+8*16], q1032
+    mova [r3+8*16], m7
+    mova [r3+9*16], m5
+    mova [r3+10*16], m3
+    mova [r3+11*16], m1
+%endif
+    ret
+
+.pass2:
+%if ARCH_X86_32
+    mov strideq, [rsp+gprsize+12*16]
+%endif
+    lea dstq, [dstq+strideq*8]
+    neg strideq
+    add dstq, strideq
+%if ARCH_X86_32
+    mov [rsp+gprsize+12*16], strideq
+%endif
+    jmp m(iadst_16x8_internal_16bpc).pass2
+
+INV_TXFM_16X8_FN identity, dct, -54
+INV_TXFM_16X8_FN identity, adst, -54
+INV_TXFM_16X8_FN identity, flipadst, -54
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_32
+    mov [rsp+gprsize+12*16], r1
+%endif
+    lea t0, [o(.main)]
+    jmp m(idct_16x8_internal_16bpc).loop_main
+.main:
+%if ARCH_X86_64
+    mova m15, [o(pd_2896)]
+    pmulld m0, m15, [cq+ 0*32+r5]
+    pmulld m1, m15, [cq+ 1*32+r5]
+    pmulld m2, m15, [cq+ 2*32+r5]
+    pmulld m3, m15, [cq+ 3*32+r5]
+    pmulld m4, m15, [cq+ 4*32+r5]
+    pmulld m5, m15, [cq+ 5*32+r5]
+    pmulld m6, m15, [cq+ 6*32+r5]
+    pmulld m7, m15, [cq+ 7*32+r5]
+    pmulld m8, m15, [cq+ 8*32+r5]
+    pmulld m9, m15, [cq+ 9*32+r5]
+    pmulld m10, m15, [cq+10*32+r5]
+    pmulld m11, m15, [cq+11*32+r5]
+    pmulld m12, m15, [cq+12*32+r5]
+    pmulld m13, m15, [cq+13*32+r5]
+    pmulld m14, m15, [cq+14*32+r5]
+    pmulld m15, [cq+15*32+r5]
+    mova [r3], m15
+    mova m15, [o(pd_2048)]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+    m8, m9, m10, m11, m12, m13, m14
+    paddd m15, [r3]
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+    m8, m9, m10, m11, m12, m13, m14, m15
+    mova [r3], m15
+    mova m15, [o(pd_11586)]
+    REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+    m8, m9, m10, m11, m12, m13, m14
+    pmulld m15, [r3]
+    mova [r3], m15
+    mova m15, [o(pd_6144)]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+    m8, m9, m10, m11, m12, m13, m14
+    paddd m15, [r3]
+    REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
+    m8, m9, m10, m11, m12, m13, m14, m15
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%else
+    mova m0, [cq+ 0*32+r5]
+    mova m1, [cq+ 1*32+r5]
+    mova m2, [cq+ 2*32+r5]
+    mova m3, [cq+ 3*32+r5]
+    mova m4, [cq+ 4*32+r5]
+    mova m5, [cq+ 5*32+r5]
+    mova m6, [cq+ 6*32+r5]
+    mova m7, [cq+ 7*32+r5]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    mova [r3], m7
+    mova m7, [o(pd_11586)]
+    REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulld m7, [r3]
+    mova [r3], m7
+    mova m7, [o(pd_6144)]
+    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd m7, [r3]
+    REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    mova [r3+ 8*16], m0
+    mova [r3+ 9*16], m2
+    mova [r3+10*16], m4
+    mova [r3+11*16], m6
+    mova m0, [cq+ 8*32+r5]
+    mova m1, [cq+ 9*32+r5]
+    mova m2, [cq+10*32+r5]
+    mova m3, [cq+11*32+r5]
+    mova m4, [cq+12*32+r5]
+    mova m5, [cq+13*32+r5]
+    mova m6, [cq+14*32+r5]
+    mova m7, [cq+15*32+r5]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    mova [r3], m7
+    mova m7, [o(pd_11586)]
+    REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pmulld m7, [r3]
+    mova [r3], m7
+    mova m7, [o(pd_6144)]
+    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd m7, [r3]
+    REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+%endif
+    ret
+.pass2:
+%if ARCH_X86_32
+    mov strideq, [rsp+gprsize+12*16]
+%endif
+    mov r4d, 2
+%if ARCH_X86_64
+    mova m8, [o(pw_4096)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+%endif
+    lea r3, [strideq*3]
+    jmp .loop_pass2_entry
+.loop_pass2:
+    mova m0, [cq+0*32+ 0]
+    mova m1, [cq+1*32+ 0]
+    mova m2, [cq+2*32+ 0]
+    mova m3, [cq+3*32+ 0]
+.loop_pass2_entry:
+    mova m4, [cq+0*32+16]
+    mova m5, [cq+1*32+16]
+    mova m6, [cq+2*32+16]
+    mova m7, [cq+3*32+16]
+%if ARCH_X86_64
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%else
+    mova [rsp+gprsize], m7
+    mova m7, [o(pw_4096)]
+    call m(idct_8x8_internal_16bpc).round4_and_write_8x8
+%endif
+%if ARCH_X86_64
+%define mzero m9
+%else
+%define mzero m7
+    pxor m7, m7
+%endif
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+    add dstq, 16
+    add cq, 4*32
+    dec r4d
+    jg .loop_pass2
+    RET
+
+%macro INV_TXFM_16X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
+%if ARCH_X86_64
+    INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 16, 0-(16+WIN64)*16
+%else
+    INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
+%endif
+%ifidn %1_%2, dct_dct
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 16
+    add r5d, 640
+    sar r5d, 10
+    add rsp, (5+ARCH_X86_64*3+WIN64)*16
+    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity, v
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if ARCH_X86_64
+    DECLARE_REG_TMP 6, 7
+%if WIN64
+    mov [rsp+16*16+gprsize], r7
+%endif
+%elif ARCH_X86_32
+    DECLARE_REG_TMP 1, 6
+    mov [rsp+16*16+gprsize*1], r1
+    mov [rsp+16*16+gprsize*2], r6
+%endif
+    lea t0, [o(.main)]
+.pass1_full:
+%undef cmp
+    mov t1d, 4
+.zero_loop:
+    dec t1d
+    cmp eobb, byte [r5+t1]
+    jb .zero_loop
+    mov r5d, t1d
+    shl r5d, 4
+%if ARCH_X86_32
+    ; restore pic-ptr
+    mov r6, [rsp+16*16+2*gprsize]
+%endif
+    ; setup stack pointer
+    lea r3, [rsp+gprsize]
+.loop_pass1:
+    call t0
+%if ARCH_X86_64
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [cq+4*64+r5], m8
+    mova [cq+5*64+r5], m9
+    mova [cq+6*64+r5], m10
+    mova [cq+7*64+r5], m11
+%else
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+4*64+r5], m0
+    mova [cq+5*64+r5], m1
+    mova [cq+6*64+r5], m2
+    mova [cq+7*64+r5], m3
+    mova m0, [rsp+gprsize+ 8*16]
+    mova m2, [rsp+gprsize+ 9*16]
+    mova m4, [rsp+gprsize+10*16]
+    mova m6, [rsp+gprsize+11*16]
+%endif
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+0*64+r5], m0
+    mova [cq+1*64+r5], m1
+    mova [cq+2*64+r5], m2
+    mova [cq+3*64+r5], m3
+    pxor m0, m0
+    REPX {mova [cq+x*64+r5], m0}, 8, 9, 10, 11, 12, 13, 14, 15
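+    ; note: r5 indexes the current 4-column group; .zero_loop above only
+    ; scanned the eob table for the last non-empty group, and pass 1 now
+    ; appears to walk the groups right-to-left, clearing cq as it goes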
+    sub r5d, 16
+    jge .loop_pass1
+
+%if ARCH_X86_32
+    ; restore pic-ptr
+    mov r1, [rsp+16*16+1*gprsize]
+%endif
+    jmp tx2q
+.main:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+
+    mova m0, [cq+ 1*64+r5]
+    mova m1, [cq+ 3*64+r5]
+    mova m2, [cq+ 5*64+r5]
+    mova m3, [cq+ 7*64+r5]
+    mova m4, [cq+ 9*64+r5]
+    mova m5, [cq+11*64+r5]
+    mova m6, [cq+13*64+r5]
+    mova m7, [cq+15*64+r5]
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+
+    mova m0, [cq+ 0*64+r5]
+    mova m1, [cq+ 2*64+r5]
+    mova m2, [cq+ 4*64+r5]
+    mova m3, [cq+ 6*64+r5]
+    mova m4, [cq+ 8*64+r5]
+    mova m5, [cq+10*64+r5]
+    mova m6, [cq+12*64+r5]
+    mova m7, [cq+14*64+r5]
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    call .round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    ret
+.round:
+%if ARCH_X86_64
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    psrld m8, m11, 10 ; 2
+    REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    mova m8, [r3+1*16]
+    mova m9, [r3+2*16]
+    mova m10, [r3+3*16]
+    mova m11, [r3+4*16]
+    mova m12, [r3+5*16]
+    mova m13, [r3+6*16]
+    mova m14, [r3+7*16]
+    psubd m15, m0, m14 ; out15
+    paddd m0, m14 ; out0
+    psubd m14, m1, m13 ; out14
+    paddd m1, m13 ; out1
+    psubd m13, m2, m12 ; out13
+    paddd m2, m12 ; out2
+    psubd m12, m3, m11 ; out12
+    paddd m3, m11 ; out3
+    psubd m11, m4, m10 ; out11
+    paddd m4, m10 ; out4
+    psubd m10, m5, m9 ; out10
+    paddd m5, m9 ; out5
+    psubd m9, m6, m8 ; out9
+    paddd m6, m8 ; out6
+    psubd m8, m7, [r3+0*16] ; out8
+    paddd m7, [r3+0*16] ; out7
+    REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
+    m8, m9, m10, m11, m12, m13, m14, m15
+    ; and out0-15 is now in m0-15
+%else
+    mova [r3+ 0*16], m0
+    mova m0, [o(clip_18b_min)]
+    REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
+    pmaxsd m0, [r3+ 0*16]
+    mova [r3+ 0*16], m7
+    mova m7, [o(clip_18b_max)]
+    REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    pminsd m7, [r3+ 0*16]
+    mova [r3+ 0*16], m0
+    mova m0, [o(pd_2)]
+    REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
+    paddd m0, [r3+ 0*16]
+    mova [r3+ 0*16], m0
+    mova [r3+ 1*16], m1
+    mova [r3+ 2*16], m2
+    mova m1, [r3+11*16]
+    mova m2, [r3+10*16]
+    psubd m0, m7, m1
+    paddd m7, m1
+    psubd m1, m6, m2
+    paddd m6, m2
+    REPX {psrad x, 2}, m0, m1, m6, m7
+    packssdw m0, m1 ; out8-9
+    packssdw m6, m7 ; out6-7
+    mova [r3+11*16], m6
+    mova m1, [r3+9*16]
+    mova m7, [r3+8*16]
+    psubd m2, m5, m1
+    paddd m5, m1
+    psubd m1, m4, m7
+    paddd m4, m7
+    REPX {psrad x, 2}, m2, m1, m4, m5
+    packssdw m2, m1 ; out10-11
+    packssdw m4, m5 ; out4-5
+    mova m1, [r3+2*16]
+    mova [r3+10*16], m4
+    mova m6, [r3+7*16]
+    mova m7, [r3+6*16]
+    psubd m4, m3, m6
+    paddd m3, m6
+    psubd m6, m1, m7
+    paddd m1, m7
+    REPX {psrad x, 2}, m4, m6, m1, m3
+    packssdw m4, m6 ; out12-13
+    packssdw m1, m3 ; out2-3
+    mova m3, [r3+1*16]
+    mova [r3+9*16], m1
+    mova m1, [r3+0*16]
+    mova m5, [r3+5*16]
+    mova m7, [r3+4*16]
+    psubd m6, m3, m5
+    paddd m3, m5
+    psubd m5, m1, m7
+    paddd m1, m7
+    REPX {psrad x, 2}, m6, m5, m1, m3
+    packssdw m6, m5 ; out14-15
+    packssdw m1, m3 ; out0-1
+    mova [r3+8*16], m1
+%endif
+    ret
+
+.pass2:
+%if ARCH_X86_64
+    mova m8, [o(pw_2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+    mov r7, dstq
+%else
+    mov [rsp+2*gprsize+16*16], dstq
+%endif
+    lea r3, [strideq*3]
+    mov r4d, 2
+.loop_pass2:
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    mova m0, [cq+0*64+ 0]
+    mova m1, [cq+2*64+ 0]
+    mova m2, [cq+0*64+16]
+    mova m3, [cq+2*64+16]
+    mova m4, [cq+0*64+32]
+    mova m5, [cq+2*64+32]
+    mova m6, [cq+0*64+48]
+    mova m7, [cq+2*64+48]
+    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+    mova [rsp+gprsize+3*16], m0
+    mova [rsp+gprsize+4*16], m1
+    mova [rsp+gprsize+5*16], m2
+    mova [rsp+gprsize+6*16], m3
+    mova [rsp+gprsize+7*16], m4
+    mova [rsp+gprsize+8*16], m5
+    mova [rsp+gprsize+9*16], m6
+    ; m7 is already stored in [rsp+gprsize+0*16]
+    mova m0, [cq+1*64+ 0]
+    mova m1, [cq+3*64+ 0]
+    mova m2, [cq+1*64+16]
+    mova m3, [cq+3*64+16]
+    mova m4, [cq+1*64+32]
+    mova m5, [cq+3*64+32]
+    mova m6, [cq+1*64+48]
+    mova m7, [cq+3*64+48]
+    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+
+    ; out0-7 is in rsp+gprsize+3-10*mmsize
+    ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize
+
+%if ARCH_X86_64
+    lea dstq, [r7+strideq*8]
+%else
+    mov dstq, [rsp+2*gprsize+16*16]
+    lea dstq, [dstq+strideq*8]
+%endif
+    call m(idct_8x8_internal_16bpc).round2_and_write_8x8
+%if ARCH_X86_64
+    mov dstq, r7
+%else
+    mov dstq, [rsp+2*gprsize+16*16]
+%endif
+    mova m0, [rsp+gprsize+ 3*16]
+    mova m1, [rsp+gprsize+ 4*16]
+    mova m2, [rsp+gprsize+ 5*16]
+    mova m3, [rsp+gprsize+ 6*16]
+    mova m4, [rsp+gprsize+ 7*16]
+    mova m5, [rsp+gprsize+ 8*16]
+    mova m6, [rsp+gprsize+ 9*16]
+    mova m7, [rsp+gprsize+10*16]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+%if ARCH_X86_64
+    add r7, 16
+%define mzero m9
+%else
+    add dword [rsp+2*gprsize+16*16], 16
+%define mzero m7
+    pxor m7, m7
+%endif
+    REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7
+    add cq, 64*4
+    REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1
+%undef mzero
+    dec r4d
+    jg .loop_pass2
+%if WIN64
+    mov r7, [rsp+16*16+gprsize]
+%endif
+    RET
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
+%if WIN64
+    mov [rsp+16*16+gprsize], r7
+%elif ARCH_X86_32
+    mov [rsp+16*16+gprsize*1], r1
+    mov [rsp+16*16+gprsize*2], r6
+%endif
+    lea t0, [o(.main)]
+    jmp m(idct_16x16_internal_16bpc).pass1_full
+
+.main:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 2*64+r5]
+    mova m1, [cq+13*64+r5]
+    mova m2, [cq+ 6*64+r5]
+    mova m3, [cq+ 9*64+r5]
+    mova m4, [cq+10*64+r5]
+    mova m5, [cq+ 5*64+r5]
+    mova m6, [cq+14*64+r5]
+    mova m7, [cq+ 1*64+r5]
+    call m(iadst_16x4_internal_16bpc).main_part1
+    mova m0, [cq+ 0*64+r5]
+    mova m1, [cq+15*64+r5]
+    mova m2, [cq+ 4*64+r5]
+    mova m3, [cq+11*64+r5]
+    mova m4, [cq+ 8*64+r5]
+    mova m5, [cq+ 7*64+r5]
+    mova m6, [cq+12*64+r5]
+    mova m7, [cq+ 3*64+r5]
+    call m(iadst_16x4_internal_16bpc).main_part2
+    call .round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    ret
+.round:
+%if ARCH_X86_64
+    pcmpeqd m8, m8 ; -1
+    mova m15, [o(pd_10240)]
+    psrld m14, 10 ; +2
+    psubd m13, m14, m8 ; +3
+    REPX {pxor x, m8 }, m1, m3, m5, m7
+    REPX {paddd x, m14}, m0, m2
+    REPX {paddd x, m13}, m1, m3
+    REPX {paddd x, m15}, m4, m5, m6, m7
+    paddd m13, m15, m8 ; +10239
+    paddd m8, m15, m9
+    psubd m9, m13, m10
+    paddd m10, m15, m11
+    psubd m11, m13, m12
+    paddd m12, m14, [r3+3*16]
+    psubd m13, m14, [r3+2*16]
+    psubd m15, m14, [r3+0*16]
+    paddd m14, [r3+1*16]
+    REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
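+    ; note: outputs 4-11 are still scaled by 2896 (1/sqrt(2) in .12 fixed
+    ; point), so their bias is pd_10240 = 2048 + (2 << 12) and the shift is
+    ; 14, folding the .12 rounding into the +2, >>2 used for the others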
REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11 +%else + mova [r3+8*16], m1 + mova [r3+9*16], m3 + mova m3, [o(pd_10240)] + pcmpeqd m1, m1 + REPX {pxor x, m1}, m5, m7 + REPX {paddd x, m3}, m4, m5, m6, m7 + REPX {psrad x, 14}, m4, m5, m6, m7 + packssdw m4, m5 + packssdw m6, m7 + mova [r3+10*16], m4 + mova [r3+11*16], m6 + mova m4, [r3+4*16] + mova m5, [r3+5*16] + mova m6, [r3+6*16] + mova m7, [r3+7*16] + mova m3, [o(pd_2)] + REPX {pxor x, m1}, m5, m7 + REPX {paddd x, m3}, m4, m6 + psubd m3, m1 + REPX {paddd x, m3}, m5, m7 + REPX {psrad x, 2 }, m4, m5, m6, m7 + packssdw m4, m5 + packssdw m6, m7 + mova m5, [r3+8*16] + mova m7, [r3+9*16] + mova [r3+8*16], m4 + mova [r3+9*16], m6 + mova m3, [o(pd_10240)] + REPX {pxor x, m1}, m5, m7 + REPX {paddd x, m3}, m0, m5, m2, m7 + REPX {psrad x, 14}, m0, m5, m2, m7 + packssdw m0, m5 + packssdw m2, m7 + mova m4, [r3+0*16] + mova m5, [r3+1*16] + mova m6, [r3+2*16] + mova m7, [r3+3*16] + mova m3, [o(pd_2)] + REPX {pxor x, m1}, m5, m7 + REPX {paddd x, m3}, m4, m6 + psubd m3, m1 + REPX {paddd x, m3}, m5, m7 + REPX {psrad x, 2 }, m4, m5, m6, m7 + packssdw m4, m5 + packssdw m6, m7 +%endif + ret +.pass2: +%if ARCH_X86_64 + mova m8, [o(pw_2048)] + mova m11, [o(pw_m2048)] + pxor m9, m9 + mova m10, [o(pixel_10bpc_max)] + mov r7, dstq +%else + mov [rsp+2*gprsize+16*16], dstq +%endif + lea r3, [strideq*3] + mov r4d, 2 +.loop_pass2: +%if ARCH_X86_32 + lea r5, [o(itx8_start)] +%endif + mova m0, [cq+0*64+32] + mova m1, [cq+1*64+32] + mova m2, [cq+2*64+16] + mova m3, [cq+3*64+16] + mova m4, [cq+0*64+ 0] + mova m5, [cq+1*64+ 0] + mova m6, [cq+2*64+48] + mova m7, [cq+3*64+48] + mova [rsp+gprsize+3*16], m0 + mova [rsp+gprsize+4*16], m1 + mova [rsp+gprsize+5*16], m2 + mova [rsp+gprsize+6*16], m3 + mova [rsp+gprsize+7*16], m4 + mova [rsp+gprsize+8*16], m5 + mova [rsp+gprsize+9*16], m6 + mova [rsp+gprsize+10*16], m7 + mova m0, [cq+2*64+ 0] + mova m1, [cq+3*64+ 0] + mova m2, [cq+0*64+16] + mova m3, [cq+1*64+16] + mova m4, [cq+2*64+32] + mova m5, [cq+3*64+32] + mova m6, [cq+0*64+48] + mova m7, [cq+1*64+48] + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main + call m_suffix(iadst_16x8_internal_8bpc, _ssse3).main_pass2_end + + ; out0-7 is in rsp+gprsize+3-10*mmsize + ; out8-14 is in m0-6, and out15 is in m7 as well as rsp+gprsize+0*mmsize + +%if ARCH_X86_64 + lea dstq, [r7+strideq*8] +%else + mov dstq, [rsp+2*gprsize+16*16] + lea dstq, [dstq+strideq*8] +%endif + call m(iadst_8x8_internal_16bpc).round2_and_write_8x8 +%if ARCH_X86_64 + mov dstq, r7 +%else + mov dstq, [rsp+2*gprsize+16*16] +%endif + mova m0, [rsp+gprsize+ 3*16] + mova m1, [rsp+gprsize+ 4*16] + mova m2, [rsp+gprsize+ 5*16] + mova m3, [rsp+gprsize+ 6*16] + mova m4, [rsp+gprsize+ 7*16] + mova m5, [rsp+gprsize+ 8*16] + mova m6, [rsp+gprsize+ 9*16] + mova m7, [rsp+gprsize+10*16] + call m(iadst_8x8_internal_16bpc).round1_and_write_8x8 +%if ARCH_X86_64 + add r7, 16 +%define mzero m9 +%else + add dword [rsp+2*gprsize+16*16], 16 +%define mzero m7 + pxor m7, m7 +%endif + REPX {mova [cq+x*16], mzero}, 0, 1, 2, 3, 4, 5, 6, 7 + add cq, 64*4 + REPX {mova [cq+x*16], mzero}, -8, -7, -6, -5, -4, -3, -2, -1 +%undef mzero + dec r4d + jg .loop_pass2 +%if WIN64 + mov r7, [rsp+16*16+gprsize] +%endif + RET + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + mov [rsp+16*16+gprsize], r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, 
[o(.main)] + jmp m(idct_16x16_internal_16bpc).pass1_full + +.main: + call m(iadst_16x16_internal_16bpc).main +%if ARCH_X86_64 + mova m1, m0 + mova m3, m2 + mova m5, m4 + mova m7, m6 + pshufd m0, m14, q1032 + pshufd m2, m12, q1032 + pshufd m4, m10, q1032 + pshufd m6, m8, q1032 + pshufd m8, m7, q1032 + pshufd m10, m5, q1032 + pshufd m12, m3, q1032 + pshufd m14, m1, q1032 +%else + pshufd m1, m0, q1032 + pshufd m3, m2, q1032 + pshufd m5, m4, q1032 + pshufd m7, m6, q1032 + pshufd m0, [r3+11*16], q1032 + pshufd m2, [r3+10*16], q1032 + pshufd m4, [r3+9*16], q1032 + pshufd m6, [r3+8*16], q1032 + mova [r3+11*16], m1 + mova [r3+10*16], m3 + mova [r3+ 9*16], m5 + mova [r3+ 8*16], m7 +%endif + ret + +.pass2: + lea r3, [strideq*3] + lea r3, [r3*5] + add dstq, r3 + neg strideq + jmp m(iadst_16x16_internal_16bpc).pass2 + +INV_TXFM_16X16_FN identity, dct, h +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2 +%if WIN64 + mov [rsp+16*16+gprsize], r7 +%elif ARCH_X86_32 + mov [rsp+16*16+gprsize*1], r1 + mov [rsp+16*16+gprsize*2], r6 +%endif + lea t0, [o(.main)] + jmp m(idct_16x16_internal_16bpc).pass1_full + +.main: +%if ARCH_X86_64 + mova m15, [o(pd_11586)] + pmulld m0, m15, [cq+ 0*64+r5] + pmulld m1, m15, [cq+ 1*64+r5] + pmulld m2, m15, [cq+ 2*64+r5] + pmulld m3, m15, [cq+ 3*64+r5] + pmulld m4, m15, [cq+ 4*64+r5] + pmulld m5, m15, [cq+ 5*64+r5] + pmulld m6, m15, [cq+ 6*64+r5] + pmulld m7, m15, [cq+ 7*64+r5] + pmulld m8, m15, [cq+ 8*64+r5] + pmulld m9, m15, [cq+ 9*64+r5] + pmulld m10, m15, [cq+10*64+r5] + pmulld m11, m15, [cq+11*64+r5] + pmulld m12, m15, [cq+12*64+r5] + pmulld m13, m15, [cq+13*64+r5] + pmulld m14, m15, [cq+14*64+r5] + pmulld m15, [cq+15*64+r5] + mova [r3], m15 + mova m15, [o(pd_10240)] + REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + paddd m15, [r3] + REPX {psrad x, 14 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + packssdw m8, m9 + packssdw m10, m11 + packssdw m12, m13 + packssdw m14, m15 +%else + mova m7, [o(pd_11586)] + pmulld m0, m7, [cq+ 0*64+r5] + pmulld m1, m7, [cq+ 1*64+r5] + pmulld m2, m7, [cq+ 2*64+r5] + pmulld m3, m7, [cq+ 3*64+r5] + pmulld m4, m7, [cq+ 4*64+r5] + pmulld m5, m7, [cq+ 5*64+r5] + pmulld m6, m7, [cq+ 6*64+r5] + pmulld m7, [cq+ 7*64+r5] + mova [r3], m7 + mova m7, [o(pd_10240)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [r3] + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + mova [r3+8*16], m0 + mova [r3+9*16], m2 + mova [r3+10*16], m4 + mova [r3+11*16], m6 + mova m7, [o(pd_11586)] + pmulld m0, m7, [cq+ 8*64+r5] + pmulld m1, m7, [cq+ 9*64+r5] + pmulld m2, m7, [cq+10*64+r5] + pmulld m3, m7, [cq+11*64+r5] + pmulld m4, m7, [cq+12*64+r5] + pmulld m5, m7, [cq+13*64+r5] + pmulld m6, m7, [cq+14*64+r5] + pmulld m7, [cq+15*64+r5] + mova [r3], m7 + mova m7, [o(pd_10240)] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6 + paddd m7, [r3] + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 +%endif + ret + +.pass2: +%if ARCH_X86_64 + mova m4, [o(pw_2048)] + mova m5, [o(pixel_10bpc_max)] + pxor m6, m6 + mova m7, [o(pw_1697x16)] + mov r7, dstq +%else + mov [rsp+2*gprsize+16*16], dstq +%endif + mov r5d, 4 + lea r3, [strideq*3] +.pass2_loop: + mova m0, [cq+0*64+0] + mova m1, [cq+1*64+0] + mova m2, [cq+2*64+0] + mova 
m3, [cq+3*64+0] + call m(iidentity_8x16_internal_16bpc).main +%if ARCH_X86_64 + call m(idct_8x4_internal_16bpc).round1_and_write_8x4 +%else + call m(idct_8x4_internal_16bpc).round2_and_write_8x4 +%endif + REPX {mova [cq+x*16], m6}, 0, 4, 8, 12 + add cq, 16 + lea dstq, [dstq+strideq*4] + dec r5w + jg .pass2_loop + add cq, 64*3 + btc r5d, 16 + jc .end +%if ARCH_X86_64 + lea dstq, [r7+16] +%else + mov dstq, [rsp+2*gprsize+16*16] + add dstq, 16 +%endif + add r5d, 4 + jmp .pass2_loop +.end: +%if WIN64 + mov r7, [rsp+16*16+gprsize] +%endif + RET + +cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob +%if ARCH_X86_32 + LEA r6, $$ +%endif + mova m5, [o(pw_5)] + mova m7, [o(pixel_10bpc_max)] + pxor m6, m6 + mov r5d, eobd + add eobb, 21 + cmovc eobd, r5d ; 43, 107, 171 -> 64, 128, 192 + lea r4, [strideq*3] +.loop: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {paddsw x, m5}, m0, m1, m2, m3 + REPX {psraw x, 3 }, m0, m1, m2, m3 + call .main_zero + add cq, 16 + lea dstq, [dstq+strideq*4] + btc eobd, 16 + jnc .loop + sub eobd, 64 + jge .loop + RET +ALIGN function_align +.main_zero: + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 +.main: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpckhwd m4, m2, m1 + punpcklwd m2, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + paddw m0, [dstq+strideq*0] + paddw m1, [dstq+strideq*1] + paddw m2, [dstq+strideq*2] + paddw m3, [dstq+r4 ] + REPX {pmaxsw x, m6}, m0, m1, m2, m3 + REPX {pminsw x, m7}, m0, m1, m2, m3 + mova [dstq+strideq*0], m0 + mova [dstq+strideq*1], m1 + mova [dstq+strideq*2], m2 + mova [dstq+r4 ], m3 + ret + +cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob +%if ARCH_X86_32 + LEA r6, $$ +%endif + mova m5, [o(pw_4096)] + mova m7, [o(pixel_10bpc_max)] + pxor m6, m6 + mov r4d, eobd + add eobb, 21 + cmovc eobd, r4d + lea r4, [strideq*3] + mov r5, dstq +.loop: + mova m0, [cq+32*0] + packssdw m0, [cq+32*1] + mova m1, [cq+32*2] + packssdw m1, [cq+32*3] + mova m2, [cq+32*4] + packssdw m2, [cq+32*5] + mova m3, [cq+32*6] + packssdw m3, [cq+32*7] + REPX {mova [cq+32*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + call m(inv_txfm_add_identity_identity_8x32_16bpc).main + lea dstq, [dstq+strideq*4] + add cq, 16 + btc eobd, 16 + jnc .loop + add cq, 32*8-32 + add r5, 16 + mov dstq, r5 + sub eobd, 64 + jge .loop + RET + +cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob +%if ARCH_X86_32 + LEA r6, $$ +%else + mova m8, [o(pw_2896x8)] + mova m9, [o(pw_1697x16)] + mova m11, [o(pw_8192)] +%endif + mova m7, [o(pixel_10bpc_max)] + lea r4, [strideq*3] + pxor m6, m6 +%if ARCH_X86_64 + paddw m10, m11, m11 ; pw_16384 +%endif + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + add cq, 128*8-32 + lea dstq, [r5+16] + call .main + sub cq, 128*8 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 107 ; eob < 143 + jl .ret + add cq, 128*8-32 + lea dstq, [r5+16] + call .main + sub cq, 128*8 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 271 + jl .ret + add cq, 128*8-32 + lea dstq, [r5+16] + call .main + sub cq, 128*8 + lea dstq, [r5+strideq*8] + mov r5, dstq + call .main + sub eobd, 128 ; eob < 399 + jl .ret + add cq, 128*8-32 + lea dstq, 
[r5+16] + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] +%if ARCH_X86_64 + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 + pmulhrsw m4, m9, m0 + pmulhrsw m5, m9, m1 + REPX {pmulhrsw x, m10}, m4, m5 +%else + mova m6, [o(pw_2896x8)] + REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 + mova m5, [o(pw_1697x16)] + pmulhrsw m4, m5, m0 + pmulhrsw m5, m1 + mova m6, [o(pw_16384)] + REPX {pmulhrsw x, m6 }, m4, m5 +%endif + paddsw m0, m4 + paddsw m1, m5 +%if ARCH_X86_64 + pmulhrsw m4, m9, m2 + pmulhrsw m5, m9, m3 + REPX {pmulhrsw x, m10}, m4, m5 +%else + mova m5, [o(pw_1697x16)] + pmulhrsw m4, m5, m2 + pmulhrsw m5, m3 + REPX {pmulhrsw x, m6 }, m4, m5 +%endif + paddsw m2, m4 + paddsw m3, m5 +%if ARCH_X86_64 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 +%else + psrlw m6, 1 ; pw_8192 + REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 + pxor m6, m6 +%endif + call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero + lea dstq, [dstq+strideq*4] + add cq, 16 + btc eobd, 16 + jnc .main + ret + +cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob +%if ARCH_X86_32 + LEA r6, $$ +%else + mova m8, [o(pw_2896x8)] + mova m9, [o(pw_1697x16)] + mova m10, [o(pw_2048)] +%endif + mova m7, [o(pixel_10bpc_max)] + lea r4, [strideq*3] + pxor m6, m6 + mov r5, dstq + call .main + sub eobd, 36 + jl .ret + call .main + add cq, 64*8-64 + lea dstq, [r5+16*1] + call .main + sub eobd, 107 ; eob < 143 + jl .ret + call .main + add cq, 64*8-64 + lea dstq, [r5+16*2] + call .main + sub eobd, 128 ; eob < 271 + jl .ret + call .main + add cq, 64*8-64 + lea dstq, [r5+16*3] + call .main + sub eobd, 128 ; eob < 399 + jl .ret + call .main +.ret: + RET +ALIGN function_align +.main: + mova m0, [cq+64*0] + packssdw m0, [cq+64*1] + mova m1, [cq+64*2] + packssdw m1, [cq+64*3] + mova m2, [cq+64*4] + packssdw m2, [cq+64*5] + mova m3, [cq+64*6] + packssdw m3, [cq+64*7] +%if ARCH_X86_64 + REPX {pmulhrsw x, m8 }, m0, m1, m2, m3 +%else + mova m6, [o(pw_2896x8)] + REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 +%endif + REPX {paddsw x, x }, m0, m1, m2, m3 +%if ARCH_X86_64 + pmulhrsw m4, m9, m0 + pmulhrsw m5, m9, m1 +%else + mova m6, [o(pw_1697x16)] + pmulhrsw m4, m6, m0 + pmulhrsw m5, m6, m1 +%endif + REPX {paddsw x, x }, m0, m1 + paddsw m0, m4 + paddsw m1, m5 +%if ARCH_X86_64 + pmulhrsw m4, m9, m2 + pmulhrsw m5, m9, m3 +%else + pmulhrsw m4, m6, m2 + pmulhrsw m6, m3 +%endif + REPX {paddsw x, x }, m2, m3 + paddsw m2, m4 +%if ARCH_X86_64 + paddsw m3, m5 + REPX {pmulhrsw x, m10}, m0, m1, m2, m3 +%else + paddsw m3, m6 + mova m6, [o(pw_2048)] + REPX {pmulhrsw x, m6 }, m0, m1, m2, m3 + pxor m6, m6 +%endif + REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + call m(inv_txfm_add_identity_identity_8x32_16bpc).main + lea dstq, [dstq+strideq*4] + add cq, 16 + btc eobd, 16 + jnc .main + ret + +cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob +%undef cmp +%if ARCH_X86_32 + LEA r6, $$ +%endif + mova m5, [o(pw_8192)] + mova m7, [o(pixel_10bpc_max)] + pxor m6, m6 + lea r4, [strideq*3] + mov r5, dstq + call .main ; 0 + cmp eobd, 36 + jl .ret + add cq, 128*8-32 ; 0 1 + lea dstq, [r5+16] ; 1 + call .main + call .main2 + cmp eobd, 136 + jl .ret + add cq, 128*16-64 ; 0 1 2 + lea dstq, [r5+16*2] ; 1 2 + call .main ; 2 + call .main2 + call .main2 + cmp eobd, 300 + jl .ret + add cq, 128*24-96 ; 0 1 2 3 + add r5, 16*3 ; 1 2 3 + mov dstq, r5 ; 2 3 + call 
+
+cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 7, 8, dst, stride, c, eob
+%undef cmp
+%if ARCH_X86_32
+    LEA r6, $$
+%endif
+    mova m5, [o(pw_8192)]
+    mova m7, [o(pixel_10bpc_max)]
+    pxor m6, m6
+    lea r4, [strideq*3]
+    mov r5, dstq
+    call .main                ; 0
+    cmp eobd, 36
+    jl .ret
+    add cq, 128*8-32          ; 0 1
+    lea dstq, [r5+16]         ;   1
+    call .main
+    call .main2
+    cmp eobd, 136
+    jl .ret
+    add cq, 128*16-64         ; 0 1 2
+    lea dstq, [r5+16*2]       ;   1 2
+    call .main                ;     2
+    call .main2
+    call .main2
+    cmp eobd, 300
+    jl .ret
+    add cq, 128*24-96         ; 0 1 2 3
+    add r5, 16*3              ;   1 2 3
+    mov dstq, r5              ;     2 3
+    call .main                ;       3
+    call .main2
+    call .main2
+    call .main2
+    cmp eobd, 535
+    jl .ret
+    add cq, 128*24-96         ; 0 1 2 3
+    lea dstq, [r5+strideq*8]  ;   1 2 3 4
+    mov r5, dstq              ;     2 3 4
+    call .main                ;       3 4
+    call .main2
+    call .main2
+    cmp eobd, 755
+    jl .ret
+    add cq, 128*16-64         ; 0 1 2 3
+    lea dstq, [r5+strideq*8]  ;   1 2 3 4
+    mov r5, dstq              ;     2 3 4 5
+    call .main                ;       3 4 5
+    call .main2
+    cmp eobd, 911
+    jl .ret
+    add cq, 128*8-32          ; 0 1 2 3
+    lea dstq, [r5+strideq*8]  ;   1 2 3 4
+    call .main                ;     2 3 4 5
+.ret:                         ;       3 4 5 6
+    RET
+ALIGN function_align
+.main2:
+    sub cq, 128*8
+    sub dstq, 16
+.main:
+    mova m0, [cq+128*0]
+    packssdw m0, [cq+128*1]
+    mova m1, [cq+128*2]
+    packssdw m1, [cq+128*3]
+    mova m2, [cq+128*4]
+    packssdw m2, [cq+128*5]
+    mova m3, [cq+128*6]
+    packssdw m3, [cq+128*7]
+    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+    call m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
+    lea dstq, [dstq+strideq*4]
+    add cq, 16
+    btc eobd, 16
+    jnc .main
+    ret
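identity_identity_32x32 visits the coefficient plane in 8x8 strips and compares eobd against a ladder of thresholds (36, 136, 300, 535, 755, 911; the comment columns above track which strips each group touches) to stop once the remaining strips are known to be all zero. A rough C sketch of that control flow, with process_batch standing in for the .main/.main2 call groups:

    /* Sketch of the early-exit structure: after each batch of strips,
     * compare eob against the next threshold and stop once the rest
     * of the coefficients are guaranteed zero. */
    static const int thresholds[6] = { 36, 136, 300, 535, 755, 911 };

    void identity_32x32_pass(int eob, void (*process_batch)(int i))
    {
        process_batch(0);
        for (int i = 0; i < 6; i++) {
            if (eob < thresholds[i])
                return;
            process_batch(i + 1);
        }
    }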
+
+cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
+        dst, stride, c, eob
+%if ARCH_X86_32
+    LEA r6, $$
+%define base $$
+    DECLARE_REG_TMP 0, 4
+%else
+    lea r6, [tbl_Nx32_odd_offset]
+%define base tbl_Nx32_odd_offset
+    DECLARE_REG_TMP 4, 7
+%if WIN64
+    mov [rsp+gprsize*1+35*16], r7
+%endif
+%endif
+%define o2(x) r6-base+x
+    test eobd, eobd
+    jz .dconly
+
+%if ARCH_X86_32
+    mov [rsp+gprsize*1+35*16], r0
+%endif
+%undef cmp
+    ; remove entirely-zero iterations
+    mov r5d, 7*2
+    cmp eobw, word [o2(tbl_8x32_2d)+r5]
+    jge .end_zero_loop
+    pxor m0, m0
+.zero_loop:
+    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx t1d, t0b
+    shr t0d, 8
+    mova [rsp+ 3*16+r5*8], m0
+    mova [rsp+11*16+r5*8], m0
+    mova [rsp+ 3*16+t0*8], m0
+    mova [rsp+ 3*16+t1*8], m0
+    sub r5d, 2
+    cmp eobw, word [o2(tbl_8x32_2d)+r5]
+    jl .zero_loop
+.end_zero_loop:
+    ; actual first pass after skipping all-zero data
+    mov [rsp+gprsize*0+35*16], eobd
+    mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+0*128+r5*8]
+    mova m1, [cq+1*128+r5*8]
+    mova m2, [cq+2*128+r5*8]
+    mova m3, [cq+3*128+r5*8]
+    mova m4, [cq+4*128+r5*8]
+    mova m5, [cq+5*128+r5*8]
+    mova m6, [cq+6*128+r5*8]
+    mova m7, [cq+7*128+r5*8]
+    call m(idct_8x4_internal_16bpc).main_pass1
+    mova m1, [o(pd_2)]
+    REPX {paddd x, m1}, m0, m6, m5, m3
+    call m(idct_8x4_internal_16bpc).round
+    REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+
+    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx t1d, t0b
+    shr t0d, 8
+    mova [r3+ 3*16+r5*8], m0
+    mova [r3+11*16+r5*8], m2
+    mova [r3+ 3*16+t1*8], m1
+    mova [r3+ 3*16+t0*8], m3
+    pxor m7, m7
+    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+    sub r5d, 2
+    jge .loop_pass1
+
+    ; pass 2 code starts here
+    ; m0 is already loaded from last iteration of first pass
+%if ARCH_X86_32
+    mov r0, [rsp+gprsize*1+35*16]
+%endif
+    mov eobd, [rsp+gprsize*0+35*16]
+    cmp eobd, 43
+    jl .load_veryfast
+    cmp eobd, 107
+    jl .load_fast
+    ; load normal
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+    jmp .run
+.load_fast:
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+    jmp .run
+.load_veryfast:
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+    ; fall-through
+.run:
+    call .pass2
+%if WIN64
+    mov r7, [rsp+gprsize*1+35*16]
+%endif
+    RET
+
+.pass2:
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    mova m1, [rsp+gprsize+16* 4]
+    mova m2, [rsp+gprsize+16* 5]
+    mova m3, [rsp+gprsize+16* 6]
+    mova m4, [rsp+gprsize+16* 7]
+    mova m5, [rsp+gprsize+16* 8]
+    mova m6, [rsp+gprsize+16* 9]
+    mova m7, [rsp+gprsize+16*10]
+    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+    mova [rsp+gprsize+ 3*16], m0
+    mova [rsp+gprsize+ 4*16], m1
+    mova [rsp+gprsize+ 5*16], m2
+    mova [rsp+gprsize+ 6*16], m3
+    mova [rsp+gprsize+ 7*16], m4
+    mova [rsp+gprsize+ 8*16], m5
+    mova [rsp+gprsize+ 9*16], m6
+    mova m0, [rsp+gprsize+11*16]
+    mova m1, [rsp+gprsize+12*16]
+    mova m2, [rsp+gprsize+13*16]
+    mova m3, [rsp+gprsize+14*16]
+    mova m4, [rsp+gprsize+15*16]
+    mova m5, [rsp+gprsize+16*16]
+    mova m6, [rsp+gprsize+17*16]
+    mova m7, [rsp+gprsize+18*16]
+    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+    mova m7, [rsp+gprsize+ 0*16]
+    mova [rsp+gprsize+11*16], m0
+    mova [rsp+gprsize+12*16], m1
+    mova [rsp+gprsize+13*16], m2
+    mova [rsp+gprsize+14*16], m3
+    mova [rsp+gprsize+15*16], m4
+    mova [rsp+gprsize+16*16], m5
+    mova [rsp+gprsize+17*16], m6
+    mova [rsp+gprsize+18*16], m7
+    call r4
+%if ARCH_X86_64
+    mova m8, [o(pw_2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+%endif
+    lea r3, [strideq*3]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+    lea dstq, [dstq+strideq*8]
+    mova m0, [rsp+gprsize+11*16]
+    mova m1, [rsp+gprsize+12*16]
+    mova m2, [rsp+gprsize+13*16]
+    mova m3, [rsp+gprsize+14*16]
+    mova m4, [rsp+gprsize+15*16]
+    mova m5, [rsp+gprsize+16*16]
+    mova m6, [rsp+gprsize+17*16]
+    mova m7, [rsp+gprsize+18*16]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+    lea dstq, [dstq+strideq*8]
+    mova m0, [rsp+gprsize+19*16]
+    mova m1, [rsp+gprsize+20*16]
+    mova m2, [rsp+gprsize+21*16]
+    mova m3, [rsp+gprsize+22*16]
+    mova m4, [rsp+gprsize+23*16]
+    mova m5, [rsp+gprsize+24*16]
+    mova m6, [rsp+gprsize+25*16]
+    mova m7, [rsp+gprsize+26*16]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+    lea dstq, [dstq+strideq*8]
+    mova m0, [rsp+gprsize+27*16]
+    mova m1, [rsp+gprsize+28*16]
+    mova m2, [rsp+gprsize+29*16]
+    mova m3, [rsp+gprsize+30*16]
+    mova m4, [rsp+gprsize+31*16]
+    mova m5, [rsp+gprsize+32*16]
+    mova m6, [rsp+gprsize+33*16]
+    mova m7, [rsp+gprsize+34*16]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+    ret
+.dconly:
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 8
+    add r5d, 640
+    sar r5d, 10
+    add rsp, (31+2*ARCH_X86_64)*16
+    jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
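.dconly implements the eob == 0 case: only the DC coefficient is nonzero, so the whole inverse transform collapses to adding one scaled constant to every pixel. A hedged C model of the arithmetic used by the .dconly/.dconly1/.dconly2 chains in this file (181/256 and 2896/4096 both approximate 1/sqrt(2); the final rounding is folded into a high-word extraction, and the function name is mine):

    #include <stdint.h>
    #include <stddef.h>

    static inline uint16_t clip10(int v)
    {
        return (uint16_t)(v < 0 ? 0 : v > 1023 ? 1023 : v);
    }

    /* DC-only add: scale dc by ~1/sqrt(2) twice, then splat-add. */
    static void dconly_add(uint16_t *dst, ptrdiff_t stride,
                           int dc, int w, int h)
    {
        dc = (dc * 181 + 640) >> 10;        /* pass-1 scale + shift */
        dc = (dc * 2896 + 0x8800) >> 16;    /* pass-2 scale + round */
        for (int y = 0; y < h; y++, dst += stride)
            for (int x = 0; x < w; x++)
                dst[x] = clip10(dst[x] + dc);
    }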
+
+cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
+        dst, stride, c, eob
+    LEA r6, base
+    test eobd, eobd
+    jz .dconly
+
+%if ARCH_X86_32
+    mov [rsp+gprsize*1+76*16], r0
+%elif WIN64
+    mov [rsp+gprsize*1+76*16], r7
+%endif
+%undef cmp
+    ; remove entirely-zero iterations
+    mov r5d, 7*2
+    cmp eobw, word [o2(tbl_16x32_2d)+r5]
+    jge .end_zero_loop
+    pxor m0, m0
+.zero_loop:
+    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx t1d, t0b
+    shr t0d, 8
+    mova [rsp+12*16+r5*8], m0
+    mova [rsp+20*16+r5*8], m0
+    mova [rsp+12*16+t0*8], m0
+    mova [rsp+12*16+t1*8], m0
+    mova [rsp+44*16+r5*8], m0
+    mova [rsp+52*16+r5*8], m0
+    mova [rsp+44*16+t0*8], m0
+    mova [rsp+44*16+t1*8], m0
+    sub r5d, 2
+    cmp eobw, word [o2(tbl_16x32_2d)+r5]
+    jl .zero_loop
+.end_zero_loop:
+    ; actual first pass after skipping all-zero data
+    mov [rsp+gprsize*0+76*16], eobd
+    mov r3, rsp
+.loop_pass1:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 1*128+r5*8]
+    mova m1, [cq+ 3*128+r5*8]
+    mova m2, [cq+ 5*128+r5*8]
+    mova m3, [cq+ 7*128+r5*8]
+    mova m4, [cq+ 9*128+r5*8]
+    mova m5, [cq+11*128+r5*8]
+    mova m6, [cq+13*128+r5*8]
+    mova m7, [cq+15*128+r5*8]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+
+    mova m0, [cq+ 0*128+r5*8]
+    mova m1, [cq+ 2*128+r5*8]
+    mova m2, [cq+ 4*128+r5*8]
+    mova m3, [cq+ 6*128+r5*8]
+    mova m4, [cq+ 8*128+r5*8]
+    mova m5, [cq+10*128+r5*8]
+    mova m6, [cq+12*128+r5*8]
+    mova m7, [cq+14*128+r5*8]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    call m(idct_16x4_internal_16bpc).round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx t1d, t0b
+    shr t0d, 8
+%if ARCH_X86_64
+    mova [rsp+12*16+r5*8], m0
+    mova [rsp+20*16+r5*8], m2
+    mova [rsp+12*16+t1*8], m1
+    mova [rsp+12*16+t0*8], m3
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [rsp+44*16+r5*8], m8
+    mova [rsp+52*16+r5*8], m10
+    mova [rsp+44*16+t1*8], m9
+    mova [rsp+44*16+t0*8], m11
+%else
+    mova [rsp+44*16+r5*8], m0
+    mova [rsp+52*16+r5*8], m2
+    mova [rsp+44*16+t1*8], m1
+    mova [rsp+44*16+t0*8], m3
+    mova m0, [r3+ 8*16]
+    mova m2, [r3+ 9*16]
+    mova m4, [r3+10*16]
+    mova m6, [r3+11*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+12*16+r5*8], m0
+    mova [rsp+20*16+r5*8], m2
+    mova [rsp+12*16+t1*8], m1
+    mova [rsp+12*16+t0*8], m3
+%endif
+    pxor m7, m7
+    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    sub r5d, 2
+    jge .loop_pass1
+
+    ; pass=2
+    add rsp, 9*16
+%if ARCH_X86_64
+    mov r6, dstq
+%else
+    mov dstq, [rsp+gprsize*1+67*16]
+%endif
+    mov eobd, [rsp+gprsize*0+67*16]
+    cmp eobd, 44
+    jl .load_veryfast
+    cmp eobd, 151
+    jl .load_fast
+    ; load normal
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+    jmp .run
+.load_fast:
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+    jmp .run
+.load_veryfast:
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+    ; fall-through
+.run:
+%if ARCH_X86_64
+    lea r2, [dstq+32]
+    mov r7, -4
+%else
+    lea r2, [rsp+67*16]
+    mov dword [r2+0*gprsize], 2
+%endif
+    jmp .loop_pass2_entry
+.loop_pass2:
+    mova m0, [rsp+16* 3]
+.loop_pass2_entry:
+%if ARCH_X86_32
+    mov dstq, [r2+1*gprsize]
+%endif
+    call m(inv_txfm_add_dct_dct_8x32_16bpc).pass2
+    add rsp, 32*16
+%if ARCH_X86_64
+    add r7, 2
+    lea dstq, [r2+r7*8]
+    jl .loop_pass2
+%if WIN64
+    mov r7, [rsp+gprsize*1+3*16]
+%endif
+%else
+    add dword [r2+1*gprsize], 16
+    dec dword [r2+0*gprsize]
+    jg .loop_pass2
+%endif
+%assign stack_size (stack_size-73*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-73*16)
+%assign stack_offset (stack_offset-73*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+    RET
+.dconly:
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 32
+    add r5d, 128
+    sar r5d, 8
+    imul r5d, 181
+    add rsp, (65+4*ARCH_X86_64)*16
+    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
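dct_dct_16x32 is a 2:1 rectangular transform, so pass 1 prescales every coefficient by 1/sqrt(2) via the rect2_mul calls (pd_2896 with a pd_2048 bias and a 12-bit shift). The scalar equivalent, as a sketch:

    /* Rectangular-transform prescale: x * 2896/4096 ~= x / sqrt(2),
     * rounded to nearest, matching the rect2_mul helper used above. */
    static inline int rect2_scale(int x)
    {
        return (x * 2896 + 2048) >> 12;
    }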
+
+cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+        dst, stride, c, eob
+%if ARCH_X86_32
+    LEA r6, $$
+%endif
+    test eobd, eobd
+    jz .dconly
+
+    ; remove entirely-zero iterations
+%undef cmp
+%if ARCH_X86_64
+    xor r5d, r5d
+    cmp eobd, 10
+    setge r5b
+%else
+    mov r5d, 1
+    cmp eobd, 10
+    sbb r5d, 0
+%endif
+    add r5d, r5d
+
+    ; actual first pass after skipping all-zero data
+.loop_pass1:
+    mova m0, [cq+32* 1+r5*8]
+    mova m1, [cq+32* 7+r5*8]
+    mova m2, [cq+32* 9+r5*8]
+    mova m3, [cq+32*15+r5*8]
+    mova m4, [cq+32*17+r5*8]
+    mova m5, [cq+32*23+r5*8]
+    mova m6, [cq+32*25+r5*8]
+    mova m7, [cq+32*31+r5*8]
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mov r3, rsp
+    call .main_oddhalf_part1
+    mova m0, [cq+32* 3+r5*8]
+    mova m1, [cq+32* 5+r5*8]
+    mova m2, [cq+32*11+r5*8]
+    mova m3, [cq+32*13+r5*8]
+    mova m4, [cq+32*19+r5*8]
+    mova m5, [cq+32*21+r5*8]
+    mova m6, [cq+32*27+r5*8]
+    mova m7, [cq+32*29+r5*8]
+    call .main_oddhalf_part2
+    mova m0, [cq+32* 2+r5*8]
+    mova m1, [cq+32* 6+r5*8]
+    mova m2, [cq+32*10+r5*8]
+    mova m3, [cq+32*14+r5*8]
+    mova m4, [cq+32*18+r5*8]
+    mova m5, [cq+32*22+r5*8]
+    mova m6, [cq+32*26+r5*8]
+    mova m7, [cq+32*30+r5*8]
+    add r3, 16*(16+4*ARCH_X86_32)
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+    mova m0, [cq+32* 0+r5*8]
+    mova m1, [cq+32* 4+r5*8]
+    mova m2, [cq+32* 8+r5*8]
+    mova m3, [cq+32*12+r5*8]
+    mova m4, [cq+32*16+r5*8]
+    mova m5, [cq+32*20+r5*8]
+    mova m6, [cq+32*24+r5*8]
+    mova m7, [cq+32*28+r5*8]
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    sub r3, 16*(16+4*ARCH_X86_32)
+    call .round_dct32
+%if ARCH_X86_64
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [cq+32* 8+r5*8], m8
+    mova [cq+32* 9+r5*8], m9
+    mova [cq+32*10+r5*8], m10
+    mova [cq+32*11+r5*8], m11
+    mova m8, [r3+16* 9]  ;  8  9
+    mova m10, [r3+16*11] ; 10 11
+    mova m12, [r3+16*13] ; 12 13
+    mova m14, [r3+16*15] ; 14 15
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [cq+32* 4+r5*8], m8
+    mova [cq+32* 5+r5*8], m9
+    mova [cq+32* 6+r5*8], m10
+    mova [cq+32* 7+r5*8], m11
+    mova m8, [r3+16* 8]  ; 24 25
+    mova m10, [r3+16*10] ; 26 27
+    mova m12, [r3+16*12] ; 28 29
+    mova m14, [r3+16*14] ; 30 31
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [cq+32*12+r5*8], m8
+    mova [cq+32*13+r5*8], m9
+    mova [cq+32*14+r5*8], m10
+    mova [cq+32*15+r5*8], m11
+%else
+    sub r3, 8*16
+    mova m0, [r3+ 8*16]
+    mova m2, [r3+10*16]
+    mova m4, [r3+12*16]
+    mova m6, [r3+14*16]
+    packssdw m0, [r3+ 9*16]
+    packssdw m2, [r3+11*16]
+    packssdw m4, [r3+13*16]
+    packssdw m6, [r3+15*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+32* 4+r5*8], m0
+    mova [cq+32* 5+r5*8], m1
+    mova [cq+32* 6+r5*8], m2
+    mova [cq+32* 7+r5*8], m3
+    mova m0, [r3+16*16]
+    mova m2, [r3+18*16]
+    mova m4, [r3+20*16]
+    mova m6, [r3+22*16]
+    packssdw m0, [r3+17*16]
+    packssdw m2, [r3+19*16]
+    packssdw m4, [r3+21*16]
+    packssdw m6, [r3+23*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+32* 8+r5*8], m0
+    mova [cq+32* 9+r5*8], m1
+    mova [cq+32*10+r5*8], m2
+    mova [cq+32*11+r5*8], m3
+    mova m0, [r3+31*16]
+    mova m2, [r3+29*16]
+    mova m4, [r3+27*16]
+    mova m6, [r3+25*16]
+    packssdw m0, [r3+30*16]
+    packssdw m2, [r3+28*16]
+    packssdw m4, [r3+26*16]
+    packssdw m6, [r3+24*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+32*12+r5*8], m0
+    mova [cq+32*13+r5*8], m1
+    mova [cq+32*14+r5*8], m2
+    mova [cq+32*15+r5*8], m3
+    mova m0, [r3+ 0*16]
+    mova m2, [r3+ 2*16]
+    mova m4, [r3+ 4*16]
+    mova m6, [r3+ 6*16]
+    packssdw m0, [r3+ 1*16]
+    packssdw m2, [r3+ 3*16]
+    packssdw m4, [r3+ 5*16]
+    packssdw m6, [r3+ 7*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+    pxor m7, m7
+    ; clear lower half of [cq]
+    REPX {mova [cq+x*32+r5*8], m7}, 16, 17, 18, 19, 20, 21, 22, 23, \
+                                    24, 25, 26, 27, 28, 29, 30, 31
+    test r5d, r5d
+    jz .end_pass1
+    mova [cq+32* 0+r5*8], m0
+    mova [cq+32* 1+r5*8], m1
+    mova [cq+32* 2+r5*8], m2
+    mova [cq+32* 3+r5*8], m3
+    sub r5d, 2
+    jmp .loop_pass1
+.end_pass1:
+
+    ; pass=2, we need to call this otherwise the stack pointer has
+    ; the wrong offset in the 8-bit code
+    mov r4d, 4
+    call m(idct_16x8_internal_16bpc).pass2_main
+    RET
+
+.main_oddhalf_part1_fast: ; lower half zero
+    pmulld m7, m0, [o(pd_4091)]
+    pmulld m0, [o(pd_201)]
+    pmulld m4, m3, [o(pd_m2751)]
+%if ARCH_X86_32
+    pmulld m3, [o(pd_3035)]
+    mova m5, [o(pd_2048)]
+    REPX {paddd x, m5}, m0, m7
+    REPX {psrad x, 12}, m0, m7
+    mova [r3+3*16], m7
+    mova m7, m3
+    mova m3, m5
+%else
+    pmulld m3, [o(pd_3035)]
+%endif
+    pmulld m6, m1, [o(pd_m1380)]
+    pmulld m1, [o(pd_3857)]
+    pmulld m5, m2, [o(pd_3703)]
+    pmulld m2, [o(pd_1751)]
+    jmp .main_oddhalf_part1_fast2
+.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
+%if ARCH_X86_64
+    ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
+    ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
+    ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
+    ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
+.main_oddhalf_part1_fast2:
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+    psubd m8, m0, m4 ; t17
+    paddd m0, m4     ; t16
+    psubd m4, m6, m2 ; t18
+    paddd m6, m2     ; t19
+    psubd m2, m1, m5 ; t29
+    paddd m1, m5     ; t28
+    psubd m5, m7, m3 ; t30
+    paddd m7, m3     ; t31
+    REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+    REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+    mova m15, [o(pd_4017)]
+    mova m10, [o(pd_799)]
+    ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15    ; t17a, t30a
+    ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
+    psubd m3, m0, m6 ; t19a
+    paddd m0, m6     ; t16a
+    psubd m6, m7, m1 ; t28a
+    paddd m7, m1     ; t31a
+    psubd m1, m5, m4 ; t18
+    paddd m5, m4     ; t17
+    psubd m4, m8, m2 ; t29
+    paddd m8, m2     ; t30
+    REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+    REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+    mova m15, [o(pd_3784)]
+    mova m10, [o(pd_1567)]
+    ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
+    ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
+    mova [r3+16*0], m0
+    mova [r3+16*1], m5
+    mova [r3+16*2], m4
+    mova [r3+16*3], m6
+    mova [r3+16*4], m3
+    mova [r3+16*5], m1
+    mova [r3+16*6], m8
+    mova [r3+16*7], m7
+%else
+    mova [r3+0*16], m2
+    mova [r3+1*16], m3
+    mova [r3+2*16], m4
+    mova [r3+3*16], m5
+    mova m3, [o(pd_2048)]
+    ITX_MULSUB_2D 0, 7, 2, 4, 5, 3, 201, 4091 ; t16a, t31a
+    ITX_MULSUB_2D 6, 1, 2, 4, 5, _, 3857, 1380 ; t19a, t28a
+    mova m4, [r3+2*16]
+    mova m5, [r3+3*16]
+    mova [r3+2*16], m6
+    mova [r3+3*16], m7
+    mova m2, [r3+0*16]
+    mova m7, [r3+1*16]
+    mova [r3+0*16], m0
+    mova [r3+1*16], m1
+    ITX_MULSUB_2D 2, 5, 0, 1, 6, _, 1751, 3703 ; t18a, t29a
+    ITX_MULSUB_2D 4, 7, 0, 1, 6, _, 3035, 2751 ; t17a, t30a
+    mova m0, [r3+0*16]
+    mova m1, [r3+1*16]
+    mova m6, [r3+2*16]
+.main_oddhalf_part1_fast2:
+    REPX {paddd x, m3}, m1, m2, m4, m5, m6, m7
+    REPX {psrad x, 12}, m1, m2, m4, m5, m6, m7
+    psubd m3, m0, m4 ; t17
+    mova [r3+0*16], m3
+    mova m3, [r3+3*16]
+    paddd m0, m4     ; t16
+    psubd m4, m6, m2 ; t18
+    paddd m6, m2     ; t19
+    psubd m2, m1, m5 ; t29
+    paddd m1, m5     ; t28
+    psubd m5, m3, m7 ; t30
+    paddd m7, m3     ; t31
+    mova m3, [o(clip_18b_min)]
+    REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+    pmaxsd m3, [r3+0*16]
+    mova [r3+0*16], m3
+    mova m3, [o(clip_18b_max)]
+    REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+    pminsd m3, [r3+0*16]
+    mova [r3+0*16], m0
+    mova [r3+1*16], m1
+    mova [r3+2*16], m6
+    mova [r3+3*16], m7
+    mova m0, [o(pd_2048)]
+    ITX_MULSUB_2D 5, 3, 1, 6, 7, 0, 799, 4017    ; t17a, t30a
+    ITX_MULSUB_2D 2, 4, 1, 6, _, 0, 7, 4017, 4   ; t29a, t18a
+    psubd m1, m5, m4 ; t18
+    paddd m5, m4     ; t17
+    psubd m4, m3, m2 ; t29
+    paddd m3, m2     ; t30
+    mova m0, [r3+0*16]
+    mova m2, [r3+1*16]
+    mova m6, [r3+2*16]
+    mova m7, [r3+3*16]
+    mova [r3+0*16], m3
+    psubd m3, m0, m6 ; t19a
+    paddd m0, m6     ; t16a
+    psubd m6, m7, m2 ; t28a
+    paddd m7, m2     ; t31a
+    mova m2, [o(clip_18b_min)]
+    REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+    pmaxsd m2, [r3+0*16]
+    mova [r3+0*16], m2
+    mova m2, [o(clip_18b_max)]
+    REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+    pminsd m2, [r3+0*16]
+    mova [r3+16*0], m0
+    mova [r3+16*1], m5
+    mova [r3+16*6], m2
+    mova [r3+16*7], m7
+    mova m7, [o(pd_2048)]
+    ITX_MULSUB_2D 4, 1, 0, 5, 2, 7, 1567, 3784 ; t18a, t29a
+    ITX_MULSUB_2D 6, 3, 0, 5, 2, 7, 2, 3784    ; t19, t28
+    mova [r3+16*2], m4
+    mova [r3+16*3], m6
+    mova [r3+16*4], m3
+    mova [r3+16*5], m1
+%endif
+    ret
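Each ITX_MULSUB_2D above is a fixed-point rotation: a (sin, cos) pair in Q12 (e.g. 799/4017 ~= sin/cos of pi/16 scaled by 4096) applied with a 2048 rounding bias and a 12-bit shift. A scalar model, using 64-bit intermediates for safety (helper name mine):

    #include <stdint.h>

    /* Rotate the pair (a, b) by an angle with Q12 sin s and cos c. */
    static inline void rotate_q12(int32_t *a, int32_t *b, int s, int c)
    {
        const int64_t a0 = *a, b0 = *b;
        *a = (int32_t)((a0 * c - b0 * s + 2048) >> 12);
        *b = (int32_t)((b0 * c + a0 * s + 2048) >> 12);
    }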
+.main_oddhalf_part2_fast: ; lower half zero
+    pmulld m7, m0, [o(pd_m601)]
+    pmulld m0, [o(pd_4052)]
+    pmulld m4, m3, [o(pd_3290)]
+%if ARCH_X86_32
+    pmulld m3, [o(pd_2440)]
+    mova m5, [o(pd_2048)]
+    REPX {paddd x, m5}, m0, m7
+    REPX {psrad x, 12}, m0, m7
+    mova [r3+11*16], m7
+    mova m7, m3
+    mova m3, m5
+%else
+    pmulld m3, [o(pd_2440)]
+%endif
+    pmulld m6, m1, [o(pd_3973)]
+    pmulld m1, [o(pd_995)]
+    pmulld m5, m2, [o(pd_m2106)]
+    pmulld m2, [o(pd_3513)]
+    jmp .main_oddhalf_part2_fast2
+.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
+%if ARCH_X86_64
+    ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
+    ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
+    ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
+    ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
+.main_oddhalf_part2_fast2:
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+    psubd m8, m0, m4 ; t25
+    paddd m0, m4     ; t24
+    psubd m4, m6, m2 ; t26
+    paddd m6, m2     ; t27
+    psubd m2, m1, m5 ; t21
+    paddd m1, m5     ; t20
+    psubd m5, m7, m3 ; t22
+    paddd m7, m3     ; t23
+    REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
+    REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
+    mova m15, [o(pd_2276)]
+    mova m10, [o(pd_3406)]
+    ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15    ; t21a, t26a
+    ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
+    psubd m3, m0, m6 ; t27a
+    paddd m0, m6     ; t24a
+    psubd m6, m7, m1 ; t20a
+    paddd m7, m1     ; t23a
+    psubd m1, m5, m4 ; t21
+    paddd m5, m4     ; t22
+    psubd m4, m8, m2 ; t26
+    paddd m8, m2     ; t25
+    REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
+    REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
+    mova m15, [o(pd_3784)]
+    mova m10, [o(pd_1567)]
+    ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
+    ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20
+    mova m9, [r3+16*0]  ; t16a
+    mova m10, [r3+16*1] ; t17
+    psubd m2, m9, m7  ; t23
+    paddd m9, m7      ; t16
+    psubd m7, m10, m5 ; t22a
+    paddd m10, m5     ; t17a
+    REPX {pmaxsd x, m12}, m9, m10, m2, m7
+    REPX {pminsd x, m13}, m9, m10, m2, m7
+    mova [r3+16*0], m9
+    mova [r3+16*1], m10
+    mova m9, [r3+16*2]  ; t18a
+    mova m10, [r3+16*3] ; t19
+    psubd m5, m9, m1  ; t21
+    paddd m9, m1      ; t18
+    psubd m1, m10, m6 ; t20a
+    paddd m10, m6     ; t19a
+    REPX {pmaxsd x, m12}, m9, m10, m5, m1
+    REPX {pminsd x, m13}, m9, m10, m5, m1
+    mova [r3+16*2], m9
+    mova [r3+16*3], m10
+    mova m9, [r3+16*4]  ; t28
+    mova m10, [r3+16*5] ; t29a
+    psubd m6, m9, m3  ; t27a
+    paddd m9, m3      ; t28a
+    psubd m3, m10, m4 ; t26
+    paddd m10, m4     ; t29
+    REPX {pmaxsd x, m12}, m9, m10, m6, m3
+    REPX {pminsd x, m13}, m9, m10, m6, m3
+    REPX {pmulld x, m14}, m6, m3, m1, m5
+    paddd m6, m11
+    paddd m3, m11
+    psubd m4, m6, m1 ; t20
+    paddd m6, m1     ; t27
+    psubd m1, m3, m5 ; t21a
+    paddd m3, m5     ; t26a
+    REPX {psrad x, 12 }, m4, m1, m3, m6
+    mova [r3+16*4], m4
+    mova [r3+16*5], m1
+    mova m4, [r3+16*6] ; t30
+    mova m1, [r3+16*7] ; t31a
+    psubd m5, m4, m8 ; t25a
+    paddd m4, m8     ; t30a
+    psubd m8, m1, m0 ; t24
+    paddd m1, m0     ; t31
+    REPX {pmaxsd x, m12}, m8, m5, m4, m1
+    REPX {pminsd x, m13}, m8, m5, m4, m1
+    REPX {pmulld x, m14}, m5, m8, m7, m2
+    paddd m5, m11
+    paddd m8, m11
+    psubd m0, m5, m7 ; t22
+    paddd m5, m7     ; t25
+    psubd m7, m8, m2 ; t23a
+    paddd m2, m8     ; t24a
+    REPX {psrad x, 12 }, m0, m7, m2, m5
+    mova [r3+16*6], m0
+    mova [r3+16*7], m7
+    mova [r3+16*8], m2
+    mova [r3+16*9], m5
+    mova [r3+16*10], m3
+    mova [r3+16*11], m6
+    mova [r3+16*12], m9
+    mova [r3+16*13], m10
+    mova [r3+16*14], m4
+    mova [r3+16*15], m1
+%else
+    mova [r3+ 8*16], m2
+    mova [r3+ 9*16], m3
+    mova [r3+10*16], m4
+    mova [r3+11*16], m5
+    mova m3, [o(pd_2048)]
+    ITX_MULSUB_2D 7, 0, 2, 4, 5, 3, 4052, 601 ; t23a, t24a
+    ITX_MULSUB_2D 1, 6, 2, 4, 5, _, 995, 3973 ; t20a, t27a
+    mova m2, [r3+ 8*16]
+    mova m4, [r3+10*16]
+    mova m5, [r3+11*16]
+    mova [r3+ 8*16], m0
+    mova [r3+10*16], m6
+    mova [r3+11*16], m7
+    mova m7, [r3+ 9*16]
+    mova [r3+ 9*16], m1
+    ITX_MULSUB_2D 5, 2, 0, 6, 1, _, 3513, 2106 ; t21a, t26a
+    ITX_MULSUB_2D 7, 4, 0, 6, 1, _, 2440, 3290 ; t22a, t25a
+    mova m0, [r3+ 8*16]
+    mova m1, [r3+ 9*16]
+    mova m6, [r3+10*16]
+.main_oddhalf_part2_fast2:
+    REPX {paddd x, m3}, m1, m2, m7, m4, m5, m6
+    REPX {psrad x, 12}, m1, m2, m7, m4, m5, m6
+    psubd m3, m0, m4 ; t25
+    mova [r3+ 8*16], m3
+    mova m3, [r3+11*16]
+    paddd m0, m4     ; t24
+    psubd m4, m6, m2 ; t26
+    paddd m6, m2     ; t27
+    psubd m2, m1, m5 ; t21
+    paddd m1, m5     ; t20
+    psubd m5, m3, m7 ; t22
+    paddd m7, m3     ; t23
+    mova m3, [o(clip_18b_min)]
+    REPX {pmaxsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+    pmaxsd m3, [r3+ 8*16]
+    mova [r3+ 8*16], m3
+    mova m3, [o(clip_18b_max)]
+    REPX {pminsd x, m3}, m5, m4, m2, m0, m6, m1, m7
+    pminsd m3, [r3+ 8*16]
+    mova [r3+ 8*16], m0
+    mova [r3+ 9*16], m1
+    mova [r3+10*16], m6
+    mova [r3+11*16], m7
+    mova m7, [o(pd_2048)]
+    ITX_MULSUB_2D 4, 2, 0, 1, 6, 7, 3406, 2276 ; t21a, t26a
+    ITX_MULSUB_2D 3, 5, 0, 1, _, 7, 6, 2276, 4 ; t25a, t22a
+    psubd m1, m5, m4 ; t21
+    paddd m5, m4     ; t22
+    psubd m4, m3, m2 ; t26
+    paddd m3, m2     ; t25
+    mova m0, [r3+ 8*16]
+    mova m2, [r3+ 9*16]
+    mova m6, [r3+10*16]
+    mova m7, [r3+11*16]
+    mova [r3+ 8*16], m3
+    psubd m3, m0, m6 ; t27a
+    paddd m0, m6     ; t24a
+    psubd m6, m7, m2 ; t20a
+    paddd m7, m2     ; t23a
+    mova m2, [o(clip_18b_min)]
+    REPX {pmaxsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+    pmaxsd m2, [r3+ 8*16]
+    mova [r3+ 8*16], m2
+    mova m2, [o(clip_18b_max)]
+    REPX {pminsd x, m2}, m3, m6, m1, m4, m0, m7, m5
+    pminsd m2, [r3+ 8*16]
+    mova [r3+ 8*16], m0
+    mova [r3+ 9*16], m2
+    mova [r3+14*16], m5
+    mova [r3+15*16], m7
+    mova m0, [o(pd_2048)]
+    ITX_MULSUB_2D 4, 1, 2, 5, 7, 0, 1567, 3784, 4 ; t26a, t21a
+    ITX_MULSUB_2D 3, 6, 2, 5, _, 0, 7, 3784, 4    ; t27, t20
+    mova [r3+10*16], m3
+    mova m0, [o(clip_18b_min)]
+    mova m2, [o(clip_18b_max)]
+    mova m5, [r3+16*2] ; t18a
+    mova m7, [r3+16*3] ; t19
+    psubd m3, m5, m1 ; t21
+    paddd m5, m1     ; t18
+    psubd m1, m7, m6 ; t20a
+    paddd m7, m6     ; t19a
+    REPX {pmaxsd x, m0}, m5, m7, m3, m1
+    REPX {pminsd x, m2}, m5, m7, m3, m1
+    mova [r3+16*2], m5
+    mova [r3+16*3], m7
+    mova [r3+11*16], m3
+    mova m3, [r3+10*16]
+    mova m5, [r3+16*4] ; t28
+    mova m7, [r3+16*5] ; t29a
+    psubd m6, m5, m3 ; t27a
+    paddd m5, m3     ; t28a
+    psubd m3, m7, m4 ; t26
+    paddd m7, m4     ; t29
+    REPX {pmaxsd x, m0}, m5, m7, m6, m3
+    REPX {pminsd x, m2}, m5, m7, m6, m3
+    mova [r3+16*12], m5
+    mova [r3+16*13], m7
+    mova m5, [o(pd_2048)]
+    mova m7, [o(pd_2896)]
+    mova m4, [r3+11*16]
+    REPX {pmulld x, m7}, m6, m3, m1, m4
+    paddd m6, m5
+    paddd m3, m5
+    psubd m5, m6, m1 ; t20
+    paddd m6, m1     ; t27
+    psubd m1, m3, m4 ; t21a
+    paddd m3, m4     ; t26a
+    REPX {psrad x, 12}, m5, m1, m3, m6
+    mova [r3+16*4], m5
+    mova [r3+16*5], m1
+    mova [r3+16*10], m3
+    mova [r3+16*11], m6
+
+    mova m5, [r3+14*16]
+    mova m6, [r3+15*16]
+    mova m3, [r3+16*0] ; t16a
+    mova m4, [r3+16*1] ; t17
+    psubd m1, m3, m6 ; t23
+    paddd m3, m6     ; t16
+    psubd m6, m4, m5 ; t22a
+    paddd m4, m5     ; t17a
+    REPX {pmaxsd x, m0}, m3, m4, m1, m6
+    REPX {pminsd x, m2}, m3, m4, m1, m6
+    mova [r3+16*0], m3
+    mova [r3+16*1], m4
+    mova m5, [r3+ 8*16]
+    mova m3, [r3+ 9*16]
+    mova [r3+ 8*16], m1
+    mova [r3+ 9*16], m6
+    mova m4, [r3+16*6] ; t30
+    mova m1, [r3+16*7] ; t31a
+    psubd m6, m1, m5 ; t24
+    paddd m1, m5     ; t31
+    psubd m5, m4, m3 ; t25a
+    paddd m4, m3     ; t30a
+    REPX {pmaxsd x, m0}, m6, m5, m4, m1
+    REPX {pminsd x, m2}, m6, m5, m4, m1
+    mova [r3+16*14], m4
+    mova [r3+16*15], m1
+    mova m4, [o(pd_2048)]
+    mova m1, [r3+ 9*16]
+    mova m2, [r3+ 8*16]
+    REPX {pmulld x, m7}, m5, m6, m1, m2
+    paddd m5, m4
+    paddd m6, m4
+    psubd m0, m5, m1 ; t22
+    paddd m5, m1     ; t25
+    psubd m1, m6, m2 ; t23a
+    paddd m2, m6     ; t24a
+    REPX {psrad x, 12}, m0, m1, m2, m5
+    mova [r3+16*6], m0
+    mova [r3+16*7], m1
+    mova [r3+16*8], m2
+    mova [r3+16*9], m5
+%endif
+    ret
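Between butterfly stages, the 16bpc path clamps every intermediate to a signed 18-bit range via clip_18b_min/clip_18b_max (the pmaxsd/pminsd pairs above), keeping 10-bit AV1 intermediates within the spec's dynamic range. The same clamp in C:

    #include <stdint.h>

    /* Stage-to-stage clamp: [-0x20000, 0x1ffff], i.e. signed 18 bit. */
    static inline int32_t clip18(int32_t v)
    {
        return v < -0x20000 ? -0x20000 : v > 0x1ffff ? 0x1ffff : v;
    }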
+
+    ; final sumsub for idct16 as well as idct32, plus final downshift
+%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx
+    mova m%4, [r3+16*(23-%1)]
+    pmaxsd m%1, m12
+    pminsd m%1, m13
+    psubd m%3, m%1, m%4 ; idct16 out15 - n
+    paddd m%1, m%4      ; idct16 out0 + n
+    pmaxsd m%1, m12
+    pmaxsd m%3, m12
+    pminsd m%1, m13
+    pminsd m%3, m13
+    paddd m%1, m11
+    paddd m%3, m11
+    mova m%5, [r3+16*( 0+%1)]
+    mova m%2, [r3+16*(15-%1)]
+    psubd m%4, m%1, m%2 ; out31 - n
+    paddd m%1, m%2      ; out0 + n
+    paddd m%2, m%3, m%5 ; out15 - n
+    psubd m%3, m%5      ; out16 + n
+    REPX {psrad x, %6}, m%1, m%3, m%2, m%4
+%endmacro
+
+.round_dct32:
+%if ARCH_X86_64
+    psrld m11, 10 ; pd_2
+    IDCT32_END 0, 15, 8, 9, 10, 2 ; 0 15 16 31
+    mova [r3+ 0*16], m6
+    mova [r3+23*16], m7
+    IDCT32_END 1, 14, 6, 7, 10, 2 ; 1 14 17 30
+    packssdw m0, m1   ;  0  1
+    packssdw m14, m15 ; 14 15
+    packssdw m8, m6   ; 16 17
+    packssdw m7, m9   ; 30 31
+    mova [r3+16*15], m14
+    mova [r3+16*14], m7
+    IDCT32_END 2, 15, 10, 7, 6, 2 ; 2 13 18 29
+    IDCT32_END 3, 14, 1, 9, 6, 2  ; 3 12 19 28
+    packssdw m2, m3   ;  2  3
+    packssdw m14, m15 ; 12 13
+    packssdw m10, m1  ; 18 19
+    packssdw m9, m7   ; 28 29
+    mova [r3+16*13], m14
+    mova [r3+16*12], m9
+    IDCT32_END 4, 15, 1, 7, 6, 2 ; 4 11 20 27
+    IDCT32_END 5, 14, 3, 9, 6, 2 ; 5 10 21 26
+    packssdw m4, m5   ;  4  5
+    packssdw m14, m15 ; 10 11
+    packssdw m1, m3   ; 20 21
+    packssdw m9, m7   ; 26 27
+    mova [r3+16*11], m14
+    mova [r3+16*10], m9
+    mova m6, [r3+ 0*16]
+    mova m7, [r3+23*16]
+    IDCT32_END 6, 15, 14, 5, 3, 2 ; 6 9 22 25
+    IDCT32_END 7, 11, 3, 9, 13, 2 ; 7 8 23 24
+    packssdw m6, m7   ;  6  7
+    packssdw m11, m15 ;  8  9
+    packssdw m14, m3  ; 22 23
+    packssdw m9, m5   ; 24 25
+    mova [r3+16*9], m11
+    mova [r3+16*8], m9
+    mova m12, m1
+    ret
+%else
+    mova [r3+16*16], m0
+    mova [r3+17*16], m1
+    mova [r3+18*16], m2
+    mova [r3+19*16], m3
+    mova [r3+20*16], m4
+    mova [r3+21*16], m5
+    mova [r3+22*16], m6
+    mova [r3+23*16], m7
+    mova m1, [o(pd_2)]
+    mova m2, [o(clip_18b_min)]
+    mova m3, [o(clip_18b_max)]
+
+    mov r4, 15*16
+.loop_dct32_end:
+    mova m0, [r3+16*16]
+    mova m6, [r3+16*24]
+    pmaxsd m0, m2
+    pminsd m0, m3
+    psubd m5, m0, m6 ; idct16 out15 - n
+    paddd m0, m6     ; idct16 out0 + n
+    pmaxsd m0, m2
+    pmaxsd m5, m2
+    pminsd m0, m3
+    pminsd m5, m3
+    paddd m0, m1
+    paddd m5, m1
+    mova m7, [r3]
+    mova m4, [r3+r4]
+    psubd m6, m0, m4 ; out31 - n
+    paddd m0, m4     ; out0 + n
+    paddd m4, m5, m7 ; out15 - n
+    psubd m5, m7     ; out16 + n
+    REPX {psrad x, 2}, m0, m5, m4, m6
+    mova [r3], m0
+    mova [r3+r4], m4
+    mova [r3+16*16], m5
+    mova [r3+24*16], m6
+    add r3, 16
+    sub r4, 32
+    jg .loop_dct32_end
+    ret
+%endif
+
+.dconly:
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 8
+.dconly1:
+    add r5d, 640
+    sar r5d, 10
+.dconly2:
+    imul r5d, 2896
+    add r5d, 34816
+    movd m0, r5d
+    pshuflw m0, m0, q1111
+    punpcklqdq m0, m0
+    mova m6, [o(pixel_10bpc_max)]
+    pxor m5, m5
+.dconly_loop:
+    mova m1, [dstq+16*0]
+    mova m2, [dstq+16*1]
+    mova m3, [dstq+16*2]
+    mova m4, [dstq+16*3]
+    REPX {paddw x, m0}, m1, m2, m3, m4
+    REPX {pminsw x, m6}, m1, m2, m3, m4
+    REPX {pmaxsw x, m5}, m1, m2, m3, m4
+    mova [dstq+16*0], m1
+    mova [dstq+16*1], m2
+    mova [dstq+16*2], m3
+    mova [dstq+16*3], m4
+    add dstq, strideq
+    dec r3d
+    jg .dconly_loop
+    RET
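The IDCT32_END macro above fans an idct16 result pair and a stored odd-half pair out into four idct32 outputs, pre-adding the rounding bias (m11) before the final shift (2 here, 1 in the 32x16 variant below). A simplified scalar model that omits the interleaved 18-bit clamps:

    #include <stdint.h>

    /* e/f: idct16 outputs (0+n) and (15-n); t/u: odd-half terms.
     * Produces idct32 outputs 0+n, 15-n, 16+n, 31-n (in that order). */
    static inline void idct32_end_model(int32_t e, int32_t f,
                                        int32_t t, int32_t u,
                                        int32_t out[4], int bias, int shift)
    {
        out[0] = (e + t + bias) >> shift; /* out(0 + n)  */
        out[1] = (f + u + bias) >> shift; /* out(15 - n) */
        out[2] = (f - u + bias) >> shift; /* out(16 + n) */
        out[3] = (e - t + bias) >> shift; /* out(31 - n) */
    }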
+
+cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
+        dst, stride, c, eob
+    LEA r6, base
+    test eobd, eobd
+    jz .dconly
+
+    ; remove entirely-zero iterations
+%undef cmp
+    mov r5d, 8
+.zero_loop:
+    sub r5d, 2
+    cmp eobw, word [o2(tbl_32x16_2d)+r5]
+    jl .zero_loop
+
+    ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+64* 1+r5*8]
+    mova m1, [cq+64* 7+r5*8]
+    mova m2, [cq+64* 9+r5*8]
+    mova m3, [cq+64*15+r5*8]
+    mova m4, [cq+64*17+r5*8]
+    mova m5, [cq+64*23+r5*8]
+    mova m6, [cq+64*25+r5*8]
+    mova m7, [cq+64*31+r5*8]
+    mov r3, rsp
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+
+    mova m0, [cq+64* 3+r5*8]
+    mova m1, [cq+64* 5+r5*8]
+    mova m2, [cq+64*11+r5*8]
+    mova m3, [cq+64*13+r5*8]
+    mova m4, [cq+64*19+r5*8]
+    mova m5, [cq+64*21+r5*8]
+    mova m6, [cq+64*27+r5*8]
+    mova m7, [cq+64*29+r5*8]
+%if ARCH_X86_32
+    add r3, 16*8
+%endif
+    call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+    sub r3, 16*8
+%endif
+    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+    add r3, 16*(16+4*ARCH_X86_32)
+
+    mova m0, [cq+64* 2+r5*8]
+    mova m1, [cq+64* 6+r5*8]
+    mova m2, [cq+64*10+r5*8]
+    mova m3, [cq+64*14+r5*8]
+    mova m4, [cq+64*18+r5*8]
+    mova m5, [cq+64*22+r5*8]
+    mova m6, [cq+64*26+r5*8]
+    mova m7, [cq+64*30+r5*8]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+
+    mova m0, [cq+64* 0+r5*8]
+    mova m1, [cq+64* 4+r5*8]
+    mova m2, [cq+64* 8+r5*8]
+    mova m3, [cq+64*12+r5*8]
+    mova m4, [cq+64*16+r5*8]
+    mova m5, [cq+64*20+r5*8]
+    mova m6, [cq+64*24+r5*8]
+    mova m7, [cq+64*28+r5*8]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    sub r3, 16*(16+4*ARCH_X86_32)
+    call .round_dct32
+
+%if ARCH_X86_64
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [cq+64* 8+r5*8], m8
+    mova [cq+64* 9+r5*8], m9
+    mova [cq+64*10+r5*8], m10
+    mova [cq+64*11+r5*8], m11
+    mova m8, [r3+16* 9]  ;  8  9
+    mova m10, [r3+16*11] ; 10 11
+    mova m12, [r3+16*13] ; 12 13
+    mova m14, [r3+16*15] ; 14 15
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [cq+64* 4+r5*8], m8
+    mova [cq+64* 5+r5*8], m9
+    mova [cq+64* 6+r5*8], m10
+    mova [cq+64* 7+r5*8], m11
+    mova m8, [r3+16* 8]  ; 24 25
+    mova m10, [r3+16*10] ; 26 27
+    mova m12, [r3+16*12] ; 28 29
+    mova m14, [r3+16*14] ; 30 31
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [cq+64*12+r5*8], m8
+    mova [cq+64*13+r5*8], m9
+    mova [cq+64*14+r5*8], m10
+    mova [cq+64*15+r5*8], m11
+%else
+    sub r3, 8*16
+    mova m0, [r3+ 8*16]
+    mova m2, [r3+10*16]
+    mova m4, [r3+12*16]
+    mova m6, [r3+14*16]
+    packssdw m0, [r3+ 9*16]
+    packssdw m2, [r3+11*16]
+    packssdw m4, [r3+13*16]
+    packssdw m6, [r3+15*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+64* 4+r5*8], m0
+    mova [cq+64* 5+r5*8], m1
+    mova [cq+64* 6+r5*8], m2
+    mova [cq+64* 7+r5*8], m3
+    mova m0, [r3+16*16]
+    mova m2, [r3+18*16]
+    mova m4, [r3+20*16]
+    mova m6, [r3+22*16]
+    packssdw m0, [r3+17*16]
+    packssdw m2, [r3+19*16]
+    packssdw m4, [r3+21*16]
+    packssdw m6, [r3+23*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+64* 8+r5*8], m0
+    mova [cq+64* 9+r5*8], m1
+    mova [cq+64*10+r5*8], m2
+    mova [cq+64*11+r5*8], m3
+    mova m0, [r3+31*16]
+    mova m2, [r3+29*16]
+    mova m4, [r3+27*16]
+    mova m6, [r3+25*16]
+    packssdw m0, [r3+30*16]
+    packssdw m2, [r3+28*16]
+    packssdw m4, [r3+26*16]
+    packssdw m6, [r3+24*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [cq+64*12+r5*8], m0
+    mova [cq+64*13+r5*8], m1
+    mova [cq+64*14+r5*8], m2
+    mova [cq+64*15+r5*8], m3
+    mova m0, [r3+ 0*16]
+    mova m2, [r3+ 2*16]
+    mova m4, [r3+ 4*16]
+    mova m6, [r3+ 6*16]
+    packssdw m0, [r3+ 1*16]
+    packssdw m2, [r3+ 3*16]
+    packssdw m4, [r3+ 5*16]
+    packssdw m6, [r3+ 7*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+    mova [cq+64* 0+r5*8], m0
+    mova [cq+64* 1+r5*8], m1
+    mova [cq+64* 2+r5*8], m2
+    mova [cq+64* 3+r5*8], m3
+    pxor m0, m0
+    REPX {mova [cq+x*64+r5*8], m0}, 16, 17, 18, 19, 20, 21, 22, 23, \
+                                    24, 25, 26, 27, 28, 29, 30, 31
+    sub r5d, 2
+    jge .loop_pass1
+
+    ; pass=2, we need to call this otherwise the stack pointer has
+    ; the wrong offset in the 8-bit code
+    call .pass2
+    RET
+
+.pass2:
+%if ARCH_X86_64
+    mova m8, [o(pw_2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+%if WIN64
+    mov [rsp+16*16+gprsize], r7
+%endif
+    mov r7, dstq
+%else
+    mov [rsp+2*gprsize+16*16], dstq
+%endif
+    lea r3, [strideq*3]
+    mov r4d, 4
+    jmp m(idct_16x16_internal_16bpc).loop_pass2
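The second pass hands finished 8x8 tiles to round1_and_write_8x8, whose per-pixel work is: pmulhrsw against pw_2048 (a rounding shift by 4), add to the 10-bit destination, clamp to pixel_10bpc_max. Per-pixel C model (function name mine):

    #include <stdint.h>

    static inline uint16_t write_px(uint16_t dst, int16_t v)
    {
        /* mulhrs(v, 2048) == (v + 8) >> 4 */
        const int t = dst + ((v + 8) >> 4);
        return (uint16_t)(t < 0 ? 0 : t > 1023 ? 1023 : t);
    }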
+
+.round_dct32:
+%if ARCH_X86_64
+    psrld m11, 11 ; pd_1
+    IDCT32_END 0, 15, 8, 9, 10, 1 ; 0 15 16 31
+    mova [r3+ 0*16], m6
+    mova [r3+23*16], m7
+    IDCT32_END 1, 14, 6, 7, 10, 1 ; 1 14 17 30
+    packssdw m0, m1   ;  0  1
+    packssdw m14, m15 ; 14 15
+    packssdw m8, m6   ; 16 17
+    packssdw m7, m9   ; 30 31
+    mova [r3+16*15], m14
+    mova [r3+16*14], m7
+    IDCT32_END 2, 15, 10, 7, 6, 1 ; 2 13 18 29
+    IDCT32_END 3, 14, 1, 9, 6, 1  ; 3 12 19 28
+    packssdw m2, m3   ;  2  3
+    packssdw m14, m15 ; 12 13
+    packssdw m10, m1  ; 18 19
+    packssdw m9, m7   ; 28 29
+    mova [r3+16*13], m14
+    mova [r3+16*12], m9
+    IDCT32_END 4, 15, 1, 7, 6, 1 ; 4 11 20 27
+    IDCT32_END 5, 14, 3, 9, 6, 1 ; 5 10 21 26
+    packssdw m4, m5   ;  4  5
+    packssdw m14, m15 ; 10 11
+    packssdw m1, m3   ; 20 21
+    packssdw m9, m7   ; 26 27
+    mova [r3+16*11], m14
+    mova [r3+16*10], m9
+    mova m6, [r3+ 0*16]
+    mova m7, [r3+23*16]
+    IDCT32_END 6, 15, 14, 5, 3, 1 ; 6 9 22 25
+    IDCT32_END 7, 11, 3, 9, 13, 1 ; 7 8 23 24
+    packssdw m6, m7   ;  6  7
+    packssdw m11, m15 ;  8  9
+    packssdw m14, m3  ; 22 23
+    packssdw m9, m5   ; 24 25
+    mova [r3+16*9], m11
+    mova [r3+16*8], m9
+    mova m12, m1
+    ret
+%else
+    mova [r3+16*16], m0
+    mova [r3+17*16], m1
+    mova [r3+18*16], m2
+    mova [r3+19*16], m3
+    mova [r3+20*16], m4
+    mova [r3+21*16], m5
+    mova [r3+22*16], m6
+    mova [r3+23*16], m7
+    pcmpeqd m1, m1 ; -1
+    mova m2, [o(clip_18b_min)]
+    mova m3, [o(clip_18b_max)]
+
+    mov r4, 15*16
+.loop_dct32_end:
+    mova m0, [r3+16*16]
+    mova m6, [r3+16*24]
+    psubd m5, m0, m6 ; idct16 out15 - n
+    paddd m0, m6     ; idct16 out0 + n
+    pmaxsd m0, m2
+    pmaxsd m5, m2
+    pminsd m0, m3
+    pminsd m5, m3
+    psubd m0, m1
+    psubd m5, m1
+    mova m7, [r3]
+    mova m4, [r3+r4]
+    psubd m6, m0, m4 ; out31 - n
+    paddd m0, m4     ; out0 + n
+    paddd m4, m5, m7 ; out15 - n
+    psubd m5, m7     ; out16 + n
+    REPX {psrad x, 1}, m0, m5, m4, m6
+    mova [r3], m0
+    mova [r3+r4], m4
+    mova [r3+16*16], m5
+    mova [r3+24*16], m6
+    add r3, 16
+    sub r4, 32
+    jg .loop_dct32_end
+    ret
+%endif
+
+.dconly:
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 16
+    add r5d, 128
+    sar r5d, 8
+    imul r5d, 181
+    add r5d, 384
+    sar r5d, 9
+    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
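Pass 1 of the Nx32 paths scatters each transposed 4x4 block through a byte-packed offset table (tbl_Nx32_odd_offset): every 16-bit entry carries two byte-sized slot indices, unpacked above with the movzx t1d, t0b / shr t0d, 8 pair. The same unpacking in C:

    #include <stdint.h>

    /* Split one 16-bit table entry into its two byte offsets. */
    static inline void unpack_offsets(uint16_t entry, int *lo, int *hi)
    {
        *lo = entry & 0xff;
        *hi = entry >> 8;
    }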
+
+cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
+        dst, stride, c, eob
+    LEA r6, base
+    test eobd, eobd
+    jz .dconly
+
+    ; remove entirely-zero iterations
+%if ARCH_X86_32
+    mov [rsp+5*32*16+1*gprsize], dstq
+%elif WIN64
+    mov [rsp+5*32*16+1*gprsize], r7
+%endif
+%undef cmp
+    mov r5d, 14
+    cmp eobw, word [o2(tbl_32x32_2d)+r5]
+    jge .end_zero_loop
+    pxor m0, m0
+.zero_loop:
+    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx t1d, t0b
+    shr t0d, 8
+    mova [rsp+32*16+r5*8+0*32*16], m0
+    mova [rsp+40*16+r5*8+0*32*16], m0
+    mova [rsp+32*16+t0*8+0*32*16], m0
+    mova [rsp+32*16+t1*8+0*32*16], m0
+    mova [rsp+32*16+r5*8+1*32*16], m0
+    mova [rsp+40*16+r5*8+1*32*16], m0
+    mova [rsp+32*16+t0*8+1*32*16], m0
+    mova [rsp+32*16+t1*8+1*32*16], m0
+    mova [rsp+32*16+r5*8+2*32*16], m0
+    mova [rsp+40*16+r5*8+2*32*16], m0
+    mova [rsp+32*16+t0*8+2*32*16], m0
+    mova [rsp+32*16+t1*8+2*32*16], m0
+    mova [rsp+32*16+r5*8+3*32*16], m0
+    mova [rsp+40*16+r5*8+3*32*16], m0
+    mova [rsp+32*16+t0*8+3*32*16], m0
+    mova [rsp+32*16+t1*8+3*32*16], m0
+    sub r5d, 2
+    cmp eobw, word [o2(tbl_32x32_2d)+r5]
+    jl .zero_loop
+.end_zero_loop:
+
+    ; actual first pass after skipping all-zero data
+    mov [rsp+gprsize*0+5*32*16], eobd
+.loop_pass1:
+    mova m0, [cq+128* 1+r5*8]
+    mova m1, [cq+128* 7+r5*8]
+    mova m2, [cq+128* 9+r5*8]
+    mova m3, [cq+128*15+r5*8]
+    mova m4, [cq+128*17+r5*8]
+    mova m5, [cq+128*23+r5*8]
+    mova m6, [cq+128*25+r5*8]
+    mova m7, [cq+128*31+r5*8]
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mov r3, rsp
+    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+    mova m0, [cq+128* 3+r5*8]
+    mova m1, [cq+128* 5+r5*8]
+    mova m2, [cq+128*11+r5*8]
+    mova m3, [cq+128*13+r5*8]
+    mova m4, [cq+128*19+r5*8]
+    mova m5, [cq+128*21+r5*8]
+    mova m6, [cq+128*27+r5*8]
+    mova m7, [cq+128*29+r5*8]
+    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+    mova m0, [cq+128* 2+r5*8]
+    mova m1, [cq+128* 6+r5*8]
+    mova m2, [cq+128*10+r5*8]
+    mova m3, [cq+128*14+r5*8]
+    mova m4, [cq+128*18+r5*8]
+    mova m5, [cq+128*22+r5*8]
+    mova m6, [cq+128*26+r5*8]
+    mova m7, [cq+128*30+r5*8]
+    add r3, 16*(16+4*ARCH_X86_32)
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+    mova m0, [cq+128* 0+r5*8]
+    mova m1, [cq+128* 4+r5*8]
+    mova m2, [cq+128* 8+r5*8]
+    mova m3, [cq+128*12+r5*8]
+    mova m4, [cq+128*16+r5*8]
+    mova m5, [cq+128*20+r5*8]
+    mova m6, [cq+128*24+r5*8]
+    mova m7, [cq+128*28+r5*8]
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    sub r3, 16*(16+4*ARCH_X86_32)
+    call m(inv_txfm_add_dct_dct_32x8_16bpc).round_dct32
+    movzx t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx t1d, t0b
+    shr t0d, 8
+%if ARCH_X86_64
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [rsp+32*16+r5*8+2*32*16], m8
+    mova [rsp+40*16+r5*8+2*32*16], m10
+    mova [rsp+32*16+t1*8+2*32*16], m9
+    mova [rsp+32*16+t0*8+2*32*16], m11
+    mova m8, [r3+16* 9]  ;  8  9
+    mova m10, [r3+16*11] ; 10 11
+    mova m12, [r3+16*13] ; 12 13
+    mova m14, [r3+16*15] ; 14 15
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [rsp+32*16+r5*8+1*32*16], m8
+    mova [rsp+40*16+r5*8+1*32*16], m10
+    mova [rsp+32*16+t1*8+1*32*16], m9
+    mova [rsp+32*16+t0*8+1*32*16], m11
+    mova m8, [r3+16* 8]  ; 24 25
+    mova m10, [r3+16*10] ; 26 27
+    mova m12, [r3+16*12] ; 28 29
+    mova m14, [r3+16*14] ; 30 31
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [rsp+32*16+r5*8+3*32*16], m8
+    mova [rsp+40*16+r5*8+3*32*16], m10
+    mova [rsp+32*16+t1*8+3*32*16], m9
+    mova [rsp+32*16+t0*8+3*32*16], m11
+%else
+    sub r3, 8*16
+    mova m0, [r3+ 8*16]
+    mova m2, [r3+10*16]
+    mova m4, [r3+12*16]
+    mova m6, [r3+14*16]
+    packssdw m0, [r3+ 9*16]
+    packssdw m2, [r3+11*16]
+    packssdw m4, [r3+13*16]
+    packssdw m6, [r3+15*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+32*16+r5*8+1*32*16], m0
+    mova [rsp+40*16+r5*8+1*32*16], m2
+    mova [rsp+32*16+t1*8+1*32*16], m1
+    mova [rsp+32*16+t0*8+1*32*16], m3
+    mova m0, [r3+16*16]
+    mova m2, [r3+18*16]
+    mova m4, [r3+20*16]
+    mova m6, [r3+22*16]
+    packssdw m0, [r3+17*16]
+    packssdw m2, [r3+19*16]
+    packssdw m4, [r3+21*16]
+    packssdw m6, [r3+23*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+32*16+r5*8+2*32*16], m0
+    mova [rsp+40*16+r5*8+2*32*16], m2
+    mova [rsp+32*16+t1*8+2*32*16], m1
+    mova [rsp+32*16+t0*8+2*32*16], m3
+    mova m0, [r3+31*16]
+    mova m2, [r3+29*16]
+    mova m4, [r3+27*16]
+    mova m6, [r3+25*16]
+    packssdw m0, [r3+30*16]
+    packssdw m2, [r3+28*16]
+    packssdw m4, [r3+26*16]
+    packssdw m6, [r3+24*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+32*16+r5*8+3*32*16], m0
+    mova [rsp+40*16+r5*8+3*32*16], m2
+    mova [rsp+32*16+t1*8+3*32*16], m1
+    mova [rsp+32*16+t0*8+3*32*16], m3
+    mova m0, [r3+ 0*16]
+    mova m2, [r3+ 2*16]
+    mova m4, [r3+ 4*16]
+    mova m6, [r3+ 6*16]
+    packssdw m0, [r3+ 1*16]
+    packssdw m2, [r3+ 3*16]
+    packssdw m4, [r3+ 5*16]
+    packssdw m6, [r3+ 7*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+    pxor m7, m7
+    ; clear lower half of [cq]
+    REPX {mova [cq+x*128+r5*8], m7},  0,  1,  2,  3,  4,  5,  6,  7, \
+                                      8,  9, 10, 11, 12, 13, 14, 15, \
+                                     16, 17, 18, 19, 20, 21, 22, 23, \
+                                     24, 25, 26, 27, 28, 29, 30, 31
+    mova [rsp+32*16+r5*8+0*32*16], m0
+    mova [rsp+40*16+r5*8+0*32*16], m2
+    mova [rsp+32*16+t1*8+0*32*16], m1
+    mova [rsp+32*16+t0*8+0*32*16], m3
+    sub r5d, 2
+    jge .loop_pass1
+
+    ; pass=2 code starts here
+    mov eobd, [rsp+gprsize*0+5*32*16]
+    add rsp, 29*16
+    cmp eobd, 36
+    jl .load_veryfast
+    cmp eobd, 136
+    jl .load_fast
+    ; load normal
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+    jmp .run
+.load_fast:
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+    jmp .run
+.load_veryfast:
+    lea r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+    ; fall-through
+.run:
+%if ARCH_X86_64
+    lea r2, [dstq+64]
+    mov r7, -8
+%else
+    lea r2, [rsp+(4*32+3)*16]
+    mov dword [r2+0*gprsize], 4
+%endif
+    jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+
+.dconly:
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 32
+    add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
+    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly1
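All of the Nx32/Nx64 second passes pick their 8bpc column kernel from eobd: below a small threshold only the first few input rows can be nonzero, so a main_veryfast/main_fast variant is loaded via lea r4 instead of the full main. The dispatch as a C function-pointer select (thresholds are parameters here because they differ per block size):

    typedef void (*idct_col_fn)(void);

    /* Mirror of the cmp/jl ladder above: pick the cheapest kernel
     * that still covers all rows the eob says can be nonzero. */
    static idct_col_fn select_col_kernel(int eob, int t_fast, int t_normal,
                                         idct_col_fn veryfast,
                                         idct_col_fn fast,
                                         idct_col_fn normal)
    {
        if (eob < t_fast)   return veryfast;
        if (eob < t_normal) return fast;
        return normal;
    }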
+
+cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
+        0-(12+2*64)*16-(4+4*ARCH_X86_32)*gprsize, \
+        dst, stride, c, eob
+    LEA r6, base
+    test eobd, eobd
+    jz .dconly
+
+%if ARCH_X86_32
+    DECLARE_REG_TMP 4, 1, 2, 0
+    mov [rsp+gprsize*1+(64*2+12)*16], r0
+    mov [rsp+gprsize*2+(64*2+12)*16], r1
+    mov [rsp+gprsize*3+(64*2+12)*16], r2
+%else
+    DECLARE_REG_TMP 8, 9, 4, 7
+    mov [rsp+gprsize*1+(64*2+12)*16], r9
+%if WIN64
+    mov [rsp+gprsize*2+(64*2+12)*16], r7
+    mov [rsp+gprsize*3+(64*2+12)*16], r8
+%endif
+%endif
+%undef cmp
+    ; remove entirely-zero iterations
+    mov r5d, 7*2
+    cmp eobw, word [o2(tbl_16x32_2d)+r5]
+    jge .end_zero_loop
+    pxor m0, m0
+.zero_loop:
+    movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+    movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+    movzx t0d, t1b
+    movzx t2d, t3b
+    shr t1d, 8
+    shr t3d, 8
+    mova [rsp+12*16+t0*8], m0
+    mova [rsp+12*16+t1*8], m0
+    mova [rsp+12*16+t2*8], m0
+    mova [rsp+12*16+t3*8], m0
+    mova [rsp+76*16+t0*8], m0
+    mova [rsp+76*16+t1*8], m0
+    mova [rsp+76*16+t2*8], m0
+    mova [rsp+76*16+t3*8], m0
+    sub r5d, 2
+    cmp eobw, word [o2(tbl_16x32_2d)+r5]
+    jl .zero_loop
+.end_zero_loop:
+    ; actual first pass after skipping all-zero data
+    mov [rsp+gprsize*0+(64*2+12)*16], eobd
+    mov r3, rsp
+%if ARCH_X86_32
+    DECLARE_REG_TMP 4, 1, 6, 0
+    mov r2, [rsp+gprsize*3+(64*2+12)*16]
+    mov [rsp+gprsize*3+(64*2+12)*16], r6
+%endif
+.loop_pass1:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+ 1*128+r5*8]
+    mova m1, [cq+ 3*128+r5*8]
+    mova m2, [cq+ 5*128+r5*8]
+    mova m3, [cq+ 7*128+r5*8]
+    mova m4, [cq+ 9*128+r5*8]
+    mova m5, [cq+11*128+r5*8]
+    mova m6, [cq+13*128+r5*8]
+    mova m7, [cq+15*128+r5*8]
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+
+    mova m0, [cq+ 0*128+r5*8]
+    mova m1, [cq+ 2*128+r5*8]
+    mova m2, [cq+ 4*128+r5*8]
+    mova m3, [cq+ 6*128+r5*8]
+    mova m4, [cq+ 8*128+r5*8]
+    mova m5, [cq+10*128+r5*8]
+    mova m6, [cq+12*128+r5*8]
+    mova m7, [cq+14*128+r5*8]
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    call m(idct_16x16_internal_16bpc).round
+%if ARCH_X86_64
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    packssdw m8, m9
+    packssdw m10, m11
+    packssdw m12, m13
+    packssdw m14, m15
+%endif
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+    movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+    movzx t0d, t1b
+    movzx t2d, t3b
+    shr t1d, 8
+    shr t3d, 8
+%if ARCH_X86_64
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [rsp+76*16+t0*8], m8
+    mova [rsp+76*16+t1*8], m9
+    mova [rsp+76*16+t2*8], m10
+    mova [rsp+76*16+t3*8], m11
+%else
+    mova [rsp+76*16+t0*8], m0
+    mova [rsp+76*16+t1*8], m1
+    mova [rsp+76*16+t2*8], m2
+    mova [rsp+76*16+t3*8], m3
+    mova m0, [rsp+ 8*16]
+    mova m2, [rsp+ 9*16]
+    mova m4, [rsp+10*16]
+    mova m6, [rsp+11*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+    mova [rsp+12*16+t0*8], m0
+    mova [rsp+12*16+t1*8], m1
+    mova [rsp+12*16+t2*8], m2
+    mova [rsp+12*16+t3*8], m3
+%if ARCH_X86_32
+    mov r6, [rsp+gprsize*3+(64*2+12)*16]
+%endif
+    pxor m7, m7
+    REPX {mova [cq+x*128+r5*8], m7}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    sub r5d, 2
+    jge .loop_pass1
+
+    ; pass=2
+    mov eobd, [rsp+gprsize*0+(64*2+12)*16]
+    cmp eobd, 151
+    jl .fast
+    ; fall-through
+%if ARCH_X86_64
+    DECLARE_REG_TMP 8, 9
+%else
+    DECLARE_REG_TMP 1, 5
+%endif
+    lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+    lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+    jmp .run
+.fast:
+    lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+    lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+    add rsp, 9*16
+
+%if ARCH_X86_64
+    lea r2, [dstq+32]
+    mov r7, -4
+%else
+    lea r2, [rsp+(64*2+3)*16]
+    mov [r2+4*gprsize], t0
+    mov [r2+5*gprsize], t1
+    mov r1, [r2+2*gprsize]
+    mov dword [r2+0*gprsize], 2
+%endif
+.loop_pass2:
+%if ARCH_X86_32
+    mov dstq, [r2+1*gprsize]
+%endif
+    call .pass2
+    add rsp, 64*16
+%if ARCH_X86_64
+    add r7, 2
+    lea dstq, [r2+r7*8]
+    jl .loop_pass2
+%else
+    add dword [r2+1*gprsize], 16
+    dec dword [r2+0*gprsize]
+    jg .loop_pass2
+%endif
+%assign stack_size (stack_size-(64*2+9)*16)
+%if STACK_ALIGNMENT >= 16
+%assign stack_size_padded (stack_size_padded-(64*2+9)*16)
+%assign stack_offset (stack_offset-(64*2+9)*16)
+%else
+%xdefine rstkm [rsp + stack_size]
+%endif
+%if ARCH_X86_64
+    mov r9, [rsp+gprsize*1+3*16]
+%if WIN64
+    mov r7, [rsp+gprsize*2+3*16]
+    mov r8, [rsp+gprsize*3+3*16]
+%endif
+%endif
+    RET
+
+.pass2:
+%if ARCH_X86_32
+    lea r5, [o(itx8_start)]
+%endif
+    mova m0, [rsp+gprsize+16* 3]
+    mova m1, [rsp+gprsize+16* 4]
+    mova m2, [rsp+gprsize+16* 5]
+    mova m3, [rsp+gprsize+16* 6]
+    pxor m4, m4
+    REPX {mova x, m4}, m5, m6, m7
+    call m_suffix(idct_8x8_internal_8bpc, _ssse3).main
+    mova [rsp+gprsize+ 3*16], m0
+    mova [rsp+gprsize+ 4*16], m1
+    mova [rsp+gprsize+ 5*16], m2
+    mova [rsp+gprsize+ 6*16], m3
+    mova [rsp+gprsize+ 7*16], m4
+    mova [rsp+gprsize+ 8*16], m5
+    mova [rsp+gprsize+ 9*16], m6
+    mova [rsp+gprsize+10*16], m7
+    mova m0, [rsp+gprsize+16*11]
+    mova m1, [rsp+gprsize+16*12]
+    mova m2, [rsp+gprsize+16*13]
+    mova m3, [rsp+gprsize+16*14]
+    pxor m4, m4
+    REPX {mova x, m4}, m5, m6, m7
+    call m_suffix(idct_16x8_internal_8bpc, _ssse3).main
+    mova m7, [rsp+gprsize+ 0*16]
+    mova [rsp+gprsize+11*16], m0
+    mova [rsp+gprsize+12*16], m1
+    mova [rsp+gprsize+13*16], m2
+    mova [rsp+gprsize+14*16], m3
+    mova [rsp+gprsize+15*16], m4
+    mova [rsp+gprsize+16*16], m5
+    mova [rsp+gprsize+17*16], m6
+    mova [rsp+gprsize+18*16], m7
+%if ARCH_X86_64
+    call r8
+%else
+    call [r2+4*gprsize]
+%endif
+    mova [rsp+gprsize+ 3*16], m0
+    mova [rsp+gprsize+ 5*16], m2
+    mova [rsp+gprsize+ 8*16], m5
+    mova [rsp+gprsize+10*16], m7
+%if ARCH_X86_64
+    call r9
+    mova m8, [o(pw_2048)]
+    pxor m9, m9
+    mova m10, [o(pixel_10bpc_max)]
+%else
+    call [r2+5*gprsize]
+%endif
+    lea r3, [strideq*3]
+    lea r4, [rsp+gprsize+ 3*16]
+%if ARCH_X86_64
+    mov r6d, 8
+%else
+    mov dword [r2+2*gprsize], 8
+%endif
+.loop_write:
+    mova m0, [r4+0*16]
+    mova m1, [r4+1*16]
+    mova m2, [r4+2*16]
+    mova m3, [r4+3*16]
+    mova m4, [r4+4*16]
+    mova m5, [r4+5*16]
+    mova m6, [r4+6*16]
+    mova m7, [r4+7*16]
+    call m(idct_8x8_internal_16bpc).round1_and_write_8x8
+    lea dstq, [dstq+strideq*8]
+    add r4, 8*16
+%if ARCH_X86_64
+    dec r6d
+%else
+    dec dword [r2+2*gprsize]
+%endif
+    jg .loop_write
+    ret
+
+.dconly:
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 64
+    add r5d, 640
+    sar r5d, 10
+    add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
+    jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
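The 16x64 .pass2 above finishes by streaming eight 8x8 tiles from the stack buffer to the destination (the .loop_write loop, counted by r6d or [r2+2*gprsize]). Its shape in C, with round_add_8x8 standing in for round1_and_write_8x8:

    #include <stdint.h>
    #include <stddef.h>

    /* One 8-pixel-wide, 64-row column: eight 8x8 tiles in sequence. */
    static void write_column_64(uint16_t *dst, ptrdiff_t stride,
                                const int16_t *buf,
                                void (*round_add_8x8)(uint16_t *, ptrdiff_t,
                                                      const int16_t *))
    {
        for (int i = 0; i < 8; i++) {
            round_add_8x8(dst, stride, buf);
            dst += 8 * stride;
            buf += 8 * 8;
        }
    }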
+
+cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
+        0-(32+4*64)*16-(4+4*ARCH_X86_32)*gprsize, \
+        dst, stride, c, eob
+    LEA r6, base
+    test eobd, eobd
+    jz .dconly
+
+%if ARCH_X86_32
+    DECLARE_REG_TMP 4, 1, 2, 0
+    mov [rsp+gprsize*1+(64*4+32)*16], r0
+    mov [rsp+gprsize*2+(64*4+32)*16], r1
+    mov [rsp+gprsize*3+(64*4+32)*16], r2
+%else
+    DECLARE_REG_TMP 8, 9, 4, 7
+    mov [rsp+gprsize*1+(64*4+32)*16], r9
+%if WIN64
+    mov [rsp+gprsize*2+(64*4+32)*16], r7
+    mov [rsp+gprsize*3+(64*4+32)*16], r8
+%endif
+%endif
+%undef cmp
+    ; remove entirely-zero iterations
+    mov r5d, 7*2
+    cmp eobw, word [o2(tbl_32x32_2d)+r5]
+    jge .end_zero_loop
+    pxor m0, m0
+.zero_loop:
+    movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+    movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+    movzx t0d, t1b
+    movzx t2d, t3b
+    shr t1d, 8
+    shr t3d, 8
+    mova [rsp+ 32*16+t0*8], m0
+    mova [rsp+ 32*16+t1*8], m0
+    mova [rsp+ 32*16+t2*8], m0
+    mova [rsp+ 32*16+t3*8], m0
+    mova [rsp+ 96*16+t0*8], m0
+    mova [rsp+ 96*16+t1*8], m0
+    mova [rsp+ 96*16+t2*8], m0
+    mova [rsp+ 96*16+t3*8], m0
+    mova [rsp+160*16+t0*8], m0
+    mova [rsp+160*16+t1*8], m0
+    mova [rsp+160*16+t2*8], m0
+    mova [rsp+160*16+t3*8], m0
+    mova [rsp+224*16+t0*8], m0
+    mova [rsp+224*16+t1*8], m0
+    mova [rsp+224*16+t2*8], m0
+    mova [rsp+224*16+t3*8], m0
+    sub r5d, 2
+    cmp eobw, word [o2(tbl_32x32_2d)+r5]
+    jl .zero_loop
+.end_zero_loop:
+    ; actual first pass after skipping all-zero data
+    mov [rsp+gprsize*0+(64*4+32)*16], eobd
+    mov r3, rsp
+%if ARCH_X86_32
+    DECLARE_REG_TMP 4, 1, 6, 0
+    mov r2, [rsp+gprsize*3+(64*4+32)*16]
+    mov [rsp+gprsize*3+(64*4+32)*16], r6
+%endif
+.loop_pass1:
+%if ARCH_X86_64
+    mova m11, [o(pd_2048)]
+    mova m12, [o(clip_18b_min)]
+    mova m13, [o(clip_18b_max)]
+    mova m14, [o(pd_2896)]
+%endif
+    mova m0, [cq+128* 1+r5*8]
+    mova m1, [cq+128* 7+r5*8]
+    mova m2, [cq+128* 9+r5*8]
+    mova m3, [cq+128*15+r5*8]
+    mova m4, [cq+128*17+r5*8]
+    mova m5, [cq+128*23+r5*8]
+    mova m6, [cq+128*25+r5*8]
+    mova m7, [cq+128*31+r5*8]
+    mov r3, rsp
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1
+
+    mova m0, [cq+128* 3+r5*8]
+    mova m1, [cq+128* 5+r5*8]
+    mova m2, [cq+128*11+r5*8]
+    mova m3, [cq+128*13+r5*8]
+    mova m4, [cq+128*19+r5*8]
+    mova m5, [cq+128*21+r5*8]
+    mova m6, [cq+128*27+r5*8]
+    mova m7, [cq+128*29+r5*8]
+%if ARCH_X86_32
+    add r3, 16*8
+%endif
+    call m(idct_8x4_internal_16bpc).rect2_mul
+%if ARCH_X86_32
+    sub r3, 16*8
+%endif
+    call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2
+    add r3, 16*(16+4*ARCH_X86_32)
+
+    mova m0, [cq+128* 2+r5*8]
+    mova m1, [cq+128* 6+r5*8]
+    mova m2, [cq+128*10+r5*8]
+    mova m3, [cq+128*14+r5*8]
+    mova m4, [cq+128*18+r5*8]
+    mova m5, [cq+128*22+r5*8]
+    mova m6, [cq+128*26+r5*8]
+    mova m7, [cq+128*30+r5*8]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_16x4_internal_16bpc).main_oddhalf
+
+    mova m0, [cq+128* 0+r5*8]
+    mova m1, [cq+128* 4+r5*8]
+    mova m2, [cq+128* 8+r5*8]
+    mova m3, [cq+128*12+r5*8]
+    mova m4, [cq+128*16+r5*8]
+    mova m5, [cq+128*20+r5*8]
+    mova m6, [cq+128*24+r5*8]
+    mova m7, [cq+128*28+r5*8]
+    call m(idct_8x4_internal_16bpc).rect2_mul
+    call m(idct_8x4_internal_16bpc).main_pass1
+    call m(idct_8x4_internal_16bpc).round
+    sub r3, 16*(16+4*ARCH_X86_32)
+    call m(inv_txfm_add_dct_dct_32x16_16bpc).round_dct32
+
+    movzx t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+    movzx t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+    movzx t0d, t1b
+    movzx t2d, t3b
+    shr t1d, 8
+    shr t3d, 8
+%if ARCH_X86_64
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [rsp+160*16+t0*8], m8
+    mova [rsp+160*16+t1*8], m9
+    mova [rsp+160*16+t2*8], m10
+    mova [rsp+160*16+t3*8], m11
+    mova m8, [r3+16* 9]  ;  8  9
+    mova m10, [r3+16*11] ; 10 11
+    mova m12, [r3+16*13] ; 12 13
+    mova m14, [r3+16*15] ; 14 15
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [rsp+ 96*16+t0*8], m8
+    mova [rsp+ 96*16+t1*8], m9
+    mova [rsp+ 96*16+t2*8], m10
+    mova [rsp+ 96*16+t3*8], m11
+    mova m8, [r3+16* 8]  ; 24 25
+    mova m10, [r3+16*10] ; 26 27
+    mova m12, [r3+16*12] ; 28 29
+    mova m14, [r3+16*14] ; 30 31
+    call m(idct_16x4_internal_16bpc).transpose4x8packed_hi
+    mova [rsp+224*16+t0*8], m8
+    mova [rsp+224*16+t1*8], m9
+    mova [rsp+224*16+t2*8], m10
+    mova [rsp+224*16+t3*8], m11
+%else
+    sub r3, 8*16
+    mova m0, [r3+ 8*16]
+    mova m2, [r3+10*16]
+    mova m4, [r3+12*16]
+    mova m6, [r3+14*16]
+    packssdw m0, [r3+ 9*16]
+    packssdw m2, [r3+11*16]
+    packssdw m4, [r3+13*16]
+    packssdw m6, [r3+15*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+ 96*16+t0*8], m0
+    mova [rsp+ 96*16+t1*8], m1
+    mova [rsp+ 96*16+t2*8], m2
+    mova [rsp+ 96*16+t3*8], m3
+    mova m0, [r3+16*16]
+    mova m2, [r3+18*16]
+    mova m4, [r3+20*16]
+    mova m6, [r3+22*16]
+    packssdw m0, [r3+17*16]
+    packssdw m2, [r3+19*16]
+    packssdw m4, [r3+21*16]
+    packssdw m6, [r3+23*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+160*16+t0*8], m0
+    mova [rsp+160*16+t1*8], m1
+    mova [rsp+160*16+t2*8], m2
+    mova [rsp+160*16+t3*8], m3
+    mova m0, [r3+31*16]
+    mova m2, [r3+29*16]
+    mova m4, [r3+27*16]
+    mova m6, [r3+25*16]
+    packssdw m0, [r3+30*16]
+    packssdw m2, [r3+28*16]
+    packssdw m4, [r3+26*16]
+    packssdw m6, [r3+24*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova [rsp+224*16+t0*8], m0
+    mova [rsp+224*16+t1*8], m1
+    mova [rsp+224*16+t2*8], m2
+    mova [rsp+224*16+t3*8], m3
+    mova m0, [r3+ 0*16]
+    mova m2, [r3+ 2*16]
+    mova m4, [r3+ 4*16]
+    mova m6, [r3+ 6*16]
+    packssdw m0, [r3+ 1*16]
+    packssdw m2, [r3+ 3*16]
+    packssdw m4, [r3+ 5*16]
+    packssdw m6, [r3+ 7*16]
+    call m(idct_8x4_internal_16bpc).transpose4x8packed
+%endif
+    mova [rsp+ 32*16+t0*8], m0
+    mova [rsp+ 32*16+t1*8], m1
+    mova [rsp+ 32*16+t2*8], m2
+    mova [rsp+ 32*16+t3*8], m3
+    pxor m0, m0
+    REPX {mova [cq+x*128+r5*8], m0},  0,  1,  2,  3,  4,  5,  6,  7, \
+                                      8,  9, 10, 11, 12, 13, 14, 15, \
+                                     16, 17, 18, 19, 20, 21, 22, 23, \
+                                     24, 25, 26, 27, 28, 29, 30, 31
+%if ARCH_X86_32
+    mov r6, [rsp+gprsize*3+(64*4+32)*16]
+%endif
+    sub r5d, 2
+    jge .loop_pass1
+
+    ; pass=2
+    mov eobd, [rsp+gprsize*0+(64*4+32)*16]
+    cmp eobd, 136
+    jl .fast
+    ; fall-through
+%if ARCH_X86_64
+    DECLARE_REG_TMP 8, 9
+%else
+    DECLARE_REG_TMP 1, 5
+%endif
+    lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+    lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+    jmp .run
+.fast:
+    lea t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+    lea t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+    add rsp, 29*16
+
+%if ARCH_X86_64
+    lea r2, [dstq+64]
+    mov r7, -8
+%else
+    lea r2, [rsp+(64*4+3)*16]
+    mov [r2+4*gprsize], t0
+    mov [r2+5*gprsize], t1
+    mov r1, [r2+2*gprsize]
+    mov dword [r2+0*gprsize], 4
+%endif
+    jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
+
+.dconly:
+    imul r5d, [cq], 181
+    mov [cq], eobd ; 0
+    mov r3d, 64
+    add r5d, 128
+    sar r5d, 8
+    imul r5d, 181
+    add r5d, 384
+    sar r5d, 9
+    add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
+    jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
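.main_part1 below performs idct64 steps 1-5: each odd input is first split into a low/high pair by a Q12 sin/cos pair fetched from the idct64_mul_16bpc table (the movd/pshufd broadcasts from r4), with the usual 2048 bias and 12-bit shift. One such split in scalar form (helper name mine; whether a given table pair is exactly sin/cos of the nominal angle is left to the table):

    #include <stdint.h>

    /* First idct64 butterfly: in -> (t_lo, t_hi), e.g. in1 -> t32a/t63a. */
    static inline void idct64_step1(int32_t in, int s, int c,
                                    int32_t *t_lo, int32_t *t_hi)
    {
        *t_lo = (int32_t)(((int64_t)in * s + 2048) >> 12);
        *t_hi = (int32_t)(((int64_t)in * c + 2048) >> 12);
    }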
+ test eobd, eobd + jz .dconly + + ; remove entirely-zero iterations +%undef cmp + mov r5d, 8 +.zero_loop: + sub r5d, 2 + cmp eobw, word [o2(tbl_32x16_2d)+r5] + jl .zero_loop + + ; actual first pass after skipping all-zero data +.loop_pass1: +%if ARCH_X86_64 + mova m11, [o(pd_2048)] + mova m12, [o(clip_18b_min)] + mova m13, [o(clip_18b_max)] + mova m14, [o(pd_2896)] +%endif + + mov r3, rsp + lea r4, [o(idct64_mul_16bpc)] + mova m0, [cq+64* 1+r5*8] + mova m1, [cq+64*31+r5*8] + mova m2, [cq+64*17+r5*8] + mova m3, [cq+64*15+r5*8] + call .main_part1 + mova m0, [cq+64* 7+r5*8] + mova m1, [cq+64*25+r5*8] + mova m2, [cq+64*23+r5*8] + mova m3, [cq+64* 9+r5*8] + call .main_part1 + mova m0, [cq+64* 5+r5*8] + mova m1, [cq+64*27+r5*8] + mova m2, [cq+64*21+r5*8] + mova m3, [cq+64*11+r5*8] + call .main_part1 + mova m0, [cq+64* 3+r5*8] + mova m1, [cq+64*29+r5*8] + mova m2, [cq+64*19+r5*8] + mova m3, [cq+64*13+r5*8] + call .main_part1 + call .main_part2 + + mova m0, [cq+64* 2+r5*8] + mova m1, [cq+64*14+r5*8] + mova m2, [cq+64*18+r5*8] + mova m3, [cq+64*30+r5*8] + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast + + mova m0, [cq+64* 6+r5*8] + mova m1, [cq+64*10+r5*8] + mova m2, [cq+64*22+r5*8] + mova m3, [cq+64*26+r5*8] + call m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast + add r3, 16*(24+4*ARCH_X86_32) + + mova m0, [cq+64* 4+r5*8] + mova m1, [cq+64*12+r5*8] + mova m2, [cq+64*20+r5*8] + mova m3, [cq+64*28+r5*8] + call m(idct_16x4_internal_16bpc).main_oddhalf_fast + + mova m0, [cq+64* 0+r5*8] + mova m1, [cq+64* 8+r5*8] + mova m2, [cq+64*16+r5*8] + mova m3, [cq+64*24+r5*8] + call m(idct_8x4_internal_16bpc).main_pass1_fast + call m(idct_8x4_internal_16bpc).round + mova [r3-(7+4*ARCH_X86_32)*16], m1 + mova [r3-(6+4*ARCH_X86_32)*16], m2 + mova [r3-(5+4*ARCH_X86_32)*16], m3 + mova [r3-(4+4*ARCH_X86_32)*16], m4 + mova [r3-(3+4*ARCH_X86_32)*16], m5 + mova [r3-(2+4*ARCH_X86_32)*16], m6 + mova [r3-(1+4*ARCH_X86_32)*16], m7 + sub r3, 16*(40+4*ARCH_X86_32-4) + +%if ARCH_X86_64 + psrld m15, m11, 10 ; pd_2 +%else + mova m7, [o(pd_2)] +%endif + call .main_end_loop_start + + lea r3, [rsp+56*16] + lea r4, [cq+r5*8+64*28] + call .shift_transpose + sub r5d, 2 + jge .loop_pass1 + + ; pass=2, we need to call this otherwise the stack pointer has + ; the wrong offset in the 8-bit code + call .pass2 + RET + +.pass2: +%if ARCH_X86_64 + mova m8, [o(pw_2048)] + pxor m9, m9 + mova m10, [o(pixel_10bpc_max)] +%if WIN64 + mov [rsp+16*16+gprsize], r7 +%endif + mov r7, dstq +%else + mov [rsp+2*gprsize+16*16], dstq +%endif + lea r3, [strideq*3] + mov r4d, 8 + jmp m(idct_16x16_internal_16bpc).loop_pass2 + +.main_part1: ; idct64 steps 1-5 + ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a +%if ARCH_X86_64 + movd m7, [r4+4*0] + movd m8, [r4+4*1] + movd m6, [r4+4*2] + movd m9, [r4+4*3] + movd m5, [r4+4*4] + movd m10, [r4+4*5] + movd m4, [r4+4*6] + movd m15, [r4+4*7] + REPX {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15 + pmulld m7, m0 ; t63a + pmulld m0, m8 ; t32a + pmulld m6, m1 ; t62a + pmulld m1, m9 ; t33a + pmulld m5, m2 ; t61a + pmulld m2, m10 ; t34a + pmulld m4, m3 ; t60a + pmulld m3, m15 ; t35a + movd m10, [r4+4*8] + movd m15, [r4+4*9] + REPX {pshufd x, x, q0000}, m10, m15 + REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3 + REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4 + psubd m8, m0, m1 ; t33 + paddd m0, m1 ; t32 + psubd m1, m7, m6 ; t62 + paddd 
+.main_part1: ; idct64 steps 1-5
+    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+%if ARCH_X86_64
+    movd      m7, [r4+4*0]
+    movd      m8, [r4+4*1]
+    movd      m6, [r4+4*2]
+    movd      m9, [r4+4*3]
+    movd      m5, [r4+4*4]
+    movd      m10, [r4+4*5]
+    movd      m4, [r4+4*6]
+    movd      m15, [r4+4*7]
+    REPX      {pshufd x, x, q0000}, m7, m8, m6, m9, m5, m10, m4, m15
+    pmulld    m7, m0 ; t63a
+    pmulld    m0, m8 ; t32a
+    pmulld    m6, m1 ; t62a
+    pmulld    m1, m9 ; t33a
+    pmulld    m5, m2 ; t61a
+    pmulld    m2, m10 ; t34a
+    pmulld    m4, m3 ; t60a
+    pmulld    m3, m15 ; t35a
+    movd      m10, [r4+4*8]
+    movd      m15, [r4+4*9]
+    REPX      {pshufd x, x, q0000}, m10, m15
+    REPX      {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
+    REPX      {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
+    psubd     m8, m0, m1 ; t33
+    paddd     m0, m1 ; t32
+    psubd     m1, m7, m6 ; t62
+    paddd     m7, m6 ; t63
+    psubd     m6, m3, m2 ; t34
+    paddd     m3, m2 ; t35
+    psubd     m2, m4, m5 ; t61
+    paddd     m4, m5 ; t60
+    REPX      {pmaxsd x, m12}, m8, m1, m6, m2
+    REPX      {pminsd x, m13}, m8, m1, m6, m2
+    ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
+    ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
+    REPX      {pmaxsd x, m12}, m0, m3, m7, m4
+    REPX      {pminsd x, m13}, m0, m3, m7, m4
+    movd      m10, [r4+4*10]
+    movd      m15, [r4+4*11]
+    REPX      {pshufd x, x, q0000}, m10, m15
+    psubd     m5, m0, m3 ; t35a
+    paddd     m0, m3 ; t32a
+    psubd     m3, m7, m4 ; t60a
+    paddd     m7, m4 ; t63a
+    psubd     m4, m1, m6 ; t34
+    paddd     m1, m6 ; t33
+    psubd     m6, m8, m2 ; t61
+    paddd     m8, m2 ; t62
+    REPX      {pmaxsd x, m12}, m5, m3, m4, m6
+    REPX      {pminsd x, m13}, m5, m3, m4, m6
+    ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
+    ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
+    REPX      {pmaxsd x, m12}, m0, m7, m1, m8
+    REPX      {pminsd x, m13}, m0, m7, m1, m8
+    add       r4, 4*12
+    mova      [r3+16*0], m0
+    mova      [r3+16*7], m7
+    mova      [r3+16*1], m1
+    mova      [r3+16*6], m8
+    mova      [r3+16*2], m6
+    mova      [r3+16*5], m4
+    mova      [r3+16*3], m3
+    mova      [r3+16*4], m5
+%else
+    movd      m7, [r4+4*0]
+    movd      m6, [r4+4*2]
+    movd      m5, [r4+4*4]
+    movd      m4, [r4+4*6]
+    REPX      {pshufd x, x, q0000}, m7, m6, m5, m4
+    pmulld    m7, m0 ; t63a
+    pmulld    m6, m1 ; t62a
+    pmulld    m5, m2 ; t61a
+    pmulld    m4, m3 ; t60a
+    mova      [r3+0*16], m6
+    mova      [r3+1*16], m7
+    movd      m6, [r4+4*1]
+    movd      m7, [r4+4*3]
+    REPX      {pshufd x, x, q0000}, m7, m6
+    pmulld    m0, m6 ; t32a
+    pmulld    m1, m7 ; t33a
+    movd      m6, [r4+4*5]
+    movd      m7, [r4+4*7]
+    REPX      {pshufd x, x, q0000}, m7, m6
+    pmulld    m2, m6 ; t34a
+    pmulld    m3, m7 ; t35a
+    mova      m6, [r3+0*16]
+    mova      m7, [o(pd_2048)]
+    REPX      {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6
+    paddd     m7, [r3+1*16]
+    REPX      {psrad x, 12}, m0, m1, m7, m6, m2, m3, m5, m4
+    mova      [r3+0*16], m5
+    psubd     m5, m0, m1 ; t33
+    paddd     m0, m1 ; t32
+    mova      [r3+1*16], m0
+    mova      m0, [r3+0*16]
+    psubd     m1, m7, m6 ; t62
+    paddd     m7, m6 ; t63
+    psubd     m6, m3, m2 ; t34
+    paddd     m3, m2 ; t35
+    psubd     m2, m4, m0 ; t61
+    paddd     m4, m0 ; t60
+    mova      m0, [o(clip_18b_min)]
+    REPX      {pmaxsd x, m0}, m5, m1, m7, m6, m3, m2, m4
+    pmaxsd    m0, [r3+1*16]
+    mova      [r3+0*16], m0
+    mova      m0, [o(clip_18b_max)]
+    REPX      {pminsd x, m0}, m5, m1, m7, m6, m3, m2, m4
+    pminsd    m0, [r3+0*16]
+    mova      [r3+0*16], m0
+    mova      [r3+1*16], m3
+    mova      [r3+2*16], m4
+    mova      [r3+3*16], m7
+    mova      m0, [o(pd_2048)]
+    movd      m3, [r4+4*8]
+    movd      m4, [r4+4*9]
+    REPX      {pshufd x, x, q0000}, m3, m4
+    mova      [r3+4*16], m2
+    ITX_MULSUB_2D 1, 5, 2, 7, _, 0, 3, 4 ; t33a, t62a
+    mova      m2, [r3+4*16]
+    mova      [r3+4*16], m5
+    ITX_MULSUB_2D 2, 6, 5, 7, _, 0, 3, 4, 4 ; t61a, t34a
+    mova      m0, [r3+0*16]
+    mova      m3, [r3+1*16]
+    mova      m4, [r3+2*16]
+    mova      m7, [r3+3*16]
+    psubd     m5, m0, m3 ; t35a
+    paddd     m0, m3 ; t32a
+    mova      [r3+0*16], m5
+    mova      m5, [r3+4*16]
+    psubd     m3, m7, m4 ; t60a
+    paddd     m7, m4 ; t63a
+    psubd     m4, m1, m6 ; t34
+    paddd     m1, m6 ; t33
+    psubd     m6, m5, m2 ; t61
+    paddd     m2, m5 ; t62
+    mova      m5, [o(clip_18b_min)]
+    REPX      {pmaxsd x, m5}, m0, m3, m7, m4, m1, m6, m2
+    pmaxsd    m5, [r3+0*16]
+    mova      [r3+0*16], m5
+    mova      m5, [o(clip_18b_max)]
+    REPX      {pminsd x, m5}, m0, m3, m7, m4, m1, m6, m2
+    pminsd    m5, [r3+0*16]
+    mova      [r3+16*0], m0
+    mova      [r3+16*7], m7
+    mova      [r3+16*1], m1
+    mova      [r3+16*6], m2
+    mova      [r3+16*2], m4
+    mova      m7, [o(pd_2048)]
+    movd      m0, [r4+4*10]
+    movd      m1, [r4+4*11]
+    REPX      {pshufd x, x, q0000}, m0, m1
+    ITX_MULSUB_2D 3, 5, 2, 4, _, 7, 0, 1 ; t35, t60
+    mova      [r3+16*3], m3
+    mova      [r3+16*4], m5
+    mova      m4, [r3+2*16]
+    ITX_MULSUB_2D 6, 4, 2, 3, _, 7, 0, 1 ; t34a, t61a
+    add       r4, 4*12
+    mova      [r3+16*2], m6
+    mova      [r3+16*5], m4
+%endif
+    add       r3, 16*8
+    ret
+
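+; .main_part2 advances r3 and decrements r4 until the pointers meet,
+; combining mirrored t32..t63 pairs; 1567/3784 are round(4096*sin/cos(pi/8))
+; and the pd_2896 multiply (2896 ~= 4096/sqrt(2)) applies the 1/sqrt(2)
+; scale of the middle butterflies, rounded with +2048 and >>12.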
+.main_part2: ; idct64 steps 6-9
+    lea       r4, [r3+16*7]
+%if ARCH_X86_64
+    mova      m10, [o(pd_1567)]
+    mova      m15, [o(pd_3784)]
+.main_part2_loop:
+    mova      m0, [r3-16*32] ; t32a
+    mova      m1, [r4-16*24] ; t39a
+    mova      m2, [r4-16*32] ; t63a
+    mova      m3, [r3-16*24] ; t56a
+    mova      m4, [r3-16*16] ; t40a
+    mova      m5, [r4-16* 8] ; t47a
+    mova      m6, [r4-16*16] ; t55a
+    mova      m7, [r3-16* 8] ; t48a
+    psubd     m8, m0, m1 ; t39
+    paddd     m0, m1 ; t32
+    psubd     m1, m2, m3 ; t56
+    paddd     m2, m3 ; t63
+    psubd     m3, m5, m4 ; t40
+    paddd     m5, m4 ; t47
+    psubd     m4, m7, m6 ; t55
+    paddd     m7, m6 ; t48
+    REPX      {pmaxsd x, m12}, m8, m1, m3, m4
+    REPX      {pminsd x, m13}, m8, m1, m3, m4
+    ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
+    ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
+    REPX      {pmaxsd x, m12}, m0, m2, m5, m7
+    REPX      {pminsd x, m13}, m0, m5, m2, m7
+    psubd     m6, m2, m7 ; t48a
+    paddd     m2, m7 ; t63a
+    psubd     m7, m0, m5 ; t47a
+    paddd     m0, m5 ; t32a
+    psubd     m5, m8, m4 ; t55
+    paddd     m8, m4 ; t56
+    psubd     m4, m1, m3 ; t40
+    paddd     m1, m3 ; t39
+    REPX      {pmaxsd x, m12}, m6, m7, m5, m4
+    REPX      {pminsd x, m13}, m6, m7, m5, m4
+    REPX      {pmulld x, m14}, m6, m7, m5, m4
+    REPX      {pmaxsd x, m12}, m2, m0, m8, m1
+    REPX      {pminsd x, m13}, m2, m0, m8, m1
+    paddd     m6, m11
+    paddd     m5, m11
+    psubd     m3, m6, m7 ; t47
+    paddd     m6, m7 ; t48
+    psubd     m7, m5, m4 ; t40a
+    paddd     m5, m4 ; t55a
+    REPX      {psrad x, 12}, m3, m6, m7, m5
+    mova      [r4-16* 8], m2
+    mova      [r3-16*32], m0
+    mova      [r3-16* 8], m8
+    mova      [r4-16*32], m1
+    mova      [r4-16*24], m3
+    mova      [r3-16*16], m6
+    mova      [r3-16*24], m7
+    mova      [r4-16*16], m5
+%else
+.main_part2_loop:
+    mova      m0, [r3-16*32] ; t32a
+    mova      m1, [r4-16*24] ; t39a
+    mova      m2, [r4-16*32] ; t63a
+    mova      m3, [r3-16*24] ; t56a
+    mova      m4, [r3-16*16] ; t40a
+    mova      m5, [r4-16* 8] ; t47a
+    mova      m6, [r4-16*16] ; t55a
+    psubd     m7, m0, m1 ; t39
+    paddd     m0, m1 ; t32
+    mova      [r3+0*16], m7
+    mova      m7, [r3-16* 8] ; t48a
+    psubd     m1, m2, m3 ; t56
+    paddd     m2, m3 ; t63
+    psubd     m3, m5, m4 ; t40
+    paddd     m5, m4 ; t47
+    psubd     m4, m7, m6 ; t55
+    paddd     m7, m6 ; t48
+    mova      m6, [o(clip_18b_min)]
+    REPX      {pmaxsd x, m6}, m0, m1, m2, m3, m5, m4, m7
+    pmaxsd    m6, [r3+0*16]
+    mova      [r3+0*16], m6
+    mova      m6, [o(clip_18b_max)]
+    REPX      {pminsd x, m6}, m0, m1, m2, m3, m5, m4, m7
+    pminsd    m6, [r3+0*16]
+    mova      [r3+0*16], m0
+    mova      [r3+1*16], m2
+    mova      [r3+2*16], m5
+    mova      [r3+3*16], m7
+    mova      m0, [o(pd_2048)]
+    ITX_MULSUB_2D 1, 6, 2, 5, 7, 0, 1567, 3784 ; t39a, t56a
+    ITX_MULSUB_2D 4, 3, 2, 5, _, 0, 7, 3784, 4 ; t55a, t40a
+    mova      m2, [r3+1*16]
+    mova      m7, [r3+3*16]
+    psubd     m5, m2, m7 ; t48a
+    paddd     m2, m7 ; t63a
+    mova      [r3+1*16], m5
+    mova      m0, [r3+0*16]
+    mova      m5, [r3+2*16]
+    psubd     m7, m0, m5 ; t47a
+    paddd     m0, m5 ; t32a
+    psubd     m5, m6, m4 ; t55
+    paddd     m6, m4 ; t56
+    psubd     m4, m1, m3 ; t40
+    paddd     m1, m3 ; t39
+    mova      m3, [o(clip_18b_min)]
+    REPX      {pmaxsd x, m3}, m2, m7, m0, m5, m6, m4, m1
+    pmaxsd    m3, [r3+1*16]
+    mova      [r3+0*16], m3
+    mova      m3, [o(clip_18b_max)]
+    REPX      {pminsd x, m3}, m2, m7, m0, m5, m6, m4, m1
+    pminsd    m3, [r3+0*16]
+    mova      [r4-16* 8], m2
+    mova      [r3-16*32], m0
+    mova      [r3-16* 8], m6
+    mova      [r4-16*32], m1
+    mova      m0, [o(pd_2896)]
+    mova      m1, [o(pd_2048)]
+    REPX      {pmulld x, m0}, m3, m7, m5, m4
+    REPX      {paddd x, m1}, m3, m5
+    psubd     m6, m3, m7 ; t47
+    paddd     m3, m7 ; t48
+    psubd     m7, m5, m4 ; t40a
+    paddd     m5, m4 ; t55a
+    REPX      {psrad x, 12}, m6, m3, m7, m5
+    mova      [r4-16*24], m6
+    mova      [r3-16*16], m3
+    mova      [r3-16*24], m7
+    mova      [r4-16*16], m5
+%endif
+    add       r3, 16
+    sub       r4, 16
+    cmp       r3, r4
+    jl        .main_part2_loop
+    sub       r3, 4*16
+    ret
+
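+; .main_end_loop is the final reconstruction: it folds the buffered idct8,
+; idct16 and idct32 partials into the idct64 odd half with plain add/sub
+; butterflies, clips to 18 bits, and pre-adds the rounding bias from
+; m15/m7 (pd_2 here, pd_1 in the 64x32 case) so that .shift_transpose
+; only needs a final arithmetic shift.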
+.main_end_loop:
+    mova      m0, [r3+16*28] ; idct8 0 + n
+.main_end_loop_start:
+    mova      m2, [r3+16*12] ; idct32 16 + n
+    mova      m3, [r4+16*12] ; idct32 31 - n
+%if ARCH_X86_64
+    mova      m1, [r4+16*28] ; idct16 15 - n
+    mova      m4, [r4-16* 4] ; idct64 63 - n
+    mova      m5, [r3-16* 4] ; idct64 48 + n
+    mova      m6, [r4-16*20] ; idct64 47 - n
+    mova      m7, [r3-16*20] ; idct64 32 + n
+    pmaxsd    m0, m12
+    pminsd    m0, m13
+    paddd     m8, m0, m1 ; idct16 out0 + n
+    psubd     m0, m1 ; idct16 out15 - n
+    REPX      {pmaxsd x, m12}, m8, m0
+    REPX      {pminsd x, m13}, m8, m0
+    paddd     m1, m8, m3 ; idct32 out0 + n
+    psubd     m8, m3 ; idct32 out31 - n
+    paddd     m3, m0, m2 ; idct32 out15 - n
+    psubd     m0, m2 ; idct32 out16 + n
+    REPX      {pmaxsd x, m12}, m1, m8, m3, m0
+    REPX      {pminsd x, m13}, m1, m3, m8, m0
+    REPX      {paddd x, m15}, m1, m3, m0, m8
+    paddd     m2, m1, m4 ; idct64 out0 + n (unshifted)
+    psubd     m1, m4 ; idct64 out63 - n (unshifted)
+    paddd     m4, m3, m5 ; idct64 out15 - n (unshifted)
+    psubd     m3, m5 ; idct64 out48 + n (unshifted)
+    paddd     m5, m0, m6 ; idct64 out16 + n (unshifted)
+    psubd     m0, m6 ; idct64 out47 - n (unshifted)
+    paddd     m6, m8, m7 ; idct64 out31 - n (unshifted)
+    psubd     m8, m7 ; idct64 out32 + n (unshifted)
+    mova      [r3-16*20], m2
+    mova      [r4+16*28], m1
+    mova      [r4-16*20], m4
+    mova      [r3+16*28], m3
+    mova      [r3-16* 4], m5
+    mova      [r4+16*12], m0
+    mova      [r4-16* 4], m6
+    mova      [r3+16*12], m8
+%else
+    mova      m5, [o(clip_18b_min)]
+    mova      m6, [o(clip_18b_max)]
+    mova      m1, [r3+16*44] ; idct16 15 - n
+    pmaxsd    m0, m5
+    pminsd    m0, m6
+    paddd     m4, m0, m1 ; idct16 out0 + n
+    psubd     m0, m1 ; idct16 out15 - n
+    REPX      {pmaxsd x, m5}, m4, m0
+    REPX      {pminsd x, m6}, m4, m0
+    paddd     m1, m4, m3 ; idct32 out0 + n
+    psubd     m4, m3 ; idct32 out31 - n
+    paddd     m3, m0, m2 ; idct32 out15 - n
+    psubd     m0, m2 ; idct32 out16 + n
+    REPX      {pmaxsd x, m5}, m1, m4, m3, m0
+    REPX      {pminsd x, m6}, m1, m3, m4, m0
+    REPX      {paddd x, m7}, m1, m3, m0, m4
+    mova      m5, [r4-16* 4] ; idct64 63 - n
+    mova      m6, [r3-16* 4] ; idct64 48 + n
+    paddd     m2, m1, m5 ; idct64 out0 + n (unshifted)
+    psubd     m1, m5 ; idct64 out63 - n (unshifted)
+    paddd     m5, m3, m6 ; idct64 out15 - n (unshifted)
+    psubd     m3, m6 ; idct64 out48 + n (unshifted)
+    mova      [r4+16*28], m1
+    mova      [r3+16*28], m3
+    mova      m6, [r4-16*20] ; idct64 47 - n
+    mova      m1, [r3-16*20] ; idct64 32 + n
+    mova      [r3-16*20], m2
+    mova      [r4-16*20], m5
+    paddd     m5, m0, m6 ; idct64 out16 + n (unshifted)
+    psubd     m0, m6 ; idct64 out47 - n (unshifted)
+    paddd     m6, m4, m1 ; idct64 out31 - n (unshifted)
+    psubd     m4, m1 ; idct64 out32 + n (unshifted)
+    mova      [r3-16* 4], m5
+    mova      [r4+16*12], m0
+    mova      [r4-16* 4], m6
+    mova      [r3+16*12], m4
+%endif
+    sub       r4, 16
+    add       r3, 16
+    cmp       r3, r4
+    jl        .main_end_loop
+    ret
+
+.shift_transpose:
+    mova      m0, [r3+0*16]
+    mova      m1, [r3+1*16]
+    mova      m2, [r3+2*16]
+    mova      m3, [r3+3*16]
+    mova      m4, [r3+4*16]
+    mova      m5, [r3+5*16]
+    mova      m6, [r3+6*16]
+    mova      m7, [r3+7*16]
+    REPX      {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw  m0, m1
+    packssdw  m2, m3
+    packssdw  m4, m5
+    packssdw  m6, m7
+    call      m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova      [r4+0*64], m0
+    mova      [r4+1*64], m1
+    mova      [r4+2*64], m2
+    mova      [r4+3*64], m3
+    sub       r4, 4*64
+    sub       r3, 8*16
+    cmp       r3, rsp
+    jg        .shift_transpose
+    ret
+
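+; dc-only path: 181 ~= 2^8/sqrt(2) and 2896 ~= 2^12/sqrt(2), so the
+; imul/add/sar chains apply the dc normalization with rounding; pshuflw
+; q1111 broadcasts the high word of the product, i.e. a free >>16.
+; btc r3d, 16 toggles bit 16 and returns the old bit in CF, making the
+; store loop below run twice (2x64 bytes, one 64-pixel row) per line.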
+.dconly:
+    imul      r5d, [cq], 181
+    mov       [cq], eobd ; 0
+    mov       r3d, 16
+.dconly1:
+    add       r5d, 640
+    sar       r5d, 10
+.dconly2:
+    imul      r5d, 2896
+    add       r5d, 34816
+    movd      m0, r5d
+    pshuflw   m0, m0, q1111
+    punpcklqdq m0, m0
+    mova      m6, [o(pixel_10bpc_max)]
+    pxor      m5, m5
+.dconly_loop:
+    paddw     m1, m0, [dstq+16*0]
+    paddw     m2, m0, [dstq+16*1]
+    paddw     m3, m0, [dstq+16*2]
+    paddw     m4, m0, [dstq+16*3]
+    REPX      {pmaxsw x, m5}, m1, m2, m3, m4
+    REPX      {pminsw x, m6}, m1, m2, m3, m4
+    mova      [dstq+16*0], m1
+    mova      [dstq+16*1], m2
+    mova      [dstq+16*2], m3
+    mova      [dstq+16*3], m4
+    add       dstq, 64
+    btc       r3d, 16
+    jnc       .dconly_loop
+    lea       dstq, [dstq+strideq-128]
+    dec       r3d
+    jg        .dconly_loop
+    RET
+
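+; 64x32 uses the same pass-1 scheme as 64x16, but as a rectangular (2:1)
+; transform it first scales the input by 1/sqrt(2) via .rect2_mul_fast
+; (*2896 + 2048, >>12) and rounds pass 1 by 1 bit (pd_1, psrad 1) instead
+; of 2; pass 2 is shared with the 16x32 row loop.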
+cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
+                                          0-(1+64+8*ARCH_X86_32+8*32+1*WIN64)*16, \
+                                          dst, stride, c, eob
+    LEA       r6, base
+    test      eobd, eobd
+    jz        .dconly
+
+%if ARCH_X86_32
+    DECLARE_REG_TMP 0, 4, 1
+    mov       [rsp+(8*32+64+8)*16+1*gprsize], dstq
+    mov       [rsp+(8*32+64+8)*16+2*gprsize], strideq
+%else
+    DECLARE_REG_TMP 4, 7, 8
+%if WIN64
+    mov       [rsp+(8*32+64+1)*16+1*gprsize], r7
+    mov       [rsp+64*16+0*gprsize], r8
+%endif
+%endif
+%undef cmp
+    ; remove entirely-zero iterations
+    mov       r5d, 14
+    cmp       eobw, word [o2(tbl_32x32_2d)+r5]
+    jge       .end_zero_loop
+    pxor      m0, m0
+.zero_loop:
+    movzx     t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx     t1d, t0b
+    shr       t0d, 8
+    lea       t2, [rsp+7*32*16]
+.zero_loop_inner:
+    mova      [t2+(64+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
+    mova      [t2+(72+8*ARCH_X86_32+1*WIN64)*16+r5*8], m0
+    mova      [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t0*8], m0
+    mova      [t2+(64+8*ARCH_X86_32+1*WIN64)*16+t1*8], m0
+    sub       t2, 32*16
+    cmp       t2, rsp
+    jge       .zero_loop_inner
+    sub       r5d, 2
+    cmp       eobw, word [o2(tbl_32x32_2d)+r5]
+    jl        .zero_loop
+.end_zero_loop:
+    mov       [rsp+(8*32+64+8*ARCH_X86_32+1*WIN64)*16+0*gprsize], eobd
+    ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+    mova      m11, [o(pd_2048)]
+    mova      m12, [o(clip_18b_min)]
+    mova      m13, [o(clip_18b_max)]
+    mova      m14, [o(pd_2896)]
+%endif
+
+    mov       r3, rsp
+    lea       r4, [o(idct64_mul_16bpc)]
+    mova      m0, [cq+128* 1+r5*8]
+    mova      m1, [cq+128*31+r5*8]
+    mova      m2, [cq+128*17+r5*8]
+    mova      m3, [cq+128*15+r5*8]
+    call      .rect2_mul_fast
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+    mova      m0, [cq+128* 7+r5*8]
+    mova      m1, [cq+128*25+r5*8]
+    mova      m2, [cq+128*23+r5*8]
+    mova      m3, [cq+128* 9+r5*8]
+    call      .rect2_mul_fast
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+    mova      m0, [cq+128* 5+r5*8]
+    mova      m1, [cq+128*27+r5*8]
+    mova      m2, [cq+128*21+r5*8]
+    mova      m3, [cq+128*11+r5*8]
+    call      .rect2_mul_fast
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+    mova      m0, [cq+128* 3+r5*8]
+    mova      m1, [cq+128*29+r5*8]
+    mova      m2, [cq+128*19+r5*8]
+    mova      m3, [cq+128*13+r5*8]
+    call      .rect2_mul_fast
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
+
+    mova      m0, [cq+128* 2+r5*8]
+    mova      m1, [cq+128*14+r5*8]
+    mova      m2, [cq+128*18+r5*8]
+    mova      m3, [cq+128*30+r5*8]
+    call      .rect2_mul_fast
+    call      m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+    mova      m0, [cq+128* 6+r5*8]
+    mova      m1, [cq+128*10+r5*8]
+    mova      m2, [cq+128*22+r5*8]
+    mova      m3, [cq+128*26+r5*8]
+    call      .rect2_mul_fast
+    call      m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+    add       r3, 16*(24+4*ARCH_X86_32)
+
+    mova      m0, [cq+128* 4+r5*8]
+    mova      m1, [cq+128*12+r5*8]
+    mova      m2, [cq+128*20+r5*8]
+    mova      m3, [cq+128*28+r5*8]
+    call      .rect2_mul_fast
+    call      m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+    mova      m0, [cq+128* 0+r5*8]
+    mova      m1, [cq+128* 8+r5*8]
+    mova      m2, [cq+128*16+r5*8]
+    mova      m3, [cq+128*24+r5*8]
+    call      .rect2_mul_fast
+    call      m(idct_8x4_internal_16bpc).main_pass1_fast
+    call      m(idct_8x4_internal_16bpc).round
+    mova      [r3-(7+4*ARCH_X86_32)*16], m1
+    mova      [r3-(6+4*ARCH_X86_32)*16], m2
+    mova      [r3-(5+4*ARCH_X86_32)*16], m3
+    mova      [r3-(4+4*ARCH_X86_32)*16], m4
+    mova      [r3-(3+4*ARCH_X86_32)*16], m5
+    mova      [r3-(2+4*ARCH_X86_32)*16], m6
+    mova      [r3-(1+4*ARCH_X86_32)*16], m7
+    sub       r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+    psrld     m15, m11, 11 ; pd_1
+%else
+    mova      m7, [o(pd_1)]
+%endif
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
+
+    lea       r3, [rsp+56*16]
+    lea       t2, [rsp+7*32*16+(64+8*ARCH_X86_32+1*WIN64)*16]
+    movzx     t0d, word [o2(tbl_Nx32_odd_offset)+r5]
+    movzx     t1d, t0b
+    shr       t0d, 8
+    call      .shift_transpose
+    ; zero cq
+    pxor      m7, m7
+    lea       r4, [cq+30*128+r5*8]
+.zero_cq_loop:
+    REPX      {mova [r4+x*128], m7}, -2, -1, 0, 1
+    sub       r4, 4*128
+    cmp       r4, cq
+    jg        .zero_cq_loop
+    sub       r5d, 2
+    jge       .loop_pass1
+
+    ; pass=2 code starts here
+    mov       eobd, [rsp+gprsize*0+(8*32+64+8*ARCH_X86_32+1*WIN64)*16]
+%if ARCH_X86_32
+    mov       strideq, [rsp+gprsize*2+(8*32+64+8)*16]
+%elif WIN64
+    mov       r8, [rsp+gprsize*0+64*16]
+%endif
+    add       rsp, (64+8*ARCH_X86_32+1*WIN64-3)*16
+    cmp       eobd, 36
+    jl        .load_veryfast
+    cmp       eobd, 136
+    jl        .load_fast
+    ; load normal
+    lea       r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main)]
+    jmp       .run
+.load_fast:
+    lea       r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+    jmp       .run
+.load_veryfast:
+    lea       r4, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+    ; fall-through
+.run:
+%if ARCH_X86_64
+    lea       r2, [dstq+128]
+    mov       r7, -16
+%else
+    lea       r2, [rsp+(8*32+3)*16]
+    mov       dword [r2+0*gprsize], 8
+%endif
+    jmp       m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
+
+.rect2_mul_fast:
+%if ARCH_X86_64
+    REPX      {pmulld x, m14}, m0, m1, m2, m3
+    REPX      {paddd x, m11}, m0, m1, m2, m3
+%else
+    mova      m4, [o(pd_2896)]
+    mova      m5, [o(pd_2048)]
+    REPX      {pmulld x, m4 }, m0, m1, m2, m3
+    REPX      {paddd x, m5 }, m0, m1, m2, m3
+%endif
+    REPX      {psrad x, 12 }, m0, m1, m2, m3
+    ret
+
+.shift_transpose:
+    mova      m0, [r3+0*16]
+    mova      m1, [r3+1*16]
+    mova      m2, [r3+2*16]
+    mova      m3, [r3+3*16]
+    mova      m4, [r3+4*16]
+    mova      m5, [r3+5*16]
+    mova      m6, [r3+6*16]
+    mova      m7, [r3+7*16]
+    REPX      {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw  m0, m1
+    packssdw  m2, m3
+    packssdw  m4, m5
+    packssdw  m6, m7
+    call      m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova      [t2+0*16+r5*8], m0
+    mova      [t2+8*16+r5*8], m2
+    mova      [t2+0*16+t0*8], m3
+    mova      [t2+0*16+t1*8], m1
+    sub       t2, 16*32
+    sub       r3, 8*16
+    cmp       r3, rsp
+    jg        .shift_transpose
+    ret
+
+.dconly:
+    imul      r5d, [cq], 181
+    mov       [cq], eobd ; 0
+    mov       r3d, 32
+    add       r5d, 128
+    sar       r5d, 8
+    imul      r5d, 181
+    add       r5d, 384
+    sar       r5d, 9
+    add       rsp, (1+8*32+1*WIN64)*16
+    jmp       m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
+
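+; 64x64: pass 1 is the 64x16 column transform once more (square, so no
+; rect2 scaling, 2-bit rounding), scattered into a 64-wide temporary
+; buffer through the tbl_Nx64_offset lookup; pass 2 reuses the 16x64 row
+; loop with 8-bit ssse3 kernels picked according to eob.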
+cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
+                                          0-(64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16-(4+4*ARCH_X86_32)*gprsize, \
+                                          dst, stride, c, eob
+    LEA       r6, base
+    test      eobd, eobd
+    jz        .dconly
+
+%if ARCH_X86_32
+    DECLARE_REG_TMP 4, 1, 2, 0, 6
+    mov       [rsp+gprsize*1+(64*9+8)*16], r0
+    mov       [rsp+gprsize*2+(64*9+8)*16], r1
+    mov       [rsp+gprsize*3+(64*9+8)*16], r2
+    mov       [rsp+gprsize*4+(64*9+8)*16], r6
+%else
+    DECLARE_REG_TMP 8, 9, 4, 7, 0
+    mov       [rsp+gprsize*1+(64*9+1)*16], r9
+    mov       [rsp+gprsize*0+64*16], r0
+%if WIN64
+    mov       [rsp+gprsize*2+(64*9+1)*16], r7
+    mov       [rsp+gprsize*3+(64*9+1)*16], r8
+%endif
+%endif
+%undef cmp
+
+    ; remove entirely-zero iterations
+    mov       r5d, 14
+    cmp       eobw, word [o2(tbl_32x32_2d)+r5]
+    jge       .end_zero_loop
+    pxor      m0, m0
+.zero_loop:
+    movzx     t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+    movzx     t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+    movzx     t0d, t1b
+    movzx     t2d, t3b
+    shr       t1d, 8
+    shr       t3d, 8
+    lea       t4, [rsp+7*64*16]
+.zero_loop_inner:
+    mova      [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t0*8], m0
+    mova      [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t1*8], m0
+    mova      [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t2*8], m0
+    mova      [t4+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16+t3*8], m0
+    sub       t4, 64*16
+    cmp       t4, rsp
+    jge       .zero_loop_inner
+%if ARCH_X86_32
+    mov       r6, [rsp+gprsize*4+(64*9+8)*16]
+%endif
+    sub       r5d, 2
+    cmp       eobw, word [o2(tbl_32x32_2d)+r5]
+    jl        .zero_loop
+.end_zero_loop:
+    mov       [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16], eobd
+%if ARCH_X86_32
+    mov       cq, [rsp+gprsize*3+(64*9+8)*16]
+%endif
+    ; actual first pass after skipping all-zero data
+.loop_pass1:
+%if ARCH_X86_64
+    mova      m11, [o(pd_2048)]
+    mova      m12, [o(clip_18b_min)]
+    mova      m13, [o(clip_18b_max)]
+    mova      m14, [o(pd_2896)]
+%endif
+
+    mov       r3, rsp
+    lea       r4, [o(idct64_mul_16bpc)]
+    mova      m0, [cq+128* 1+r5*8]
+    mova      m1, [cq+128*31+r5*8]
+    mova      m2, [cq+128*17+r5*8]
+    mova      m3, [cq+128*15+r5*8]
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+    mova      m0, [cq+128* 7+r5*8]
+    mova      m1, [cq+128*25+r5*8]
+    mova      m2, [cq+128*23+r5*8]
+    mova      m3, [cq+128* 9+r5*8]
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+    mova      m0, [cq+128* 5+r5*8]
+    mova      m1, [cq+128*27+r5*8]
+    mova      m2, [cq+128*21+r5*8]
+    mova      m3, [cq+128*11+r5*8]
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+    mova      m0, [cq+128* 3+r5*8]
+    mova      m1, [cq+128*29+r5*8]
+    mova      m2, [cq+128*19+r5*8]
+    mova      m3, [cq+128*13+r5*8]
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_part1
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_part2
+
+    mova      m0, [cq+128* 2+r5*8]
+    mova      m1, [cq+128*14+r5*8]
+    mova      m2, [cq+128*18+r5*8]
+    mova      m3, [cq+128*30+r5*8]
+    call      m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part1_fast
+
+    mova      m0, [cq+128* 6+r5*8]
+    mova      m1, [cq+128*10+r5*8]
+    mova      m2, [cq+128*22+r5*8]
+    mova      m3, [cq+128*26+r5*8]
+    call      m(inv_txfm_add_dct_dct_32x8_16bpc).main_oddhalf_part2_fast
+    add       r3, 16*(24+4*ARCH_X86_32)
+
+    mova      m0, [cq+128* 4+r5*8]
+    mova      m1, [cq+128*12+r5*8]
+    mova      m2, [cq+128*20+r5*8]
+    mova      m3, [cq+128*28+r5*8]
+    call      m(idct_16x4_internal_16bpc).main_oddhalf_fast
+
+    mova      m0, [cq+128* 0+r5*8]
+    mova      m1, [cq+128* 8+r5*8]
+    mova      m2, [cq+128*16+r5*8]
+    mova      m3, [cq+128*24+r5*8]
+    call      m(idct_8x4_internal_16bpc).main_pass1_fast
+    call      m(idct_8x4_internal_16bpc).round
+    mova      [r3-(7+4*ARCH_X86_32)*16], m1
+    mova      [r3-(6+4*ARCH_X86_32)*16], m2
+    mova      [r3-(5+4*ARCH_X86_32)*16], m3
+    mova      [r3-(4+4*ARCH_X86_32)*16], m4
+    mova      [r3-(3+4*ARCH_X86_32)*16], m5
+    mova      [r3-(2+4*ARCH_X86_32)*16], m6
+    mova      [r3-(1+4*ARCH_X86_32)*16], m7
+    sub       r3, 16*(40+4*ARCH_X86_32-4)
+
+%if ARCH_X86_64
+    psrld     m15, m11, 10 ; pd_2
+%else
+    mova      m7, [o(pd_2)]
+%endif
+    call      m(inv_txfm_add_dct_dct_64x16_16bpc).main_end_loop_start
+
+    lea       r3, [rsp+56*16]
+    movzx     t1d, word [o2(tbl_Nx64_offset)+r5*2+0]
+    movzx     t3d, word [o2(tbl_Nx64_offset)+r5*2+2]
+    movzx     t0d, t1b
+    movzx     t2d, t3b
+    shr       t1d, 8
+    shr       t3d, 8
+    lea       t4, [rsp+7*64*16+(64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
+    call      .shift_transpose
+    ; zero cq
+    pxor      m7, m7
+%if ARCH_X86_32
+    mov       cq, [rsp+gprsize*3+(64*9+8)*16]
+%endif
+    lea       r4, [cq+30*128+r5*8]
+.zero_cq_loop:
+    REPX      {mova [r4+x*128], m7}, -2, -1, 0, 1
+    sub       r4, 4*128
+    cmp       r4, cq
+    jg        .zero_cq_loop
+%if ARCH_X86_32
+    mov       r6, [rsp+gprsize*4+(64*9+8)*16]
+%endif
+    sub       r5d, 2
+    jge       .loop_pass1
+
+    ; pass=2 code starts here
+    mov       eobd, [rsp+gprsize*0+(9*64+8*ARCH_X86_32+1*ARCH_X86_64)*16]
+%if ARCH_X86_32
+    mov       strideq, [rsp+gprsize*2+(9*64+8)*16]
+%else
+    mov       r0, [rsp+gprsize*0+64*16]
+%endif
+    add       rsp, (64+8*ARCH_X86_32+1*ARCH_X86_64-3)*16
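+    ; second-pass kernel selection: a small eob means the high coefficient
+    ; rows are all zero, so the cheaper _fast/_veryfast variants of the
+    ; 8-bit kernels are sufficient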
+    cmp       eobd, 151
+    jl        .fast
+    ; fall-through
+%if ARCH_X86_64
+    DECLARE_REG_TMP 8, 9
+%else
+    DECLARE_REG_TMP 1, 5
+%endif
+    lea       t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_fast)]
+    lea       t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main)]
+    jmp       .run
+.fast:
+    lea       t0, [o(m_suffix(idct_8x32_internal_8bpc, _ssse3).main_veryfast)]
+    lea       t1, [o(m_suffix(idct_16x64_internal_8bpc, _ssse3).main_fast)]
+.run:
+
+%if ARCH_X86_64
+    lea       r2, [dstq+128]
+    mov       r7, -16
+%else
+    lea       r2, [rsp+(64*8+3)*16]
+    mov       [r2+4*gprsize], t0
+    mov       [r2+5*gprsize], t1
+    mov       r1, [r2+2*gprsize]
+    mov       dword [r2+0*gprsize], 8
+%endif
+    jmp       m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
+
+    ; copy of pass=1 tmp-regs
+%if ARCH_X86_32
+    DECLARE_REG_TMP 4, 1, 2, 0, 6
+%else
+    DECLARE_REG_TMP 8, 9, 4, 7, 0
+%endif
+
+.shift_transpose:
+    mova      m0, [r3+0*16]
+    mova      m1, [r3+1*16]
+    mova      m2, [r3+2*16]
+    mova      m3, [r3+3*16]
+    mova      m4, [r3+4*16]
+    mova      m5, [r3+5*16]
+    mova      m6, [r3+6*16]
+    mova      m7, [r3+7*16]
+    REPX      {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw  m0, m1
+    packssdw  m2, m3
+    packssdw  m4, m5
+    packssdw  m6, m7
+    call      m(idct_8x4_internal_16bpc).transpose4x8packed
+    mova      [t4+t0*8], m0
+    mova      [t4+t1*8], m1
+    mova      [t4+t2*8], m2
+    mova      [t4+t3*8], m3
+    sub       t4, 16*64
+    sub       r3, 8*16
+    cmp       r3, rsp
+    jg        .shift_transpose
+    ret
+
+.dconly:
+    imul      r5d, [cq], 181
+    mov       [cq], eobd ; 0
+    mov       r3d, 64
+    add       rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \
+                   (4+4*ARCH_X86_32)*gprsize - (64+8*ARCH_X86_32)*16
+    jmp       m(inv_txfm_add_dct_dct_64x16_16bpc).dconly1