From 36d22d82aa202bb199967e9512281e9a53db42c9 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Apr 2024 21:33:14 +0200
Subject: Adding upstream version 115.7.0esr.

Signed-off-by: Daniel Baumann
---
 third_party/dav1d/src/x86/itx16_avx2.asm | 8599 ++++++++++++++++++++++++++++++
 1 file changed, 8599 insertions(+)
 create mode 100644 third_party/dav1d/src/x86/itx16_avx2.asm

diff --git a/third_party/dav1d/src/x86/itx16_avx2.asm b/third_party/dav1d/src/x86/itx16_avx2.asm
new file mode 100644
index 0000000000..2315ec1e47
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx16_avx2.asm
@@ -0,0 +1,8599 @@
+; Copyright © 2021, VideoLAN and dav1d authors
+; Copyright © 2021, Two Orioles, LLC
+; Copyright © 2021, Matthias Dressel
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
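+;
+; As a rough overview (summary comment, not a specification): this file
+; implements the high bit-depth (10 & 12 bits per component) AVX2 inverse
+; transforms. The first pass works on 32-bit coefficients, clipping
+; intermediates to an 18-bit (or 20-bit) signed range via the
+; clip_18b_*/clip_20b_* constants below; where precision allows, the 10bpc
+; second passes pack the coefficients down to 16-bit words and call into the
+; corresponding 8bpc AVX2 routines declared as cextern below, while the
+; 12bpc variants stay in 32-bit arithmetic for longer.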
+ +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 32 +itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6 + dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7 +idct4_12_shuf: dd 0, 2, 4, 6, 1, 3, 5, 7 +idct4_12_shuf2: dd 2, 0, 6, 4, 3, 1, 7, 5 +iadst8_12_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 +idct16_12_shuf: dd 0, 4, 1, 5, 3, 7, 2, 6 +iadst16_12_shuf: dd 3, 7, 0, 4, 2, 6, 1, 5 +pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048 +idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11 +idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 + +%macro COEF_PAIR 2-3 0 +pd_%1_%2: dd %1, %1, %2, %2 +%define pd_%1 (pd_%1_%2 + 4*0) +%define pd_%2 (pd_%1_%2 + 4*2) +%if %3 +dd -%2, -%2 +%define pd_%2_m%2 pd_%2 +%endif +%endmacro + +COEF_PAIR 201, 995 +COEF_PAIR 401, 1931 +COEF_PAIR 799, 3406 +COEF_PAIR 1380, 601 +COEF_PAIR 1751, 2440 +COEF_PAIR 2598, 1189 +COEF_PAIR 2751, 2106 +COEF_PAIR 2896, 1567, 1 +COEF_PAIR 2896, 3784, 1 +COEF_PAIR 3035, 3513 +COEF_PAIR 3166, 3920 +COEF_PAIR 3703, 3290 +COEF_PAIR 3857, 4052 +COEF_PAIR 4017, 2276 +COEF_PAIR 4076, 3612 +COEF_PAIR 4091, 3973 + +pd_8: dd 8 +pd_m601: dd -601 +pd_m1189: dd -1189 +pd_m1380: dd -1380 +pd_m2106: dd -2106 +pd_m2598: dd -2598 +pd_m2751: dd -2751 +pd_m3344: dd -3344 +pd_1024: dd 1024 +pd_1321: dd 1321 +pd_1448: dd 1448 +pd_1697: dd 1697 +pd_2482: dd 2482 +pd_3072: dd 3072 ; 1024 + 2048 +pd_3803: dd 3803 +pd_5119: dd 5119 ; 1024 + 4096 - 1 +pd_5120: dd 5120 ; 1024 + 4096 +pd_5793: dd 5793 +pd_6144: dd 6144 ; 2048 + 4096 +pd_17408: dd 17408 ; 1024 + 16384 + +pixel_10bpc_max: times 2 dw 0x03ff +pixel_12bpc_max: times 2 dw 0x0fff +dconly_10bpc: times 2 dw 0x7c00 +dconly_12bpc: times 2 dw 0x7000 +clip_18b_min: dd -0x20000 +clip_18b_max: dd 0x1ffff +clip_20b_min: dd -0x80000 +clip_20b_max: dd 0x7ffff + +idct64_mul_16bpc: +dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017 +dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799 +dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276 +dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406 + +cextern deint_shuf +cextern idct64_mul +cextern pw_1697x8 +cextern pw_1697x16 +cextern pw_1567_3784 +cextern pw_m1567_m3784 +cextern pw_m3784_1567 +cextern pw_2896_2896 +cextern pw_m2896_2896 +cextern pw_5 +cextern pw_2048 +cextern pw_4096 +cextern pw_8192 +cextern pw_16384 +cextern pw_2896x8 +cextern pd_2048 + +cextern idct_4x8_internal_8bpc_avx2.main +cextern idct_4x16_internal_8bpc_avx2.main +cextern idct_8x8_internal_8bpc_avx2.main +cextern idct_8x16_internal_8bpc_avx2.main +cextern idct_16x4_internal_8bpc_avx2.main +cextern idct_16x8_internal_8bpc_avx2.main +cextern idct_16x16_internal_8bpc_avx2.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main +cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf +cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1 +cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal + +cextern iadst_4x4_internal_8bpc_avx2.main +cextern iadst_4x8_internal_8bpc_avx2.main_pass2 +cextern iadst_4x16_internal_8bpc_avx2.main2 +cextern iadst_8x4_internal_8bpc_avx2.main +cextern iadst_8x8_internal_8bpc_avx2.main_pass2 +cextern iadst_8x16_internal_8bpc_avx2.main +cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end +cextern iadst_16x4_internal_8bpc_avx2.main +cextern 
iadst_16x8_internal_8bpc_avx2.main +cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end +cextern iadst_16x16_internal_8bpc_avx2.main +cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end + +SECTION .text + +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +%macro WRAP_XMM 1+ + INIT_XMM cpuname + %1 + INIT_YMM cpuname +%endmacro + +%macro IWHT4_1D_PACKED 0 + ; m0 = in0 in2, m1 = in1 in3 + psubd m2, m0, m1 ; t2 + paddd xm0, xm1 ; t0 + vpermq m2, m2, q3322 + vpermq m0, m0, q1100 + vpermq m1, m1, q3120 + psubd m3, m0, m2 + psrad m3, 1 + psubd m3, m1 ; t1 t3 + psubd m0, m3 ; ____ out0 + paddd m2, m3 ; out3 ____ +%endmacro + +INIT_YMM avx2 +cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax + mova xm0, [cq+16*0] + vinserti128 m0, [cq+16*2], 1 + mova xm1, [cq+16*1] + vinserti128 m1, [cq+16*3], 1 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + lea r6, [dstq+strideq*2] + psrad m0, 2 + psrad m1, 2 + IWHT4_1D_PACKED + punpckhdq m0, m3 + punpckldq m3, m2 + punpckhqdq m1, m0, m3 + punpcklqdq m0, m3 + IWHT4_1D_PACKED + vpblendd m0, m2, 0x33 + packssdw m0, m3 + vextracti128 xm2, m0, 1 + punpckhdq xm1, xm0, xm2 ; out2 out1 + punpckldq xm0, xm2 ; out3 out0 + movq xm2, [r6 +strideq*1] + movhps xm2, [dstq+strideq*0] + movq xm3, [r6 +strideq*0] + movhps xm3, [dstq+strideq*1] +%ifidn bdmaxd, bdmaxm + movd xm5, bdmaxd + vpbroadcastw xm5, xm5 +%else ; win64: load from stack + vpbroadcastw xm5, bdmaxm +%endif + paddsw xm0, xm2 + paddsw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm1 + movq [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm0 + RET + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +; flags: 1 = packed, 2 = inv_dst2 +; skip round/shift if rnd is not a number +%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags +%if %8 < 32 + pmulld m%4, m%1, m%8 + pmulld m%3, m%2, m%8 +%else +%if %9 & 1 + vbroadcasti128 m%3, [pd_%8] +%else + vpbroadcastd m%3, [pd_%8] +%endif + pmulld m%4, m%1, m%3 + pmulld m%3, m%2 +%endif +%if %7 < 32 + pmulld m%1, m%7 + pmulld m%2, m%7 +%else +%if %9 & 1 + vbroadcasti128 m%5, [pd_%7] +%else + vpbroadcastd m%5, [pd_%7] +%endif + pmulld m%1, m%5 + pmulld m%2, m%5 +%endif +%if %9 & 2 + psubd m%4, m%6, m%4 + psubd m%2, m%4, m%2 +%else +%ifnum %6 + paddd m%4, m%6 +%endif + paddd m%2, m%4 +%endif +%ifnum %6 + paddd m%1, m%6 +%endif + psubd m%1, m%3 +%ifnum %6 + psrad m%2, 12 + psrad m%1, 12 +%endif +%endmacro + +%macro INV_TXFM_FN 4-5 10 ; type1, type2, eob_offset, size, bitdepth +cglobal inv_txfm_add_%1_%2_%4_%5bpc, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%4_internal_%5bpc) + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. 
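+ ; (tx2q is that indirect-jump target: it is loaded below with the address
+ ; of the 2nd txfm function's .pass2 label, and each pass-1 implementation
+ ; ends with "jmp tx2q", so one type1 pass is shared by all type2 variants.)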
+ lea tx2q, [m(i%2_%4_internal_%5bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else +%if %3 + add eobd, %3 +%endif + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x4, %3 +%ifidn %1_%2, dct_dct + vpbroadcastd xm2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 4 +.dconly2: + add r6d, 128 + sar r6d, 8 +.dconly3: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm2 + vpbroadcastw xm0, xm0 +.dconly_loop: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + paddsw xm1, xm0 + psubusw xm1, xm2 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + WRAP_XMM RET +%else + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly +%endif +%endif +%endmacro + +%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd + ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1 + punpckhqdq m%3, m%2, m%1 ; t3 t2 + punpcklqdq m%2, m%1 ; t0 t1 + paddd m%1, m%2, m%3 ; out0 out1 + psubd m%2, m%3 ; out3 out2 +%endmacro + +%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd + vpbroadcastd m%5, [pw_m3784_1567] + punpckhwd m%3, m%2, m%1 + vpbroadcastd m%4, [pw_1567_3784] + punpcklwd m%2, m%1 + vpbroadcastd m%1, [pw_m2896_2896] + pmaddwd m%5, m%3 + pmaddwd m%3, m%4 + vpbroadcastd m%4, [pw_2896_2896] + pmaddwd m%1, m%2 + pmaddwd m%2, m%4 + REPX {paddd x, m%6}, m%5, m%3, m%1, m%2 + REPX {psrad x, 12 }, m%5, m%3, m%1, m%2 + packssdw m%3, m%5 ; t3 t2 + packssdw m%2, m%1 ; t0 t1 + paddsw m%1, m%2, m%3 ; out0 out1 + psubsw m%2, m%3 ; out3 out2 +%endmacro + +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, identity +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst + +cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call .main + vbroadcasti128 m2, [idct4_shuf] + packssdw m0, m1 + pshufb m0, m2 + jmp tx2q +.pass2: + vextracti128 xm1, m0, 1 + WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 + packssdw xm5, xm5 ; pw_2048 + pmulhrsw xm0, xm5 + pmulhrsw xm1, xm5 + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movq xm3, [r6 +strideq*1] + movhps xm3, [r6 +strideq*0] + vpbroadcastd xm5, [pixel_10bpc_max] + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm1 + RET +ALIGN function_align +.main: + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m5, [pd_2048] +.main2: + IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5 + ret + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +%macro IADST4_1D 0 + vpbroadcastd m5, [pd_1321] + vpbroadcastd m7, [pd_2482] + pmulld m4, m0, m5 ; 1321*in0 + pmulld m6, m3, m7 ; 2482*in3 + paddd m4, m6 ; 1321*in0 + 2482*in3 + pmulld m6, m0, m7 ; 2482*in0 + paddd m0, m3 ; in0 + in3 + paddd m7, m5 ; pd_3803 + pmulld m5, m2 ; 1321*in2 + pmulld m3, m7 ; 3803*in3 + pmulld m7, m2 ; 3803*in2 + psubd m2, m0 ; in2 - in0 - in3 + vpbroadcastd m0, [pd_m3344] + pmulld m1, m0 ; -t3 + pmulld m2, m0 ; out2 (unrounded) + psubd m6, m5 ; 2482*in0 - 1321*in2 + paddd m4, m7 ; t0 + psubd m6, m3 ; t1 + 
paddd m3, m4, m6 + psubd m4, m1 ; out0 (unrounded) + psubd m6, m1 ; out1 (unrounded) + paddd m3, m1 ; out3 (unrounded) +%endmacro + +cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call .main + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass1_end: + vpbroadcastd m5, [pd_2048] + mova m2, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + vpermd m0, m2, m0 + psrld m2, 4 + pshufb m0, m2 +%if WIN64 + movaps xmm6, [rsp+ 8] + movaps xmm7, [rsp+24] +%endif + jmp tx2q +.pass2: + lea r6, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal_8bpc).main +.end: + vpbroadcastd xm4, [pw_2048] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm5, [pixel_10bpc_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET +ALIGN function_align +.main: + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] +%if WIN64 + movaps [rsp+16], xmm6 + movaps [rsp+32], xmm7 +%endif +.main2: + WRAP_XMM IADST4_1D + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10bpc).main + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_10bpc).pass1_end +.pass2: + lea r6, [deint_shuf+128] + vextracti128 xm1, m0, 1 + call m(iadst_4x4_internal_8bpc).main + vpbroadcastd xm4, [pw_2048] + movq xm3, [dstq+strideq*1] + movhps xm3, [dstq+strideq*0] + lea r6, [dstq+strideq*2] + movq xm2, [r6 +strideq*1] + movhps xm2, [r6 +strideq*0] + vpbroadcastd xm5, [pixel_10bpc_max] + pmulhrsw xm0, xm4 + pmulhrsw xm1, xm4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm4 + pmaxsw xm1, xm4 + pminsw xm0, xm5 + pminsw xm1, xm5 + movhps [dstq+strideq*0], xm1 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movq [r6 +strideq*1], xm0 + RET + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, stride, c, eob, tx2 + vpbroadcastd m1, [pd_5793] + pmulld m0, m1, [cq+32*0] + pmulld m1, [cq+32*1] + vpbroadcastd m5, [pd_2048] + mova m3, [itx4_shuf] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + packssdw m0, m1 + vpermd m0, m3, m0 + psrld m3, 4 + pshufb m0, m3 + jmp tx2q +.pass2: + vpbroadcastd m1, [pw_1697x8] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + pmulhrsw m1, m0 + paddsw m0, m1 + movq xm3, [r6 +strideq*0] + movhps xm3, [r6 +strideq*1] + vpbroadcastd xm4, [pixel_10bpc_max] + packssdw m5, m5 ; pw_2048 + pmulhrsw m0, m5 + pxor m5, m5 + mova [cq+32*0], m5 + mova [cq+32*1], m5 + vextracti128 xm1, m0, 1 + paddw xm0, xm2 + paddw xm1, xm3 + pmaxsw xm0, xm5 + pmaxsw xm1, xm5 + pminsw xm0, xm4 + pminsw xm1, xm4 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [r6 +strideq*0], xm1 + movhps [r6 +strideq*1], xm1 + RET + +INV_TXFM_4X4_FN dct, dct, 12 +INV_TXFM_4X4_FN 
dct, identity, 12 +INV_TXFM_4X4_FN dct, adst, 12 +INV_TXFM_4X4_FN dct, flipadst, 12 + +cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(idct_4x4_internal_10bpc).main + mova m3, [idct4_12_shuf] + mova m4, [idct4_12_shuf2] + vpermd m2, m4, m1 + vpermd m1, m3, m0 + jmp m(iadst_4x4_internal_12bpc).pass1_end2 +.pass2: + vpbroadcastd m5, [pd_2048] + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 + call m(idct_4x4_internal_10bpc).main2 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_4x4_internal_12bpc).end + +INV_TXFM_4X4_FN adst, dct, 12 +INV_TXFM_4X4_FN adst, adst, 12 +INV_TXFM_4X4_FN adst, flipadst, 12 +INV_TXFM_4X4_FN adst, identity, 12 + +cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10bpc).main + vinserti128 m1, m4, xm6, 1 + vinserti128 m2, xm3, 1 +.pass1_end: + mova m3, [itx4_shuf] + vpbroadcastd m5, [pd_1024] + psrad m1, 1 + psrad m2, 1 + vpermd m1, m3, m1 + vpermd m2, m3, m2 + paddd m1, m5 + paddd m2, m5 + psrad m1, 11 + psrad m2, 11 +.pass1_end2: + vpbroadcastd m3, [clip_18b_min] + vpbroadcastd m4, [clip_18b_max] + punpcklqdq m0, m1, m2 + punpckhqdq m1, m2 + pmaxsd m0, m3 + pmaxsd m1, m3 + pminsd m0, m4 + pminsd m1, m4 + jmp tx2q +.pass2: + call .main_pass2 + vinserti128 m0, m4, xm6, 1 + vinserti128 m1, m2, xm3, 1 +.pass2_end: + vpbroadcastd m5, [pd_2048] + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 +.end: +%if WIN64 + WIN64_RESTORE_XMM_INTERNAL + %assign xmm_regs_used 6 +%endif +.end2: + vpbroadcastd m4, [pw_16384] + movq xm2, [dstq+strideq*0] + movq xm3, [dstq+strideq*1] + lea r6, [dstq+strideq*2] + movhps xm2, [r6 +strideq*0] ; dst0 dst2 + movhps xm3, [r6 +strideq*1] ; dst1 dst3 + vpbroadcastd m5, [pixel_12bpc_max] + vinserti128 m2, xm3, 1 + psrad m0, 3 + psrad m1, 3 + packssdw m0, m1 ; t0 t2 t1 t3 + pmulhrsw m0, m4 + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + paddw m0, m2 ; out0 out2 out1 out3 + pmaxsw m0, m4 + pminsw m0, m5 + vextracti128 xm1, m0, 1 ; out1 out3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*1], xm1 + movhps [r6 +strideq*0], xm0 + movhps [r6 +strideq*1], xm1 + RET +.main_pass2: + vextracti128 xm3, m1, 1 + mova xm2, xm1 + vextracti128 xm1, m0, 1 + jmp m(iadst_4x4_internal_10bpc).main2 + +INV_TXFM_4X4_FN flipadst, dct, 12 +INV_TXFM_4X4_FN flipadst, adst, 12 +INV_TXFM_4X4_FN flipadst, flipadst, 12 +INV_TXFM_4X4_FN flipadst, identity, 12 + +cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_4x4_internal_10bpc).main + vinserti128 m1, m3, xm2, 1 + vinserti128 m2, m6, xm4, 1 + jmp m(iadst_4x4_internal_12bpc).pass1_end +.pass2: + call m(iadst_4x4_internal_12bpc).main_pass2 + vinserti128 m0, m3, xm2, 1 + vinserti128 m1, m6, xm4, 1 + jmp m(iadst_4x4_internal_12bpc).pass2_end + +INV_TXFM_4X4_FN identity, dct, 12 +INV_TXFM_4X4_FN identity, adst, 12 +INV_TXFM_4X4_FN identity, flipadst, 12 +INV_TXFM_4X4_FN identity, identity, 12 + +cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, stride, c, eob, tx2 + mova m2, [itx4_shuf] + vpbroadcastd m3, [pd_1697] + vpermd m0, m2, [cq+32*0] + vpermd m2, m2, [cq+32*1] + vpbroadcastd m5, [pd_2048] + pmulld m1, m3, m0 + pmulld m3, m2 + paddd m1, m5 + paddd m3, m5 + psrad m1, 12 + psrad m3, 12 + paddd m1, m0 + paddd m2, m3 + jmp m(iadst_4x4_internal_12bpc).pass1_end2 +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + vpbroadcastd m3, [pd_5793] + vpbroadcastd m5, [pd_2048] + pmulld m0, m3 + pmulld m1, m3 + paddd m0, m5 ; 2048 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 + jmp 
m(iadst_4x4_internal_12bpc).end + +%macro INV_TXFM_4X8_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x8, %3 +%ifidn %1_%2, dct_dct + vpbroadcastd xm2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 8 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly2 +%else + jmp m(inv_txfm_add_dct_dct_4x8_10bpc).dconly +%endif +%endif +%endmacro + +%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd + ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3 + vpbroadcastd m%5, [pd_2896] + pmulld m%1, m%5 + pmulld m%3, m%5 + paddd m%1, m%8 + paddd m%5, m%1, m%3 + psubd m%1, m%3 + psrad m%5, 12 ; t0 + psrad m%1, 12 ; t1 + psubd m%3, m%1, m%2 + paddd m%2, m%1 + paddd m%1, m%5, m%4 + psubd m%4, m%5, m%4 +%endmacro + +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, identity +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m3, [pd_2896] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, m3, [cq+32*3] + vpbroadcastd m7, [pd_2048] + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 + jmp tx2q +.pass2: + packssdw m0, m2 + packssdw m1, m3 + lea r6, [deint_shuf+128] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m2 ; 2 3 + punpckldq m0, m2 ; 0 1 + vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + call m(idct_4x8_internal_8bpc).main + vpbroadcastd xm4, [pw_2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 3 2 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_10bpc_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movhps [dstq+strideq*2], xm1 + movq [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movhps [r6 +strideq*2], xm3 + movq [r6 +r3 ], xm3 + RET + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10bpc).main + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass1_end: + REPX {psrad x, 12}, m0, m1, m2, m3 + jmp tx2q +.pass2: + call .pass2_main + mova xm4, [pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 +.end: + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + movq xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + paddw xm0, xm4 ; 0 1 + paddw xm1, xm5 ; 2 3 + paddw xm2, xm6 ; 4 5 + paddw xm3, xm7 ; 6 7 + vpbroadcastd xm5, [pixel_10bpc_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3 + REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + 
movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET +ALIGN function_align +.pass2_main: + packssdw m0, m2 + packssdw m1, m3 + lea r6, [deint_shuf+128] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpckhdq m5, m4, m0 + punpckldq m4, m0 + vextracti128 xm2, m4, 1 ; 4 5 + vextracti128 xm3, m5, 1 ; 6 7 + pshufd xm4, xm4, q1032 ; 1 0 + pshufd xm5, xm5, q1032 ; 3 2 + jmp m(iadst_4x8_internal_8bpc).main_pass2 +ALIGN function_align +.main: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.main2: + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m2, [cq+16*2] + vbroadcasti128 m3, [cq+16*5] + vbroadcasti128 m1, [cq+16*7] + vpbroadcastd m6, [pd_2896] + shufpd m0, m2, 0x0c ; 0 2 + shufpd m1, m3, 0x0c ; 7 5 + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m4, [cq+16*6] + vbroadcasti128 m5, [cq+16*1] + vbroadcasti128 m3, [cq+16*3] + vpbroadcastd m7, [pd_2048] + shufpd m2, m4, 0x0c ; 4 6 + shufpd m3, m5, 0x0c ; 3 1 + REPX {pmulld x, m6}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 +.main3: + ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1 + ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1 + psubd m4, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + REPX {pmaxsd x, m8}, m4, m2, m0, m1 + REPX {pminsd x, m9}, m4, m2, m0, m1 + pxor m5, m5 + psubd m5, m4 + vpblendd m4, m2, 0xcc ; t4 t7 + vpblendd m2, m5, 0xcc ; t5 -t6 + ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784 + vpbroadcastd m5, [pd_2896] + vbroadcasti128 m6, [pw_2048_m2048] ; + + - - + punpckhqdq m3, m0, m1 + punpcklqdq m0, m1 + psubd m1, m0, m3 ; t2 t3 + paddd m0, m3 ; out0 -out7 + punpckhqdq m3, m4, m2 ; t7a t6a + punpcklqdq m4, m2 ; t5a t4a + psubd m2, m4, m3 ; t7 t6 + paddd m4, m3 ; out6 -out1 + REPX {pmaxsd x, m8}, m1, m2 + REPX {pminsd x, m9}, m1, m2 + vpblendd m3, m1, m2, 0xcc + shufpd m1, m2, 0x05 + pmulld m3, m5 + pmulld m5, m1 + psignd m0, m6 ; out0 out7 + psignd m4, m6 ; out6 out1 + paddd m3, m7 + psubd m2, m3, m5 + paddd m5, m3 + psrad m2, 12 ; out4 -out5 + psrad m5, 12 ; -out3 out2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10bpc).main + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m3 + paddd m1, m5, m2 + paddd m2, m5, m6 + paddd m3, m5, m4 + jmp m(iadst_4x8_internal_10bpc).pass1_end +.pass2: + call m(iadst_4x8_internal_10bpc).pass2_main + mova xm4, [pw_2048_m2048] + REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*1] + movhps xm4, [dstq+strideq*0] + movq xm5, [dstq+r3 ] + movhps xm5, [dstq+strideq*2] + movq xm6, [r6 +strideq*1] + movhps xm6, [r6 +strideq*0] + movq xm7, [r6 +r3 ] + movhps xm7, [r6 +strideq*2] + paddw xm3, xm4 ; 1 0 + paddw xm2, xm5 ; 3 2 + paddw xm1, xm6 ; 5 4 + paddw xm0, xm7 ; 7 6 + vpbroadcastd xm5, [pixel_10bpc_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0 + REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0 + movhps [dstq+strideq*0], xm3 + movq [dstq+strideq*1], xm3 + movhps [dstq+strideq*2], xm2 + movq [dstq+r3 ], xm2 + movhps [r6 +strideq*0], xm1 + movq [r6 +strideq*1], xm1 + movhps [r6 +strideq*2], xm0 + movq [r6 +r3 ], xm0 + RET + +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst 
+INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m3, [pd_2896] + pmulld m0, m3, [cq+32*0] + pmulld m1, m3, [cq+32*1] + pmulld m2, m3, [cq+32*2] + pmulld m3, [cq+32*3] + vpbroadcastd m5, [pd_2048] + vpbroadcastd m4, [pd_5793] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m6, [pixel_10bpc_max] + call .pass2_end + RET +ALIGN function_align +.pass2_end: + vpbroadcastd m4, [pw_4096] + packssdw m0, m2 + packssdw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m4 + pmulhrsw m0, m4 + punpckhdq m1, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm2, [dstq+strideq*0] + movhps xm2, [dstq+strideq*1] + vpbroadcastq m4, [r6 +strideq*0] + vpbroadcastq m5, [r6 +strideq*1] + movq xm3, [dstq+strideq*2] + movhps xm3, [dstq+r3 ] + vpblendd m2, m4, 0x30 + vpblendd m2, m5, 0xc0 + vpbroadcastq m4, [r6 +strideq*2] + vpbroadcastq m5, [r6 +r3 ] + vpblendd m3, m4, 0x30 + vpblendd m3, m5, 0xc0 + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 ; out0 out1 out4 out5 + paddw m1, m3 ; out2 out3 out6 out7 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m6 + pminsw m1, m6 + vextracti128 xm2, m0, 1 ; out4 out5 + vextracti128 xm3, m1, 1 ; out6 out7 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + ret + +INV_TXFM_4X8_FN dct, dct, 12 +INV_TXFM_4X8_FN dct, identity, 12 +INV_TXFM_4X8_FN dct, adst, 12 +INV_TXFM_4X8_FN dct, flipadst, 12 + +cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(idct_4x8_internal_10bpc).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + ; transpose & interleave + pshufd m0, m0, q1320 + pshufd m1, m1, q1320 + pshufd m2, m2, q1320 + pshufd m3, m3, q1320 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + vpermq m0, m0, q3102 + vpermq m2, m2, q3102 + vperm2i128 m1, m0, m2, 0x31 ; 1 5 (interleaved) + vperm2i128 m3, m0, m2, 0x20 ; 7 3 (interleaved) + vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) + vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) + vpbroadcastd m7, [pd_2048] + call m(idct_8x4_internal_10bpc).main + psubd m3, m0, m4 ; out7 out6 + paddd m0, m4 ; out0 out1 + paddd m1, m2, m5 ; out3 out2 + psubd m2, m5 ; out4 out5 + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp m(iadst_4x8_internal_12bpc).end + +INV_TXFM_4X8_FN adst, dct, 12 +INV_TXFM_4X8_FN adst, adst, 12 +INV_TXFM_4X8_FN adst, flipadst, 12 +INV_TXFM_4X8_FN adst, identity, 12 + +cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10bpc).main + psrad m0, m4, 1 + psrad m1, m6, 1 + psrad m2, 1 + psrad m3, 1 +.pass1_end: + vpbroadcastd m5, [pd_1024] + REPX {paddd x, m5}, m0, m1, m2, m3 + REPX {psrad x, 11}, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call .pass2_main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; 
out4 out5 + psignd m1, m6 ; out2 out3 +.end: + vpbroadcastd m4, [pw_16384] + REPX {psrad x, 3}, m0, m1, m2, m3 + packssdw m0, m2 ; 0 1 4 5 (interleaved) + packssdw m1, m3 ; 2 3 6 7 (interleaved) + mova m2, [iadst8_12_shuf] + vpermd m0, m2, m0 ; 0 1 4 5 + vpermd m1, m2, m1 ; 2 3 6 7 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + lea r3, [strideq*3] + lea r6, [dstq+strideq*4] + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + movq xm5, [dstq+strideq*2] + movhps xm5, [dstq+r3 ] + movq xm6, [r6 +strideq*0] + movhps xm6, [r6 +strideq*1] + vinserti128 m4, xm6, 1 + movq xm7, [r6 +strideq*2] + movhps xm7, [r6 +r3 ] + vinserti128 m5, xm7, 1 + paddw m0, m4 ; 0 1 4 5 + paddw m1, m5 ; 2 3 6 7 + vpbroadcastd m5, [pixel_12bpc_max] + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + REPX {pmaxsw x, m4}, m0, m1 + REPX {pminsw x, m5}, m0, m1 + vextracti128 xm2, m0, 1 ; out4 out5 + vextracti128 xm3, m1, 1 ; out6 out7 + movq [dstq+strideq*0], xm0 + movhps [dstq+strideq*1], xm0 + movq [dstq+strideq*2], xm1 + movhps [dstq+r3 ], xm1 + movq [r6 +strideq*0], xm2 + movhps [r6 +strideq*1], xm2 + movq [r6 +strideq*2], xm3 + movhps [r6 +r3 ], xm3 + RET +ALIGN function_align +.pass2_main: + ; transpose & interleave + pshufd m0, m0, q1320 + pshufd m1, m1, q1320 + pshufd m2, m2, q1320 + pshufd m3, m3, q1320 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + vperm2i128 m1, m0, m2, 0x31 ; 7 5 (interleaved) + vperm2i128 m3, m0, m2, 0x20 ; 3 1 (interleaved) + vperm2i128 m0, m4, m5, 0x20 ; 0 2 (interleaved) + vperm2i128 m2, m4, m5, 0x31 ; 4 6 (interleaved) + vpbroadcastd m7, [pd_2048] + jmp m(iadst_4x8_internal_10bpc).main3 + +INV_TXFM_4X8_FN flipadst, dct, 12 +INV_TXFM_4X8_FN flipadst, adst, 12 +INV_TXFM_4X8_FN flipadst, flipadst, 12 +INV_TXFM_4X8_FN flipadst, identity, 12 + +cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_8x4_internal_10bpc).main + psrad m0, m3, 1 + psrad m1, m2, 1 + psrad m2, m6, 1 + psrad m3, m4, 1 + jmp m(iadst_4x8_internal_12bpc).pass1_end +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_4x8_internal_12bpc).pass2_main + shufpd m3, m4, m0, 0x05 ; out1 out0 + shufpd m0, m4, 0x05 ; out7 out6 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 ; out5 out4 + psignd m2, m5, m6 ; out3 out2 + jmp m(iadst_4x8_internal_12bpc).end + +INV_TXFM_4X8_FN identity, dct, 12 +INV_TXFM_4X8_FN identity, adst, 12 +INV_TXFM_4X8_FN identity, flipadst, 12 +INV_TXFM_4X8_FN identity, identity, 12 + +cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + jmp m(iidentity_4x8_internal_10bpc).pass1 +.pass2: + ; m0 = in0 in1 + ; m1 = in2 in3 + ; m2 = in4 in5 + ; m3 = in6 in7 + vpbroadcastd m6, [pixel_12bpc_max] + call m(iidentity_4x8_internal_10bpc).pass2_end + RET + +%macro INV_TXFM_4X16_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 4x16, %3 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + vpbroadcastd xm2, [dconly_%3bpc] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 384 + sar r6d, 9 + jmp m(inv_txfm_add_dct_dct_4x4_10bpc).dconly3 +%endif +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m10, [pd_3072] + mova m1, [cq+32*2] + mova m3, [cq+32*6] + mova m5, [cq+32*3] + mova m7, [cq+32*7] + call .pass1_main + pmulld m0, m6, 
[cq+32*0] + pmulld m2, m6, [cq+32*4] + pmulld m4, m6, [cq+32*1] + pmulld m6, [cq+32*5] + call .pass1_main2 + REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 ; 2 3 + punpckldq m0, m4 ; 0 1 + punpckldq m4, m5, m2 ; 8 9 + punpckhdq m5, m2 ; a b + vextracti128 xm2, m0, 1 ; 4 5 + vextracti128 xm3, m1, 1 ; 6 7 + vextracti128 xm6, m4, 1 ; c d + vextracti128 xm7, m5, 1 ; e f + call m(idct_4x16_internal_8bpc).main + vpbroadcastd m9, [pw_2048] + vinserti128 m0, m0, xm1, 1 ; 0 1 3 2 + vinserti128 m1, m2, xm3, 1 ; 4 5 7 6 + vinserti128 m2, m4, xm5, 1 ; 8 9 b a + vinserti128 m3, m6, xm7, 1 ; c d f e + vpbroadcastd m8, [pixel_10bpc_max] + call .pass2_end + RET +ALIGN function_align +.pass1_main: + vpbroadcastd m4, [pd_3784] + vpbroadcastd m8, [pd_1567] + vpbroadcastd m9, [pd_2048] + vpbroadcastd m6, [pd_1448] + ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l + ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h + ret +ALIGN function_align +.pass1_main2: + paddd m0, m10 + paddd m4, m10 + paddd m8, m0, m2 + psubd m0, m2 + paddd m9, m4, m6 + psubd m4, m6 + REPX {psrad x, 11}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h + psubd m2, m0, m1 + paddd m1, m0 + psubd m6, m4, m5 + paddd m5, m4 + paddd m0, m8, m3 + psubd m3, m8, m3 + paddd m4, m9, m7 + psubd m7, m9, m7 + ret +ALIGN function_align +.pass2_end: + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + ret +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+strideq*1] + vpbroadcastq m5, [dstq+strideq*2] + vpbroadcastq m6, [dstq+r6 ] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movhps [dstq+strideq*1], xm4 + movhps [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm5 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_6144] + call m(iadst_16x4_internal_10bpc).main_end + psrad m0, m4, 13 + psrad m1, m5, 13 + psrad m2, 13 + psrad m3, 13 + psrad m4, m8, 13 + psrad m5, m9, 13 + psrad m6, 13 + psrad m7, 13 + jmp tx2q +.pass2: + call .pass2_main + vpbroadcastd m5, [pw_2048] + vpbroadcastd m8, [pixel_10bpc_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+r6 ] + movhps xm4, [dstq+strideq*0] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0xc0 + vpblendd m4, m6, 0x30 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movhps [dstq+strideq*0], xm4 + 
movhps [dstq+strideq*1], xm5 + movq [dstq+strideq*2], xm5 + movq [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret +ALIGN function_align +.pass2_main: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + lea r6, [deint_shuf+128] + punpcklwd m4, m2, m3 + punpckhwd m2, m3 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhdq m1, m0, m4 + punpckldq m0, m4 + punpckldq m4, m5, m2 + punpckhdq m5, m2 + vpblendd m3, m0, m1, 0x33 + vpblendd m0, m1, 0xcc + shufpd m2, m5, m4, 0x05 + shufpd m4, m5, 0x05 + vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5 + vinserti128 m0, xm3, 1 ; 0 3 2 1 + vperm2i128 m3, m2, m4, 0x31 ; c f e d ; ???? + vinserti128 m2, xm4, 1 ; b 8 9 a + call m(iadst_4x16_internal_8bpc).main2 + vpbroadcastd m5, [pw_2896x8] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + ret +ALIGN function_align +.main: + vbroadcasti128 m0, [cq+16* 0] + vbroadcasti128 m4, [cq+16* 2] + vbroadcasti128 m1, [cq+16*15] + vbroadcasti128 m5, [cq+16*13] + vbroadcasti128 m2, [cq+16* 4] + vbroadcasti128 m6, [cq+16* 6] + vbroadcasti128 m3, [cq+16*11] + vbroadcasti128 m7, [cq+16* 9] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m1, m5, 0x0c ; 15 13 + shufpd m2, m6, 0x0c ; 4 6 + shufpd m3, m7, 0x0c ; 11 9 + vbroadcasti128 m4, [cq+16* 8] + vbroadcasti128 m6, [cq+16*10] + vbroadcasti128 m5, [cq+16* 7] + vbroadcasti128 m7, [cq+16* 5] + shufpd m4, m6, 0x0c ; 8 10 + shufpd m5, m7, 0x0c ; 7 5 + vbroadcasti128 m6, [cq+16*12] + vbroadcasti128 m7, [cq+16*14] + shufpd m6, m7, 0x0c ; 12 14 + vbroadcasti128 m7, [cq+16* 3] + vbroadcasti128 m8, [cq+16* 1] + shufpd m7, m8, 0x0c ; 3 1 +.main2: + ; expects: m12 = clip_min m13 = clip_max + vpbroadcastd m11, [pd_2048] + ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1 + ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1 + ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1 + ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1 + psubd m8, m0, m4 ; t8a t10a + paddd m0, m4 ; t0a t2a + psubd m4, m1, m5 ; t9a t11a + paddd m1, m5 ; t1a t3a + psubd m5, m2, m6 ; t12a t14a + paddd m2, m6 ; t4a t6a + psubd m6, m3, m7 ; t13a t15a + paddd m3, m7 ; t5a t7a + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m8 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m8 + ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1 + ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1 + psubd m7, m0, m2 ; t4 t6 + paddd m0, m2 ; t0 t2 + psubd m2, m1, m3 ; t5 t7 + paddd m1, m3 ; t1 t3 + psubd m3, m4, m6 ; t12a t14a + paddd m4, m6 ; t8a t10a + psubd m6, m8, m5 ; t13a t15a + paddd m8, m5 ; t9a t11a + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m6, m7, m8 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m6, m7, m8 + punpcklqdq m5, m3, m7 ; t12a t4 + punpckhqdq m3, m7 ; t14a t6 + punpckhqdq m7, m6, m2 ; t15a t7 + punpcklqdq m6, m2 ; t13a t5 + ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567 + ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10 + vpbroadcastd m10, [pd_2896] + vbroadcasti128 m9, [pw_2048_m2048] ; + + - - + punpckhqdq m2, m4, m0 ; t10a t2 + punpcklqdq m4, m0 ; t8a t0 + punpckhqdq m0, m8, m1 ; t11a t3 + punpcklqdq m8, m1 ; t9a t1 + paddd m1, m6, m7 ; out2 -out3 + psubd m6, m7 ; t14a t6 + paddd m7, m5, m3 ; -out13 out12 + psubd m5, m3 ; t15a t7 + psubd m3, m8, m0 ; t11 t3a + paddd m8, m0 ; out14 -out15 + paddd m0, m4, m2 ; -out1 out0 + psubd m4, m2 ; t10 t2a + REPX {pmaxsd x, m12}, m6, m5, m3, m4 + REPX {pminsd x, m13}, m6, m5, m3, m4 + REPX {pmulld x, m10}, m6, m5, m3, m4 + paddd m6, m11 + paddd m4, m11 + paddd m2, m6, 
m5 ; -out5 out4 + psubd m6, m5 ; out10 -out11 + psubd m5, m4, m3 ; -out9 out8 + paddd m3, m4 ; out6 -out7 + REPX {psrad x, 12}, m2, m3, m5, m6 + REPX {psignd x, m9}, m1, m8, m3, m6 + pshufd m9, m9, q1032 + REPX {psignd x, m9}, m0, m7, m2, m5 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 +.pass1: + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_6144] + call m(iadst_16x4_internal_10bpc).main_end + psrad m0, m3, 13 + psrad m1, m2, 13 + psrad m2, m5, 13 + psrad m3, m4, 13 + psrad m4, m7, 13 + psrad m5, m6, 13 + psrad m6, m9, 13 + psrad m7, m8, 13 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_10bpc).pass2_main + vpbroadcastd m5, [pw_2048] + vpbroadcastd m8, [pixel_10bpc_max] + lea r6, [strideq*3] + vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2 + pshufd m2, m2, q1032 ; -out11 out8 out10 -out9 + vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14 + pxor m7, m7 + psubw m9, m7, m5 + vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048 + pmulhrsw m0, m4, m9 + call .write_4x4 + pmulhrsw m0, m2, m9 + call .write_4x4 + pmulhrsw m0, m1, m9 + call .write_4x4 + pmulhrsw m0, m3, m9 + call .write_4x4 + RET +ALIGN function_align +.write_4x4: + movq xm4, [dstq+strideq*0] + movhps xm4, [dstq+r6 ] + vpbroadcastq m5, [dstq+strideq*1] + vpbroadcastq m6, [dstq+strideq*2] + mova [cq+32*0], m7 + mova [cq+32*1], m7 + add cq, 32*2 + vpblendd m4, m5, 0x30 + vpblendd m4, m6, 0xc0 + paddw m4, m0 + pmaxsw m4, m7 + pminsw m4, m8 + vextracti128 xm5, m4, 1 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*1], xm5 + movhps [dstq+strideq*2], xm5 + movhps [dstq+r6 ], xm4 + lea dstq, [dstq+strideq*4] + ret + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, stride, c, eob, tx2 + vpbroadcastd m7, [pd_5793] + pmulld m0, m7, [cq+32*0] + pmulld m4, m7, [cq+32*1] + pmulld m1, m7, [cq+32*2] + pmulld m5, m7, [cq+32*3] + pmulld m2, m7, [cq+32*4] + pmulld m6, m7, [cq+32*5] + pmulld m3, m7, [cq+32*6] + pmulld m7, [cq+32*7] + vpbroadcastd m8, [pd_6144] + REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7 + REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7 + jmp tx2q +.pass2: + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m7, [pw_1697x16] + vpbroadcastd m8, [pw_2048] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + vpbroadcastd m4, [pixel_10bpc_max] + call .pass2_end + RET +ALIGN function_align +.pass2_end: + punpckhwd m7, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + lea r6, [strideq*5] + pxor m3, m3 + punpckhdq m5, m0, m2 ; 2 3 6 7 + punpckldq m0, m2 ; 0 1 4 5 + punpckldq m6, m7, m1 ; 8 9 c d + punpckhdq m7, m1 ; a b e f + pmulhrsw m0, m8 + call .write_2x4x2 + pmulhrsw m0, m5, m8 + call .write_2x4x2 + pmulhrsw m0, m6, m8 + lea dstq, [dstq+strideq*4] + call .write_2x4x2 + pmulhrsw m0, m7, m8 + call .write_2x4x2 + ret +ALIGN function_align +.write_2x4x2: + movq xm1, [dstq+strideq*0] + movhps xm1, [dstq+strideq*1] + vpbroadcastq m2, [dstq+strideq*4] + vpblendd m1, m2, 0x30 + vpbroadcastq m2, [dstq+r6 ] + vpblendd m1, m2, 0xc0 + mova [cq+32*0], m3 + mova [cq+32*1], m3 + add cq, 32*2 + paddw m1, m0 + pmaxsw m1, m3 
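+ ; (m3 is zero and m4 holds the pixel max, 10 or 12 bpc depending on the
+ ; caller, so this pmaxsw/pminsw pair clamps the sum to the valid range)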
+ pminsw m1, m4 + vextracti128 xm2, m1, 1 + movq [dstq+strideq*0], xm1 + movhps [dstq+strideq*1], xm1 + movq [dstq+strideq*4], xm2 + movhps [dstq+r6 ], xm2 + lea dstq, [dstq+strideq*2] + ret + +INV_TXFM_4X16_FN dct, dct, 12 +INV_TXFM_4X16_FN dct, identity, 12 +INV_TXFM_4X16_FN dct, adst, 12 +INV_TXFM_4X16_FN dct, flipadst, 12 + +cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + jmp m(idct_4x16_internal_10bpc).pass1 +.pass2: + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m9, m2, m3 + punpckhdq m2, m3 + punpckldq m1, m4, m5 + punpckhdq m4, m5 + punpckldq m3, m6, m7 + punpckhdq m6, m7 + punpcklqdq m5, m0, m2 ; 2 6 + punpckhqdq m12, m0, m2 ; 3 7 + punpcklqdq m0, m8, m9 ; 0 4 + punpckhqdq m10, m8, m9 ; 1 5 + punpcklqdq m2, m1, m3 ; 8 12 + punpckhqdq m13, m1, m3 ; 9 13 + punpcklqdq m9, m4, m6 ; 10 14 + punpckhqdq m4, m6 ; 11 15 + vperm2i128 m1, m5, m9, 0x20 ; 2 10 + vperm2i128 m3, m9, m5, 0x31 ; 14 6 + vpermq m11, m4, q1302 ; 15 11 + ; interleave + REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m10 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3, m10, m11, m12, m13 + REPX {pminsd x, m9}, m0, m1, m2, m3, m10, m11, m12, m13 + call m(idct_16x4_internal_10bpc).pass1_main + vpermq m6, m12, q1302 ; 7 3 + vpermq m5, m13, q3120 ; 9 13 + call m(idct_16x4_internal_10bpc).pass1_main2 + call m(idct_16x4_internal_10bpc).pass1_main3 + REPX {psrad x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + mova m4, [idct16_12_shuf] + REPX {vpermd x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m9, [pw_16384] + vpbroadcastd m8, [pixel_12bpc_max] + call m(idct_4x16_internal_10bpc).pass2_end + RET + +INV_TXFM_4X16_FN adst, dct, 12 +INV_TXFM_4X16_FN adst, adst, 12 +INV_TXFM_4X16_FN adst, flipadst, 12 +INV_TXFM_4X16_FN adst, identity, 12 + +cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + call .main_pass1 + psrad m0, m4, 12 + psrad m1, m5, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, m8, 12 + psrad m5, m9, 12 + psrad m6, 12 + psrad m7, 12 + jmp tx2q +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call .transpose_16x4 + call m(iadst_4x16_internal_10bpc).main2 + pshufd m4, m5, q1032 + psrad m5, m6, 3 + pshufd m6, m7, q1032 + psrad m7, m8, 3 + REPX {pshufd x, x, q1032}, m0, m2 + REPX {psrad x, 3}, m0, m1, m2, m3, m4, m6 +.pass2_end: + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + mova m4, [iadst16_12_shuf] + REPX {vpermd x, m4, x}, m0, m1, m2, m3 + vpbroadcastd m9, [pw_16384] + vpbroadcastd m8, [pixel_12bpc_max] + lea r6, [strideq*3] + pxor m7, m7 + pmulhrsw m0, m9 + call m(iadst_4x16_internal_10bpc).write_4x4 + pmulhrsw m0, m9, m1 + call m(iadst_4x16_internal_10bpc).write_4x4 + pmulhrsw m0, m9, m2 + call m(iadst_4x16_internal_10bpc).write_4x4 + pmulhrsw m0, m9, m3 + call m(iadst_4x16_internal_10bpc).write_4x4 + RET +ALIGN function_align +.transpose_16x4: + ; transpose & interleave + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m9, m2, m3 + punpckhdq m2, m3 + punpckldq m1, m4, m5 + punpckhdq m4, m5 + punpckldq m3, m6, m7 + punpckhdq m6, m7 + punpcklqdq m10, m8, m0 + punpckhqdq m0, m8 + punpcklqdq m11, m9, m2 + punpckhqdq m2, m9 + punpcklqdq m8, m1, m4 + punpckhqdq m4, m1 + punpcklqdq m9, m3, m6 + punpckhqdq m6, m3 + vperm2i128 m5, m0, m2, 0x31 ; 7 5 + vperm2i128 m7, m0, m2, 
0x20 ; 3 1 + vperm2i128 m0, m10, m11, 0x20 ; 0 2 + vperm2i128 m2, m10, m11, 0x31 ; 4 6 + vperm2i128 m1, m4, m6, 0x31 ; 15 13 + vperm2i128 m3, m4, m6, 0x20 ; 11 9 + vperm2i128 m4, m8, m9, 0x20 ; 8 10 + vperm2i128 m6, m8, m9, 0x31 ; 12 14 + ret +ALIGN function_align +.main_pass1: + call m(iadst_16x4_internal_10bpc).main + vpbroadcastd m6, [pd_3072] + paddd m10, m4, m5 + psubd m4, m3 + psubd m5, m3 + paddd m3, m10 + psubd m8, m7, m1 + paddd m7, m9 + psubd m9, m1 + paddd m7, m1 + REPX {psrad x, 1 }, m4, m5, m2, m3, m8, m9, m0, m7 + REPX {paddd x, m6}, m4, m5, m2, m3, m8, m9, m7 + paddd m6, m0 + ret + +INV_TXFM_4X16_FN flipadst, dct, 12 +INV_TXFM_4X16_FN flipadst, adst, 12 +INV_TXFM_4X16_FN flipadst, flipadst, 12 +INV_TXFM_4X16_FN flipadst, identity, 12 + +cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + call m(iadst_4x16_internal_12bpc).main_pass1 + psrad m0, m3, 12 + psrad m1, m2, 12 + psrad m2, m5, 12 + psrad m3, m4, 12 + psrad m4, m7, 12 + psrad m5, m6, 12 + psrad m6, m9, 12 + psrad m7, m8, 12 + jmp tx2q +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_4x16_internal_12bpc).transpose_16x4 + call m(iadst_4x16_internal_10bpc).main2 + pshufd m4, m3, q1032 + psrad m3, m5, 3 + psrad m5, m2, 3 + pshufd m2, m6, q1032 + pshufd m6, m1, q1032 + psrad m1, m7, 3 + psrad m7, m0, 3 + pshufd m0, m8, q1032 + REPX {psrad x, 3}, m0, m2, m4, m6 + jmp m(iadst_4x16_internal_12bpc).pass2_end + +INV_TXFM_4X16_FN identity, dct, 12 +INV_TXFM_4X16_FN identity, adst, 12 +INV_TXFM_4X16_FN identity, flipadst, 12 +INV_TXFM_4X16_FN identity, identity, 12 + +cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 + vpbroadcastd m8, [pd_1697] + mova m0, [cq+32*0] + mova m4, [cq+32*1] + mova m1, [cq+32*2] + mova m5, [cq+32*3] + vpbroadcastd m9, [pd_6144] + pmulld m2, m8, m0 + pmulld m6, m8, m4 + pmulld m3, m8, m1 + pmulld m7, m8, m5 + mova m10, [cq+32*4] + mova m11, [cq+32*5] + mova m12, [cq+32*6] + mova m13, [cq+32*7] + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m0, m2 + pmulld m2, m8, m10 + paddd m4, m6 + pmulld m6, m8, m11 + paddd m1, m3 + pmulld m3, m8, m12 + paddd m5, m7 + pmulld m7, m8, m13 + REPX {psrad x, 1 }, m0, m4, m1, m5 + REPX {paddd x, m9}, m2, m6, m3, m7 + REPX {psrad x, 12}, m2, m6, m3, m7 + paddd m2, m10 + paddd m6, m11 + paddd m3, m12 + paddd m7, m13 + REPX {psrad x, 1 }, m2, m6, m3, m7 + jmp tx2q +.pass2: + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m8, [pd_5793] + vpbroadcastd m9, [pd_1024] + REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7 + packssdw m0, m4 + packssdw m1, m5 + packssdw m2, m6 + packssdw m3, m7 + vpbroadcastd m8, [pw_16384] + vpbroadcastd m4, [pixel_12bpc_max] + call m(iidentity_4x16_internal_10bpc).pass2_end + RET + +%macro INV_TXFM_8X4_FN 2-3 10 ; type1, type2, bitdepth + INV_TXFM_FN %1, %2, 0, 8x4, %3 +%ifidn %1_%2, dct_dct + vpbroadcastd m2, [dconly_%3bpc] +%if %3 = 10 +.dconly: + imul r6d, [cq], 181 + mov [cq], eobd ; 0 + or r3d, 4 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128 + sar r6d, 8 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +%else + jmp 
m(inv_txfm_add_dct_dct_8x4_10bpc).dconly +%endif +%endif +%endmacro + +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, identity +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst + +cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] +.pass1: + vbroadcasti128 m1, [cq+16*1] + vbroadcasti128 m0, [cq+16*5] + vbroadcasti128 m2, [cq+16*3] + vbroadcasti128 m3, [cq+16*7] + vpbroadcastd m6, [pd_2896] + shufpd m1, m0, 0x0c ; 1 5 + shufpd m3, m2, 0x0c ; 7 3 + vbroadcasti128 m0, [cq+16*0] + vbroadcasti128 m4, [cq+16*2] + vbroadcasti128 m2, [cq+16*4] + vbroadcasti128 m5, [cq+16*6] + vpbroadcastd m7, [pd_2048] + shufpd m0, m4, 0x0c ; 0 2 + shufpd m2, m5, 0x0c ; 4 6 + REPX {pmulld x, m6}, m1, m3, m0, m2 + REPX {paddd x, m7}, m1, m3, m0, m2 + REPX {psrad x, 12}, m1, m3, m0, m2 + call .main + psubd m3, m0, m4 ; out7 out6 (interleaved) + paddd m0, m4 ; out0 out1 (interleaved) + paddd m1, m2, m5 ; out3 out2 (interleaved) + psubd m2, m5 ; out4 out5 (interleaved) + pshufd m1, m1, q1032 + pshufd m3, m3, q1032 + jmp tx2q +.pass2: + vbroadcasti128 m4, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + vperm2i128 m1, m0, m2, 0x31 + vinserti128 m0, xm2, 1 + pshufb m0, m4 + pshufb m1, m4 + IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7 + vpermq m0, m0, q3120 ; out0 out1 + vpermq m2, m1, q2031 ; out2 out3 + jmp m(iadst_8x4_internal_10bpc).end +ALIGN function_align +.main: + ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1 + IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7 + vpbroadcastd m6, [pd_2896] + punpcklqdq m4, m1, m3 ; t4a t7a + punpckhqdq m1, m3 ; t5a t6a + psubd m3, m4, m1 ; t5a t6a + paddd m4, m1 ; t4 t7 + REPX {pmaxsd x, m8}, m3, m4, m0, m2 + REPX {pminsd x, m9}, m3, m4, m0, m2 + pmulld m3, m6 + pshufd m1, m3, q1032 + paddd m3, m7 + psubd m5, m3, m1 + paddd m1, m3 + psrad m5, 12 + psrad m1, 12 + vpblendd m5, m4, 0x33 ; t4 t5 + punpckhqdq m4, m1 ; t7 t6 + ret + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_10bpc).main + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 + jmp tx2q +.pass2: + call .pass2_main + vpermq m0, m0, q3120 ; out0 out1 + vpermq m2, m1, q3120 ; out2 out3 +.end: + vpbroadcastd m1, [pw_2048] + pmulhrsw m0, m1 + pmulhrsw m1, m2 + vpbroadcastd m5, [pixel_10bpc_max] +.end2: + mova xm2, [dstq+strideq*0] + vinserti128 m2, [dstq+strideq*1], 1 + lea r6, [dstq+strideq*2] + mova xm3, [r6 +strideq*0] + vinserti128 m3, [r6 +strideq*1], 1 + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*1], m0, 1 + mova [r6 +strideq*0], xm1 + vextracti128 [r6 +strideq*1], m1, 1 + RET +ALIGN function_align +.pass2_main: + vbroadcasti128 m4, [deint_shuf] + packssdw m0, m1 + packssdw m2, m3 + lea r6, [deint_shuf+128] + vperm2i128 m1, m0, m2, 0x31 + vinserti128 m0, xm2, 1 + pshufb m0, m4 + pshufb m1, m4 + jmp m(iadst_8x4_internal_8bpc).main +ALIGN function_align +.main: + vpbroadcastd m1, [pd_2896] + pmulld m0, m1, [cq+32*0] + pmulld m3, m1, [cq+32*3] + pmulld m2, m1, [cq+32*2] + pmulld m1, [cq+32*1] + vpbroadcastd m4, [pd_2048] + REPX {paddd x, m4}, m0, m3, m2, m1 + REPX {psrad x, 12}, m0, m3, m2, m1 +.main2: + IADST4_1D + 
ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, stride, c, eob, tx2 + call m(iadst_4x8_internal_10bpc).main + shufpd m3, m4, m0, 0x05 + shufpd m0, m4, 0x05 + psignd m2, m6 + pshufd m6, m6, q1032 + pshufd m1, m2, q1032 + psignd m2, m5, m6 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_10bpc).pass2_main + vpermq m2, m0, q2031 + vpermq m0, m1, q2031 + jmp m(iadst_8x4_internal_10bpc).end + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, stride, c, eob, tx2 +.pass1: + vpbroadcastd m4, [pd_2896] + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpermq m2, [cq+32*2], q3120 + vpermq m3, [cq+32*3], q3120 + vpbroadcastd m7, [pd_2048] + REPX {pmulld x, m4}, m0, m1, m2, m3 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 12}, m0, m1, m2, m3 + REPX {paddd x, x }, m0, m1, m2, m3 + jmp tx2q +.pass2: + vpbroadcastd m5, [pixel_10bpc_max] + vpbroadcastd m4, [pw_1697x8] + packssdw m0, m1 + packssdw m2, m3 + pmulhrsw m1, m4, m0 + pmulhrsw m4, m2 + paddsw m0, m1 + paddsw m2, m4 + packssdw m7, m7 ; pw_2048 +.pass2_end: + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + lea r6, [dstq+strideq*2] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m7 + pmulhrsw m0, m7 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + mova xm2, [dstq+strideq*0] + vinserti128 m2, [r6 +strideq*0], 1 + mova xm3, [dstq+strideq*1] + vinserti128 m3, [r6 +strideq*1], 1 + pxor m4, m4 + REPX {mova [cq+32*x], m4}, 0, 1, 2, 3 + paddw m0, m2 + paddw m1, m3 + pmaxsw m0, m4 + pmaxsw m1, m4 + pminsw m0, m5 + pminsw m1, m5 + mova [dstq+strideq*0], xm0 + mova [dstq+strideq*1], xm1 + vextracti128 [r6 +strideq*0], m0, 1 + vextracti128 [r6 +strideq*1], m1, 1 + RET + +INV_TXFM_8X4_FN dct, dct, 12 +INV_TXFM_8X4_FN dct, identity, 12 +INV_TXFM_8X4_FN dct, adst, 12 +INV_TXFM_8X4_FN dct, flipadst, 12 + +cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + jmp m(idct_8x4_internal_10bpc).pass1 +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call m(iadst_8x4_internal_12bpc).transpose_4x8 + IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 + jmp m(iadst_8x4_internal_12bpc).end + +INV_TXFM_8X4_FN adst, dct, 12 +INV_TXFM_8X4_FN adst, adst, 12 +INV_TXFM_8X4_FN adst, flipadst, 12 +INV_TXFM_8X4_FN adst, identity, 12 + +cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 + vpbroadcastd m8, [clip_20b_min] + vpbroadcastd m9, [clip_20b_max] + call m(iadst_4x8_internal_10bpc).main2 + vpblendd m3, m0, m4, 0x33 ; out6 out7 + vpblendd m0, m4, 0xcc ; out0 out1 + pshufd m1, m5, q1032 + psignd m2, m6 ; out4 out5 + psignd m1, m6 ; out2 out3 + jmp tx2q +.pass2: + vpbroadcastd m8, [clip_18b_min] + vpbroadcastd m9, [clip_18b_max] + REPX {pmaxsd x, m8}, m0, m1, m2, m3 + REPX {pminsd x, m9}, m0, m1, m2, m3 + call .pass2_main + vpbroadcastd m5, [pd_2048] + paddd m0, m5, m4 + paddd m1, m5, m6 + paddd m2, m5 + paddd m3, m5 +.pass2_end: + REPX {psrad x, 12}, m0, m1, m2, m3 +.end: + vpbroadcastd m4, [pw_16384] + REPX {psrad x, 3}, m0, m1, m2, m3 + packssdw m0, m1 + packssdw m2, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m2, m4 + vpermq m0, m0, q3120 ; out0 out1 + vpermq m1, m1, q3120 ; out2 out3 + vpbroadcastd 
+    vpbroadcastd m4, [pw_16384]
+    REPX {psrad x, 3}, m0, m1, m2, m3
+    packssdw m0, m1
+    packssdw m2, m3
+    pmulhrsw m0, m4
+    pmulhrsw m1, m2, m4
+    vpermq m0, m0, q3120 ; out0 out1
+    vpermq m1, m1, q3120 ; out2 out3
+    vpbroadcastd m5, [pixel_12bpc_max]
+    jmp m(iadst_8x4_internal_10bpc).end2
+ALIGN function_align
+.pass2_main:
+    call .transpose_4x8
+    jmp m(iadst_8x4_internal_10bpc).main2
+ALIGN function_align
+.transpose_4x8:
+    ; deinterleave
+    pshufd m0, m0, q3120
+    pshufd m1, m1, q3120
+    pshufd m2, m2, q3120
+    pshufd m3, m3, q3120
+    ; transpose
+    punpcklqdq m4, m0, m1
+    punpckhqdq m0, m1
+    punpcklqdq m5, m2, m3
+    punpckhqdq m2, m3
+    vperm2i128 m1, m0, m2, 0x20 ; out1
+    vperm2i128 m3, m0, m2, 0x31 ; out3
+    vperm2i128 m2, m4, m5, 0x31 ; out2
+    vperm2i128 m0, m4, m5, 0x20 ; out0
+    ret
+
+INV_TXFM_8X4_FN flipadst, dct, 12
+INV_TXFM_8X4_FN flipadst, adst, 12
+INV_TXFM_8X4_FN flipadst, flipadst, 12
+INV_TXFM_8X4_FN flipadst, identity, 12
+
+cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [clip_20b_min]
+    vpbroadcastd m9, [clip_20b_max]
+    call m(iadst_4x8_internal_10bpc).main2
+    shufpd m3, m4, m0, 0x05
+    shufpd m0, m4, 0x05
+    psignd m2, m6
+    pshufd m6, m6, q1032
+    pshufd m1, m2, q1032
+    psignd m2, m5, m6
+    jmp tx2q
+.pass2:
+    vpbroadcastd m8, [clip_18b_min]
+    vpbroadcastd m9, [clip_18b_max]
+    REPX {pmaxsd x, m8}, m0, m1, m2, m3
+    REPX {pminsd x, m9}, m0, m1, m2, m3
+    call m(iadst_8x4_internal_12bpc).pass2_main
+    vpbroadcastd m5, [pd_2048]
+    paddd m0, m5, m3
+    paddd m1, m5, m2
+    paddd m3, m5, m4
+    paddd m2, m5, m6
+    jmp m(iadst_8x4_internal_12bpc).pass2_end
+
+INV_TXFM_8X4_FN identity, dct, 12
+INV_TXFM_8X4_FN identity, adst, 12
+INV_TXFM_8X4_FN identity, flipadst, 12
+INV_TXFM_8X4_FN identity, identity, 12
+
+cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2
+    jmp m(iidentity_8x4_internal_10bpc).pass1
+.pass2:
+    ; m0 = in0 in1 (interleaved)
+    ; m1 = in2 in3 (interleaved)
+    ; m2 = in4 in5 (interleaved)
+    ; m3 = in6 in7 (interleaved)
+    vpbroadcastd m8, [clip_18b_min]
+    vpbroadcastd m9, [clip_18b_max]
+    REPX {pmaxsd x, m8}, m0, m1, m2, m3
+    REPX {pminsd x, m9}, m0, m1, m2, m3
+    vpbroadcastd m4, [pd_5793]
+    REPX {pmulld x, m4}, m0, m1, m2, m3
+    REPX {paddd x, m7}, m0, m1, m2, m3
+    REPX {psrad x, 15}, m0, m1, m2, m3
+    vpbroadcastd m5, [pixel_12bpc_max]
+    vpbroadcastd m7, [pw_16384]
+    packssdw m0, m1
+    packssdw m2, m3
+    jmp m(iidentity_8x4_internal_10bpc).pass2_end
+
+%macro INV_TXFM_8X8_FN 2-3 10 ; type1, type2, bitdepth
+    INV_TXFM_FN %1, %2, 0, 8x8, %3
+%ifidn %1_%2, dct_dct
+    vpbroadcastd m2, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+    imul r6d, [cq], 181
+    mov [cq], eobd ; 0
+    or r3d, 8
+.dconly2:
+    add r6d, 384
+    sar r6d, 9
+.dconly3:
+    imul r6d, 181
+    add r6d, 2176
+    sar r6d, 12
+    movd xm0, r6d
+    paddsw xm0, xm2
+    vpbroadcastw m0, xm0
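+    ; m2 = 32768 - (pixel_max+1) (0x7c00 for 10-bit); folding it into the
+    ; dc value means the saturating paddsw below tops out exactly at
+    ; pixel_max, and psubusw removes the bias again while clamping
+    ; negative results to zero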
+.dconly_loop:
+    mova xm1, [dstq+strideq*0]
+    vinserti128 m1, [dstq+strideq*1], 1
+    paddsw m1, m0
+    psubusw m1, m2
+    mova [dstq+strideq*0], xm1
+    vextracti128 [dstq+strideq*1], m1, 1
+    lea dstq, [dstq+strideq*2]
+    sub r3d, 2
+    jg .dconly_loop
+    RET
+%else
+    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+%macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2]
+    ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a
+    ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a
+    ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a
+    ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a
+    psubd m%9, m%3, m%7 ; t6
+    paddd m%3, m%7 ; t2
+    psubd m%7, m%1, m%5 ; t4
+    paddd m%1, m%5 ; t0
+    psubd m%5, m%6, m%2 ; t7
+    paddd m%6, m%2 ; t3
+    psubd m%2, m%8, m%4 ; t5
+    paddd m%8, m%4 ; t1
+    REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
+    REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
+    ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a
+    ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a
+    psubd m%10, m%7, m%9 ; t7
+    paddd m%7, m%9 ; out6
+    vpbroadcastd m%9, [pd_1448]
+    psubd m%4, m%8, m%6 ; t3
+    paddd m%8, m%6 ; -out7
+    psubd m%6, m%1, m%3 ; t2
+    paddd m%1, m%3 ; out0
+    psubd m%3, m%2, m%5 ; t6
+    paddd m%2, m%5 ; -out1
+    REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10
+    REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10
+    REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10
+    psubd m%5, m%6, m%4 ; (t2 - t3) * 1448
+    paddd m%4, m%6 ; (t2 + t3) * 1448
+    psubd m%6, m%3, m%10 ; (t6 - t7) * 1448
+    paddd m%3, m%10 ; (t6 + t7) * 1448
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    mova m0, [cq+32*0]
+    mova m1, [cq+32*1]
+    mova m2, [cq+32*2]
+    mova m3, [cq+32*3]
+    mova m4, [cq+32*4]
+    mova m5, [cq+32*5]
+    mova m6, [cq+32*6]
+    mova m7, [cq+32*7]
+    vpbroadcastd m11, [pd_2048]
+    call .main
+    call .round_shift1
+    jmp tx2q
+.pass2:
+    call .transpose_8x8_packed
+    call m(idct_8x8_internal_8bpc).main
+    vpbroadcastd m12, [pw_2048]
+    vpermq m0, m0, q3120
+    vpermq m1, m1, q2031
+    vpermq m2, m2, q3120
+    vpermq m3, m3, q2031
+    pmulhrsw m0, m12
+    pmulhrsw m1, m12
+    call .write_8x4_start
+    pmulhrsw m0, m2, m12
+    pmulhrsw m1, m3, m12
+    call .write_8x4
+    RET
+ALIGN function_align
+.write_8x4_start:
+    vpbroadcastd m11, [pixel_10bpc_max]
+    lea r6, [strideq*3]
+    pxor m10, m10
+.write_8x4:
+    mova xm8, [dstq+strideq*0]
+    vinserti128 m8, [dstq+strideq*1], 1
+    mova xm9, [dstq+strideq*2]
+    vinserti128 m9, [dstq+r6 ], 1
+    mova [cq+32*0], m10
+    mova [cq+32*1], m10
+    mova [cq+32*2], m10
+    mova [cq+32*3], m10
+    add cq, 32*4
+    paddw m0, m8
+    paddw m1, m9
+    pmaxsw m0, m10
+    pmaxsw m1, m10
+    pminsw m0, m11
+    pminsw m1, m11
+    mova [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    mova [dstq+strideq*2], xm1
+    vextracti128 [dstq+r6 ], m1, 1
+    lea dstq, [dstq+strideq*4]
+    ret
+ALIGN function_align
+.transpose_8x8_packed:
+    packssdw m0, m4
+    packssdw m1, m5
+    packssdw m2, m6
+    packssdw m3, m7
+    lea r6, [deint_shuf+128]
+    punpckhwd m4, m0, m1
+    punpcklwd m0, m1
+    punpckhwd m1, m2, m3
+    punpcklwd m2, m3
+    punpckhdq m3, m0, m2
+    punpckldq m0, m2
+    punpckhdq m2, m4, m1
+    punpckldq m4, m1
+    vinserti128 m1, m3, xm2, 1
+    vperm2i128 m3, m2, 0x31
+    vperm2i128 m2, m0, m4, 0x31
+    vinserti128 m0, xm4, 1
+    ret
+ALIGN function_align
+.main_rect2:
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main:
+    ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a
+    ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a
+    ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3
+    paddd m8, m1, m5 ; t4
+    psubd m1, m5 ; t5a
+    paddd m9, m7, m3 ; t7
+    psubd m7, m3 ; t6a
+    vpbroadcastd m3, [pd_2896]
+    REPX {pmaxsd x, m12}, m1, m8, m7, m9
+    REPX {pminsd x, m13}, m1, m8, m7, m9
+    REPX {pmulld x, m3 }, m0, m4, m7, m1
+    paddd m0, m11
+    paddd m7, m11
+    psubd m5, m0, m4
+    paddd m0, m4
+    psubd m4, m7, m1
+    paddd m7, m1
+    REPX {psrad x, 12 }, m5, m0, m4, m7
+    psubd m3, m0, m6 ; dct4 out3
+    paddd m0, m6 ; dct4 out0
+    paddd m6, m5, m2 ; dct4 out1
+    psubd m5, m2 ; dct4 out2
+    REPX {pmaxsd x, m12}, m0, m6, m5, m3
+    REPX {pminsd x, m13}, m0, m6, m5, m3
+    ret
+ALIGN function_align
+.round_shift1:
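+    ; pcmpeqd makes m1 all-ones (-1); subtracting -1 adds the +1 rounding
+    ; bias to one operand of each sum/difference, so all eight outputs
+    ; come out rounded after the final psrad by 1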
+    pcmpeqd m1, m1
+    REPX {psubd x, m1}, m0, m6, m5, m3
+    paddd m1, m6, m7 ; out1
+    psubd m6, m7 ; out6
+    psubd m7, m0, m9 ; out7
+    paddd m0, m9 ; out0
+    paddd m2, m5, m4 ; out2
+    psubd m5, m4 ; out5
+    psubd m4, m3, m8 ; out4
+    paddd m3, m8 ; out3
+    REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+    ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    call .main
+    call .main_end
+    jmp tx2q
+.pass2:
+    call m(idct_8x8_internal_10bpc).transpose_8x8_packed
+    pshufd m4, m0, q1032
+    pshufd m5, m1, q1032
+    call m(iadst_8x8_internal_8bpc).main_pass2
+    vpbroadcastd m5, [pw_2048]
+    vpbroadcastd xm12, [pw_4096]
+    psubw m12, m5
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+    pmulhrsw m0, m12
+    pmulhrsw m1, m12
+    call m(idct_8x8_internal_10bpc).write_8x4_start
+    pmulhrsw m0, m2, m12
+    pmulhrsw m1, m3, m12
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+ALIGN function_align
+.main:
+    mova m0, [cq+32*0]
+    mova m7, [cq+32*7]
+    mova m1, [cq+32*1]
+    mova m6, [cq+32*6]
+    mova m2, [cq+32*2]
+    mova m5, [cq+32*5]
+    mova m3, [cq+32*3]
+    mova m4, [cq+32*4]
+    vpbroadcastd m11, [pd_2048]
+.main2:
+    IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+    psrld m8, 10 ; pd_1
+    vpbroadcastd m9, [pd_3072]
+    ret
+ALIGN function_align
+.main_end:
+    paddd m0, m8
+    psubd m1, m8, m1
+    paddd m6, m8
+    psubd m7, m8, m7
+    REPX {psrad x, 1 }, m0, m1, m6, m7
+    ; (1 + ((x + 1024) >> 11)) >> 1 = (3072 + x) >> 12
+    ; (1 - ((x + 1024) >> 11)) >> 1 = (3071 - x) >> 12
+    psubd m8, m9, m8 ; pd_3071
+    paddd m2, m9
+    psubd m3, m8, m3
+    paddd m4, m9
+    psubd m5, m8, m5
+    REPX {psrad x, 12}, m2, m3, m4, m5
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    call m(iadst_8x8_internal_10bpc).main
+    call .main_end
+    jmp tx2q
+.pass2:
+    call m(idct_8x8_internal_10bpc).transpose_8x8_packed
+    pshufd m4, m0, q1032
+    pshufd m5, m1, q1032
+    call m(iadst_8x8_internal_8bpc).main_pass2
+    vpbroadcastd m12, [pw_2048]
+    vpbroadcastd xm5, [pw_4096]
+    psubw m12, m5
+    vpermq m8, m3, q2031
+    vpermq m9, m2, q2031
+    vpermq m2, m1, q2031
+    vpermq m3, m0, q2031
+    pmulhrsw m0, m8, m12
+    pmulhrsw m1, m9, m12
+    call m(idct_8x8_internal_10bpc).write_8x4_start
+    pmulhrsw m0, m2, m12
+    pmulhrsw m1, m3, m12
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+ALIGN function_align
+.main_end:
+    paddd m10, m8, m0
+    psubd m0, m8, m7
+    psubd m7, m8, m1
+    paddd m1, m8, m6
+    psrad m0, 1
+    psrad m1, 1
+    psrad m6, m7, 1
+    psrad m7, m10, 1
+    psubd m8, m9, m8 ; pd_3071
+    psubd m10, m8, m5
+    paddd m5, m9, m2
+    psubd m2, m8, m3
+    paddd m3, m9, m4
+    psrad m4, m2, 12
+    psrad m2, m10, 12
+    psrad m3, 12
+    psrad m5, 12
+    ret
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+.pass1:
+    mova m0, [cq+32*0]
+    mova m1, [cq+32*1]
+    mova m2, [cq+32*2]
+    mova m3, [cq+32*3]
+    mova m4, [cq+32*4]
+    mova m5, [cq+32*5]
+    mova m6, [cq+32*6]
+    mova m7, [cq+32*7]
+    jmp tx2q
+.pass2:
+    packssdw m3, m7
+    vpbroadcastd m7, [pixel_10bpc_max]
+.pass2_main:
+    packssdw m0, m4
+    packssdw m1, m5
+    packssdw m2, m6
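+    ; identity 8x8: both 1-D transforms are pure scalings, so the whole
+    ; second pass reduces to a packed transpose plus a single pmulhrsw by
+    ; pw_4096 (net x1/8), which folds the two Identity8 gains into the
+    ; final rounding multiply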
+    vpbroadcastd m12, [pw_4096]
+    punpckhwd m4, m0, m1
+    punpcklwd m0, m1
+    punpckhwd m1, m2, m3
+    punpcklwd m2, m3
+    punpckhdq m3, m0, m2
+    punpckldq m0, m2
+    punpckldq m2, m4, m1
+    punpckhdq m4, m1
+    punpckhqdq m1, m0, m2 ; 1 5
+    punpcklqdq m0, m2 ; 0 4
+    punpcklqdq m2, m3, m4 ; 2 6
+    punpckhqdq m3, m4 ; 3 7
+    pmulhrsw m0, m12
+    pmulhrsw m1, m12
+    call .write_2x8x2_start
+    pmulhrsw m0, m2, m12
+    pmulhrsw m1, m3, m12
+    call .write_2x8x2_zero
+    RET
+.write_2x8x2_start:
+    lea r6, [strideq*5]
+    pxor m6, m6
+.write_2x8x2_zero:
+    mova [cq+32*0], m6
+    mova [cq+32*1], m6
+    mova [cq+32*2], m6
+    mova [cq+32*3], m6
+    add cq, 32*4
+.write_2x8x2:
+    mova xm4, [dstq+strideq*0]
+    vinserti128 m4, [dstq+strideq*4], 1
+    mova xm5, [dstq+strideq*1]
+    vinserti128 m5, [dstq+r6 ], 1
+    paddw m0, m4
+    paddw m1, m5
+    pmaxsw m0, m6
+    pmaxsw m1, m6
+    pminsw m0, m7
+    pminsw m1, m7
+    mova [dstq+strideq*0], xm0
+    mova [dstq+strideq*1], xm1
+    vextracti128 [dstq+strideq*4], m0, 1
+    vextracti128 [dstq+r6 ], m1, 1
+    lea dstq, [dstq+strideq*2]
+    ret
+
+%macro TRANSPOSE_8X8_DWORD 12 ; src/dst[1-8], tmp[1-4]
+    punpckldq m%9, m%1, m%2 ; aibj emfn
+    punpckhdq m%1, m%2 ; ckdl gohp
+    punpckldq m%10, m%3, m%4 ; qyrz uCvD
+    punpckhdq m%3, m%4 ; sAtB wExF
+    punpckldq m%11, m%5, m%6 ; GOHP KSLT
+    punpckhdq m%5, m%6 ; IQJR MUNV
+    punpckldq m%12, m%7, m%8 ; WeXf aibj
+    punpckhdq m%7, m%8 ; YgZh ckdl
+    punpcklqdq m%2, m%9, m%10 ; aiqy emuC
+    punpckhqdq m%9, m%10 ; bjrz fnvD
+    punpcklqdq m%4, m%1, m%3 ; cksA gowE
+    punpckhqdq m%10, m%1, m%3 ; dltB hpxF
+    punpcklqdq m%6, m%11, m%12 ; GOWe KSai
+    punpckhqdq m%11, m%12 ; HPXf LTbj
+    punpcklqdq m%8, m%5, m%7 ; IQYg MUck
+    punpckhqdq m%12, m%5, m%7 ; JRZh NVdl
+    vperm2i128 m%1, m%2, m%6, 0x20 ; out0
+    vperm2i128 m%5, m%2, m%6, 0x31 ; out4
+    vperm2i128 m%2, m%9, m%11, 0x20 ; out1
+    vperm2i128 m%6, m%9, m%11, 0x31 ; out5
+    vperm2i128 m%3, m%4, m%8, 0x20 ; out2
+    vperm2i128 m%7, m%4, m%8, 0x31 ; out6
+    vperm2i128 m%4, m%10, m%12, 0x20 ; out3
+    vperm2i128 m%8, m%10, m%12, 0x31 ; out7
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct, 12
+INV_TXFM_8X8_FN dct, identity, 12
+INV_TXFM_8X8_FN dct, adst, 12
+INV_TXFM_8X8_FN dct, flipadst, 12
+
+cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(idct_8x8_internal_10bpc).pass1
+.pass2:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    call .transpose_8x8
+    vpbroadcastd m11, [pd_2048]
+    call m(idct_8x8_internal_10bpc).main
+    call .round_shift4
+    jmp m(iadst_8x8_internal_12bpc).pass2_end
+ALIGN function_align
+.write_8x4_start:
+    vpbroadcastd m11, [pixel_12bpc_max]
+    lea r6, [strideq*3]
+    pxor m10, m10
+    ret
+ALIGN function_align
+.transpose_8x8:
+    TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+    ret
+ALIGN function_align
+.round_shift4:
+    vpbroadcastd m1, [pd_8]
+    REPX {paddd x, m1}, m0, m6, m5, m3
+    paddd m1, m6, m7 ; out1
+    psubd m6, m7 ; out6
+    psubd m7, m0, m9 ; out7
+    paddd m0, m9 ; out0
+    paddd m2, m5, m4 ; out2
+    psubd m5, m4 ; out5
+    psubd m4, m3, m8 ; out4
+    paddd m3, m8 ; out3
+    REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
+    ret
+
+INV_TXFM_8X8_FN adst, dct, 12
+INV_TXFM_8X8_FN adst, adst, 12
+INV_TXFM_8X8_FN adst, flipadst, 12
+INV_TXFM_8X8_FN adst, identity, 12
+
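+; the 12bpc functions run the shared 10bpc first pass, just with the
+; wider 20-bit clip range; values are re-clamped to 18 bits at the
+; start of the second pass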
+cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iadst_8x8_internal_10bpc).pass1
+.pass2:
+    call .pass2_main
+.pass2_end:
+    packssdw m0, m1
+    packssdw m1, m2, m3
+    REPX {vpermq x, x, q3120}, m0, m1
+    call m(idct_8x8_internal_12bpc).write_8x4_start
+    call m(idct_8x8_internal_10bpc).write_8x4
+    packssdw m0, m4, m5
+    packssdw m1, m6, m7
+    REPX {vpermq x, x, q3120}, m0, m1
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+ALIGN function_align
+.pass2_main:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_8x8_internal_12bpc).transpose_8x8
+    vpbroadcastd m11, [pd_2048]
+.pass2_main2:
+    call m(iadst_8x8_internal_10bpc).main2
+    pslld m9, m8, 3 ; pd_8
+    paddd m0, m9
+    psubd m1, m9, m1 ; 8+x
+    paddd m6, m9
+    psubd m7, m9, m7
+    REPX {psrad x, 4}, m0, m1, m6, m7
+    vpbroadcastd m9, [pd_17408]
+    psubd m8, m9, m8 ; 17407
+    paddd m2, m9
+    psubd m3, m8, m3
+    paddd m4, m9
+    psubd m5, m8, m5
+    REPX {psrad x, 15}, m2, m3, m4, m5
+    ret
+
+INV_TXFM_8X8_FN flipadst, dct, 12
+INV_TXFM_8X8_FN flipadst, adst, 12
+INV_TXFM_8X8_FN flipadst, flipadst, 12
+INV_TXFM_8X8_FN flipadst, identity, 12
+
+cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iflipadst_8x8_internal_10bpc).pass1
+.pass2:
+    call m(iadst_8x8_internal_12bpc).pass2_main
+    packssdw m7, m7, m6
+    packssdw m6, m1, m0
+    packssdw m1, m5, m4
+    vpermq m0, m7, q3120
+    vpermq m1, m1, q3120
+    call m(idct_8x8_internal_12bpc).write_8x4_start
+    call m(idct_8x8_internal_10bpc).write_8x4
+    packssdw m0, m3, m2
+    vpermq m0, m0, q3120
+    vpermq m1, m6, q3120
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+
+INV_TXFM_8X8_FN identity, dct, 12
+INV_TXFM_8X8_FN identity, adst, 12
+INV_TXFM_8X8_FN identity, flipadst, 12
+INV_TXFM_8X8_FN identity, identity, 12
+
+cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    jmp m(iidentity_8x8_internal_10bpc).pass1
+.pass2:
+    packssdw m3, m7
+    vpbroadcastd m7, [pixel_12bpc_max]
+    jmp m(iidentity_8x8_internal_10bpc).pass2_main
+
+%macro INV_TXFM_8X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth
+    INV_TXFM_FN %1, %2, %3, 8x16, %4
+%ifidn %1_%2, dct_dct
+    imul r6d, [cq], 181
+    vpbroadcastd m2, [dconly_%4bpc]
+    mov [cq], eobd ; 0
+    or r3d, 16
+    add r6d, 128
+    sar r6d, 8
+    imul r6d, 181
+    jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity, 35
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    vpbroadcastd m14, [pd_2896]
+    vpbroadcastd m11, [pd_2048]
+    cmp eobd, 43
+    jl .fast
+    add cq, 32
+    call .pass1_main
+    sub cq, 32
+    mova [cq+32* 1], m0
+    mova [cq+32* 3], m1
+    mova [cq+32* 5], m2
+    mova [cq+32* 7], m3
+    mova [cq+32* 9], m4
+    mova [cq+32*11], m5
+    mova [cq+32*13], m6
+    mova m15, m7
+    call .pass1_main
+    mova m8, [cq+32* 1]
+    mova m9, [cq+32* 3]
+    mova m10, [cq+32* 5]
+    mova m11, [cq+32* 7]
+    mova m12, [cq+32* 9]
+    mova m13, [cq+32*11]
+    mova m14, [cq+32*13]
+    jmp tx2q
+.fast:
+    call .pass1_main
+    pxor m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+    jmp tx2q
+.pass2:
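+    ; the rounded first pass leaves coefficients that fit in 16 bits,
+    ; so the second pass packs to words and reuses the 8bpc AVX2 kernel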
+    call .transpose
+    call m(idct_8x16_internal_8bpc).main
+    vpbroadcastd m12, [pw_2048]
+    REPX {vpermq x, x, q3120}, m0, m2, m4, m6
+    REPX {vpermq x, x, q2031}, m1, m3, m5, m7
+.end:
+    pmulhrsw m0, m12
+    pmulhrsw m1, m12
+    call m(idct_8x8_internal_10bpc).write_8x4_start
+    pmulhrsw m0, m2, m12
+    pmulhrsw m1, m3, m12
+    call m(idct_8x8_internal_10bpc).write_8x4
+    pmulhrsw m0, m4, m12
+    pmulhrsw m1, m5, m12
+    call m(idct_8x8_internal_10bpc).write_8x4
+    pmulhrsw m0, m6, m12
+    pmulhrsw m1, m7, m12
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+ALIGN function_align
+.transpose:
+    packssdw m0, m8
+    packssdw m1, m9
+    packssdw m2, m10
+    packssdw m3, m11
+    packssdw m4, m12
+    packssdw m5, m13
+    packssdw m6, m14
+    packssdw m7, m15
+    lea r6, [deint_shuf+128]
+    punpckhwd m8, m0, m1
+    punpcklwd m0, m1
+    punpckhwd m1, m2, m3
+    punpcklwd m2, m3
+    punpcklwd m3, m4, m5
+    punpckhwd m4, m5
+    punpckhwd m5, m6, m7
+    punpcklwd m6, m7
+    punpckhdq m7, m3, m6
+    punpckldq m3, m6
+    punpckhdq m6, m4, m5
+    punpckldq m4, m5
+    punpckhdq m5, m8, m1
+    punpckldq m8, m1
+    punpckhdq m1, m0, m2
+    punpckldq m0, m2
+    vperm2i128 m2, m0, m3, 0x31
+    vinserti128 m0, xm3, 1
+    vperm2i128 m3, m1, m7, 0x31
+    vinserti128 m1, xm7, 1
+    vperm2i128 m7, m5, m6, 0x31
+    vinserti128 m5, xm6, 1
+    vperm2i128 m6, m8, m4, 0x31
+    vinserti128 m4, m8, xm4, 1
+    ret
+ALIGN function_align
+.pass1_main:
+    pmulld m0, m14, [cq+32* 0]
+    pmulld m1, m14, [cq+32* 2]
+    pmulld m2, m14, [cq+32* 4]
+    pmulld m3, m14, [cq+32* 6]
+    pmulld m4, m14, [cq+32* 8]
+    pmulld m5, m14, [cq+32*10]
+    pmulld m6, m14, [cq+32*12]
+    pmulld m7, m14, [cq+32*14]
+    call m(idct_8x8_internal_10bpc).main_rect2
+    jmp m(idct_8x8_internal_10bpc).round_shift1
+ALIGN function_align
+.main_evenhalf:
+    paddd m1, m6, m7 ; idct8 out1
+    psubd m6, m7 ; idct8 out6
+    psubd m7, m0, m9 ; idct8 out7
+    paddd m0, m9 ; idct8 out0
+    paddd m2, m5, m4 ; idct8 out2
+    psubd m5, m4 ; idct8 out5
+    psubd m4, m3, m8 ; idct8 out4
+    paddd m3, m8 ; idct8 out3
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    ret
+.main_oddhalf_fast_rect2:
+    REPX {paddd x, m11}, m0, m1, m2, m3
+    REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_oddhalf_fast: ; lower half zero
+    vpbroadcastd m7, [pd_4076]
+    vpbroadcastd m8, [pd_401]
+    vpbroadcastd m6, [pd_m1189]
+    vpbroadcastd m9, [pd_3920]
+    vpbroadcastd m5, [pd_3612]
+    vpbroadcastd m10, [pd_1931]
+    vpbroadcastd m4, [pd_m2598]
+    vpbroadcastd m15, [pd_3166]
+    pmulld m7, m0
+    pmulld m0, m8
+    pmulld m6, m1
+    pmulld m1, m9
+    pmulld m5, m2
+    pmulld m2, m10
+    pmulld m4, m3
+    pmulld m3, m15
+    jmp .main_oddhalf_fast2
+.main_oddhalf_rect2:
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_oddhalf:
+    ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
+    ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
+    ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
+    ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
+.main_oddhalf_fast2:
+    REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
+    REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
+    psubd m8, m0, m4 ; t9
+    paddd m0, m4 ; t8
+    psubd m4, m6, m2 ; t10
+    paddd m2, m6 ; t11
+    psubd m6, m1, m5 ; t13
+    paddd m5, m1 ; t12
+    psubd m1, m7, m3 ; t14
+    paddd m7, m3 ; t15
+    REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
+    REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
+    vpbroadcastd m15, [pd_3784]
+    vpbroadcastd m10, [pd_1567]
+    ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
+    ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 2
+    psubd m3, m1, m4 ; t10
+    paddd m1, m4 ; t9
+    psubd m4, m0, m2 ; t11a
+    paddd m0, m2 ; t8a
+    psubd m2, m8, m6 ; t13
+    paddd m6, m8 ; t14
+    psubd m8, m7, m5 ; t12a
+    paddd m7, m5 ; t15a
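+    ; final idct16 odd-half stage: clamp, then t10a/t13a and t11/t12 are
+    ; formed as sums/differences scaled by 2896/4096 (~1/sqrt(2))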
+    REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
+    REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
+    REPX {pmulld x, m14}, m2, m8, m3, m4
+    paddd m2, m11
+    paddd m8, m11
+    paddd m5, m2, m3 ; t13a
+    psubd m2, m3 ; t10a
+    psubd m3, m8, m4 ; t11
+    paddd m4, m8 ; t12
+    REPX {psrad x, 12}, m5, m2, m3, m4
+    mova [r6-32*4], m7
+    mova [r6-32*3], m6
+    mova [r6-32*2], m5
+    mova [r6-32*1], m4
+    mova [r6+32*0], m3
+    mova [r6+32*1], m2
+    mova [r6+32*2], m1
+    mova [r6+32*3], m0
+    ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity, 35
+
+cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    vpbroadcastd m14, [pd_2896]
+    vpbroadcastd m11, [pd_2048]
+    cmp eobd, 43
+    jl .fast
+    add cq, 32
+    call .pass1_main
+    call m(iadst_8x8_internal_10bpc).main_end
+    sub cq, 32
+    mova [cq+32* 1], m0
+    mova [cq+32* 3], m1
+    mova [cq+32* 5], m2
+    mova [cq+32* 7], m3
+    mova [cq+32* 9], m4
+    mova [cq+32*11], m5
+    mova [cq+32*13], m6
+    mova m15, m7
+    call .pass1_main
+    call m(iadst_8x8_internal_10bpc).main_end
+    mova m8, [cq+32* 1]
+    mova m9, [cq+32* 3]
+    mova m10, [cq+32* 5]
+    mova m11, [cq+32* 7]
+    mova m12, [cq+32* 9]
+    mova m13, [cq+32*11]
+    mova m14, [cq+32*13]
+    jmp tx2q
+.fast:
+    call .pass1_main
+    call m(iadst_8x8_internal_10bpc).main_end
+    pxor m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+    jmp tx2q
+.pass2:
+    call m(idct_8x16_internal_10bpc).transpose
+    call m(iadst_8x16_internal_8bpc).main
+    call m(iadst_8x16_internal_8bpc).main_pass2_end
+    vpbroadcastd m8, [pw_2048]
+    vpbroadcastd xm12, [pw_4096]
+    REPX {vpermq x, x, q2031}, m0, m1, m2, m3
+    REPX {vpermq x, x, q3120}, m4, m5, m6, m7
+    psubw m12, m8
+    jmp m(idct_8x16_internal_10bpc).end
+ALIGN function_align
+.pass1_main:
+    pmulld m0, m14, [cq+32* 0]
+    pmulld m7, m14, [cq+32*14]
+    pmulld m1, m14, [cq+32* 2]
+    pmulld m6, m14, [cq+32*12]
+    pmulld m2, m14, [cq+32* 4]
+    pmulld m5, m14, [cq+32*10]
+    pmulld m3, m14, [cq+32* 6]
+    pmulld m4, m14, [cq+32* 8]
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+    jmp m(iadst_8x8_internal_10bpc).main2
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity, 35
+
+cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+%undef cmp
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    vpbroadcastd m14, [pd_2896]
+    vpbroadcastd m11, [pd_2048]
+    cmp eobd, 43
+    jl .fast
+    add cq, 32
+    call m(iadst_8x16_internal_10bpc).pass1_main
+    call m(iflipadst_8x8_internal_10bpc).main_end
+    sub cq, 32
+    mova [cq+32* 1], m0
+    mova [cq+32* 3], m1
+    mova [cq+32* 5], m2
+    mova [cq+32* 7], m3
+    mova [cq+32* 9], m4
+    mova [cq+32*11], m5
+    mova [cq+32*13], m6
+    mova m15, m7
+    call m(iadst_8x16_internal_10bpc).pass1_main
+    call m(iflipadst_8x8_internal_10bpc).main_end
+    mova m8, [cq+32* 1]
+    mova m9, [cq+32* 3]
+    mova m10, [cq+32* 5]
+    mova m11, [cq+32* 7]
+    mova m12, [cq+32* 9]
+    mova m13, [cq+32*11]
+    mova m14, [cq+32*13]
+    jmp tx2q
+.fast:
+    call m(iadst_8x16_internal_10bpc).pass1_main
+    call m(iflipadst_8x8_internal_10bpc).main_end
+    pxor m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
+    jmp tx2q
+.pass2:
+    call m(idct_8x16_internal_10bpc).transpose
+    call m(iadst_8x16_internal_8bpc).main
+    call m(iadst_8x16_internal_8bpc).main_pass2_end
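+    ; xm13 puts pw_4096 in the low 128 bits only, so m12 ends up holding
+    ; -2048 in one half and +2048 in the other: a single pmulhrsw then
+    ; both rounds and sign-flips the rows the ADST leaves negated (same
+    ; trick as the adst path above)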
+    vpbroadcastd m12, [pw_2048]
+    vpbroadcastd xm13, [pw_4096]
+    mova m11, m0
+    vpermq m0, m7, q2031
+    mova m10, m1
+    vpermq m1, m6, q2031
+    mova m9, m2
+    vpermq m2, m5, q2031
+    mova m8, m3
+    vpermq m3, m4, q2031
+    vpermq m4, m8, q3120
+    vpermq m5, m9, q3120
+    vpermq m6, m10, q3120
+    vpermq m7, m11, q3120
+    psubw m12, m13
+    jmp m(idct_8x16_internal_10bpc).end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
+    pmulhrsw m%2, m%3, m%1
+%if %0 == 4 ; if downshifting by 1
+%ifnum %4
+    pmulhrsw m%2, m%4
+%else ; without rounding
+    psraw m%2, 1
+%endif
+%else
+    paddsw m%1, m%1
+%endif
+    paddsw m%1, m%2
+%endmacro
+
+cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, stride, c, eob, tx2
+.pass1:
+    vpbroadcastd m15, [pd_2896]
+    pmulld m0, m15, [cq+32* 0]
+    pmulld m8, m15, [cq+32* 1]
+    pmulld m1, m15, [cq+32* 2]
+    pmulld m9, m15, [cq+32* 3]
+    pmulld m2, m15, [cq+32* 4]
+    pmulld m10, m15, [cq+32* 5]
+    pmulld m3, m15, [cq+32* 6]
+    pmulld m11, m15, [cq+32* 7]
+    pmulld m4, m15, [cq+32* 8]
+    pmulld m12, m15, [cq+32* 9]
+    pmulld m5, m15, [cq+32*10]
+    pmulld m13, m15, [cq+32*11]
+    pmulld m6, m15, [cq+32*12]
+    pmulld m14, m15, [cq+32*13]
+    pmulld m7, m15, [cq+32*14]
+    pmulld m15, [cq+32*15]
+    mova [cq], m7
+    vpbroadcastd m7, [pd_2048]
+    REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+                        m8, m9, m10, m11, m12, m13, m14, m15
+    paddd m7, [cq]
+    REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                        m8, m9, m10, m11, m12, m13, m14, m15
+    jmp tx2q
+.pass2:
+    packssdw m0, m8
+    packssdw m1, m9
+    packssdw m2, m10
+    packssdw m3, m11
+    packssdw m4, m12
+    packssdw m5, m13
+    packssdw m6, m14
+    packssdw m13, m7, m15
+    vpbroadcastd m8, [pw_1697x16]
+    REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 13
+    vpbroadcastd m7, [pixel_10bpc_max]
+    vpbroadcastd m12, [pw_2048]
+    call .pass2_end
+    RET
+ALIGN function_align
+.pass2_end:
+    punpckhwd m9, m0, m1
+    punpcklwd m0, m1
+    punpckhwd m1, m6, m13
+    punpcklwd m6, m13
+    punpckhwd m13, m4, m5
+    punpcklwd m4, m5
+    punpcklwd m5, m2, m3
+    punpckhwd m2, m3
+    punpckhdq m3, m0, m5
+    punpckldq m0, m5
+    punpckhdq m11, m9, m2
+    punpckldq m9, m2
+    punpckldq m2, m4, m6
+    punpckhdq m4, m6
+    punpckldq m6, m13, m1
+    punpckhdq m13, m1
+    punpckhqdq m1, m0, m2
+    punpcklqdq m0, m2
+    punpcklqdq m2, m3, m4
+    punpckhqdq m3, m4
+    punpcklqdq m8, m9, m6
+    punpckhqdq m9, m6
+    punpcklqdq m10, m11, m13
+    punpckhqdq m11, m13
+    pmulhrsw m0, m12
+    pmulhrsw m1, m12
+    call m(iidentity_8x8_internal_10bpc).write_2x8x2_start
+    pmulhrsw m0, m12, m2
+    pmulhrsw m1, m12, m3
+    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+    pmulhrsw m0, m12, m8
+    pmulhrsw m1, m12, m9
+    lea dstq, [dstq+strideq*4]
+    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+    pmulhrsw m0, m12, m10
+    pmulhrsw m1, m12, m11
+    call m(iidentity_8x8_internal_10bpc).write_2x8x2_zero
+    ret
+
+INV_TXFM_8X16_FN dct, dct, 0, 12
+INV_TXFM_8X16_FN dct, identity, 35, 12
+INV_TXFM_8X16_FN dct, adst, 0, 12
+INV_TXFM_8X16_FN dct, flipadst, 0, 12
+
+cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(idct_8x16_internal_10bpc).pass1
+.pass2:
+    lea r6, [rsp+32*4]
+    call .transpose
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    mova [cq+32* 8], m0
+    mova [cq+32*10], m2
+    mova [cq+32*12], m4
+    mova [cq+32*14], m6
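+    ; not enough registers for all 16 transposed rows: the even rows of
+    ; the top half are parked back in the coefficient buffer while the
+    ; odd half goes through main_oddhalf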
+    pmaxsd m0, m12, [cq+32* 1]
+    pmaxsd m4, m12, m1
+    pmaxsd m1, m12, [cq+32* 3]
+    pmaxsd m2, m12, [cq+32* 5]
+    pmaxsd m6, m12, m5
+    pmaxsd m5, m12, m3
+    pmaxsd m3, m12, [cq+32* 7]
+    pmaxsd m7, m12
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    vpbroadcastd m11, [pd_2048]
+    vpbroadcastd m14, [pd_2896]
+    call m(idct_8x16_internal_10bpc).main_oddhalf
+    pmaxsd m0, m12, [cq+32* 0]
+    pmaxsd m1, m12, [cq+32* 2]
+    pmaxsd m2, m12, [cq+32* 4]
+    pmaxsd m3, m12, [cq+32* 6]
+    pmaxsd m4, m12, [cq+32* 8]
+    pmaxsd m5, m12, [cq+32*10]
+    pmaxsd m6, m12, [cq+32*12]
+    pmaxsd m7, m12, [cq+32*14]
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_8x8_internal_10bpc).main
+    call m(idct_8x16_internal_10bpc).main_evenhalf
+    vpbroadcastd m11, [pd_8]
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_16x8_internal_10bpc).pass1_rotations
+    REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                       m8, m9, m10, m11, m12, m13, m14, m15
+.end:
+    packssdw m0, m1
+    packssdw m1, m2, m3
+    packssdw m2, m4, m5
+    packssdw m3, m6, m7
+    packssdw m4, m8, m9
+    packssdw m5, m10, m11
+    packssdw m6, m12, m13
+    packssdw m7, m14, m15
+    vpermq m0, m0, q3120
+    vpermq m1, m1, q3120
+    call m(idct_8x8_internal_12bpc).write_8x4_start
+    call m(idct_8x8_internal_10bpc).write_8x4
+    vpermq m0, m2, q3120
+    vpermq m1, m3, q3120
+    call m(idct_8x8_internal_10bpc).write_8x4
+    vpermq m0, m4, q3120
+    vpermq m1, m5, q3120
+    call m(idct_8x8_internal_10bpc).write_8x4
+    vpermq m0, m6, q3120
+    vpermq m1, m7, q3120
+    call m(idct_8x8_internal_10bpc).write_8x4
+    RET
+ALIGN function_align
+.transpose:
+    mova [cq+32* 8], m8
+    mova [cq+32* 9], m9
+    mova [cq+32*10], m10
+    mova [cq+32*11], m11
+    call m(idct_8x8_internal_12bpc).transpose_8x8
+    mova [cq+32* 0], m0
+    mova [cq+32* 1], m1
+    mova [cq+32* 2], m2
+    mova [cq+32* 3], m3
+    mova [cq+32* 4], m4
+    mova [cq+32* 5], m5
+    mova [cq+32* 6], m6
+    mova [cq+32* 7], m7
+    mova m0, [cq+32* 8]
+    mova m1, [cq+32* 9]
+    mova m2, [cq+32*10]
+    mova m3, [cq+32*11]
+    mova m4, m12
+    mova m5, m13
+    mova m6, m14
+    mova m7, m15
+    jmp m(idct_8x8_internal_12bpc).transpose_8x8
+
+INV_TXFM_8X16_FN adst, dct, 0, 12
+INV_TXFM_8X16_FN adst, adst, 0, 12
+INV_TXFM_8X16_FN adst, flipadst, 0, 12
+INV_TXFM_8X16_FN adst, identity, 35, 12
+
+cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iadst_8x16_internal_10bpc).pass1
+.pass2:
+    lea r6, [rsp+32*4]
+    call .pass2_main
+    call m(iadst_16x8_internal_10bpc).pass1_rotations
+.pass2_end:
+    REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15
+    REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11
+    jmp m(idct_8x16_internal_12bpc).end
+ALIGN function_align
+.pass2_main:
+    call m(idct_8x16_internal_12bpc).transpose
+    vpbroadcastd m13, [clip_18b_min]
+    vpbroadcastd m14, [clip_18b_max]
+    mova [cq+32* 8], m0
+    mova [cq+32*11], m3
+    mova [cq+32*12], m4
+    mova [cq+32*15], m7
+    pmaxsd m0, m13, [cq+32* 2] ; 2
+    pmaxsd m3, m13, m1 ; 9
+    pmaxsd m1, m13, m5 ; 13
+    pmaxsd m4, m13, m2 ; 10
+    pmaxsd m2, m13, [cq+32* 6] ; 6
+    pmaxsd m5, m13, [cq+32* 5] ; 5
+    pmaxsd m6, m13, m6 ; 14
+    pmaxsd m7, m13, [cq+32* 1] ; 1
+    REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+    vpbroadcastd m12, [pd_2048]
+    vpbroadcastd m15, [pd_2896]
+    call m(iadst_16x8_internal_10bpc).main_part1
+    pmaxsd m0, m13, [cq+32* 0] ; 0
+    pmaxsd m1, m13, [cq+32*15] ; 15
+    pmaxsd m2, m13, [cq+32* 4] ; 4
+    pmaxsd m3, m13, [cq+32*11] ; 11
+    pmaxsd m4, m13, [cq+32* 8] ; 8
+    pmaxsd m5, m13, [cq+32* 7] ; 7
+    pmaxsd m6, m13, [cq+32*12] ; 12
+    pmaxsd m7, m13, [cq+32* 3] ; 3
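+    ; the numeric tags track each source row through the register
+    ; shuffling; this second batch (0, 15, 4, 11, 8, 7, 12, 3) feeds
+    ; main_part2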
+    REPX {pminsd x, m14}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(iadst_16x8_internal_10bpc).main_part2
+    vpbroadcastd m14, [pd_17408]
+    psrld m15, 11 ; pd_1
+    psubd m13, m14, m15 ; pd_17407
+    pslld m15, 3 ; pd_8
+    ret
+
+INV_TXFM_8X16_FN flipadst, dct, 0, 12
+INV_TXFM_8X16_FN flipadst, adst, 0, 12
+INV_TXFM_8X16_FN flipadst, flipadst, 0, 12
+INV_TXFM_8X16_FN flipadst, identity, 35, 12
+
+cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iflipadst_8x16_internal_10bpc).pass1
+.pass2:
+    lea r6, [rsp+32*4]
+    call m(iadst_8x16_internal_12bpc).pass2_main
+    call m(iflipadst_16x8_internal_10bpc).pass1_rotations
+    jmp m(iadst_8x16_internal_12bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct, 0, 12
+INV_TXFM_8X16_FN identity, adst, 0, 12
+INV_TXFM_8X16_FN identity, flipadst, 0, 12
+INV_TXFM_8X16_FN identity, identity, 0, 12
+
+cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    jmp m(iidentity_8x16_internal_10bpc).pass1
+.pass2:
+    call .pass2_main
+    packssdw m0, m8
+    packssdw m1, m9
+    packssdw m2, m10
+    packssdw m3, m11
+    packssdw m4, m12
+    packssdw m5, m13
+    packssdw m6, m14
+    packssdw m13, m7, m15
+    vpbroadcastd m7, [pixel_12bpc_max]
+    vpbroadcastd m12, [pw_16384]
+    call m(iidentity_8x16_internal_10bpc).pass2_end
+    RET
+ALIGN function_align
+.pass2_main:
+    mova [cq], m7
+    vpbroadcastd m7, [clip_18b_min]
+    REPX {pmaxsd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+                         m8, m9, m10, m11, m12, m13, m14, m15
+    pmaxsd m7, [cq]
+    mova [cq], m15
+    vpbroadcastd m15, [clip_18b_max]
+    REPX {pminsd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                          m8, m9, m10, m11, m12, m13, m14
+    pminsd m15, [cq]
+    mova [cq], m7
+    vpbroadcastd m7, [pd_5793]
+    REPX {pmulld x, m7}, m0, m1, m2, m3, m4, m5, m6, \
+                         m8, m9, m10, m11, m12, m13, m14, m15
+    pmulld m7, [cq]
+    mova [cq], m15
+    vpbroadcastd m15, [pd_1024]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                         m8, m9, m10, m11, m12, m13, m14
+    paddd m15, [cq]
+    REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                        m8, m9, m10, m11, m12, m13, m14, m15
+    ret
+
+%macro INV_TXFM_16X4_FN 2-3 10 ; type1, type2, bitdepth
+    INV_TXFM_FN %1, %2, 0, 16x4, %3
+%ifidn %1_%2, dct_dct
+    vpbroadcastd m3, [dconly_%3bpc]
+%if %3 = 10
+.dconly:
+    imul r6d, [cq], 181
+    mov [cq], eobd ; 0
+    or r3d, 4
+.dconly2:
+    add r6d, 384
+    sar r6d, 9
+.dconly3:
+    imul r6d, 181
+    add r6d, 2176
+    sar r6d, 12
+    movd xm0, r6d
+    paddsw xm0, xm3
+    vpbroadcastw m0, xm0
+.dconly_loop:
+    paddsw m1, m0, [dstq+strideq*0]
+    paddsw m2, m0, [dstq+strideq*1]
+    psubusw m1, m3
+    psubusw m2, m3
+    mova [dstq+strideq*0], m1
+    mova [dstq+strideq*1], m2
+    lea dstq, [dstq+strideq*2]
+    sub r3d, 2
+    jg .dconly_loop
+    RET
+%else
+    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly
+%endif
+%endif
+%endmacro
+
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, identity
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+
+cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [clip_18b_min]
+    vpbroadcastd m9, [clip_18b_max]
+.pass1:
+    vbroadcasti128 m0, [cq+16* 0]
+    vbroadcasti128 m4, [cq+16* 4]
+    vbroadcasti128 m1, [cq+16* 2]
+    vbroadcasti128 m7, [cq+16* 6]
+    vbroadcasti128 m5, [cq+16*10]
+    vbroadcasti128 m2, [cq+16* 8]
+    vbroadcasti128 m6, [cq+16*12]
+    vbroadcasti128 m3, [cq+16*14]
+    shufpd m0, m4, 0x0c ; 0 4
+    shufpd m1, m5, 0x0c ; 2 10
+    shufpd m2, m6, 0x0c ; 8 12
+    shufpd m3, m7, 0x0c ; 14 6
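+    ; two input rows are packed per ymm (0|4, 2|10, 8|12, 14|6), so the
+    ; idct16 butterflies below process both halves at once, one pair per
+    ; 128-bit lane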
+    call .pass1_main
+    vbroadcasti128 m10, [cq+16* 1]
+    vbroadcasti128 m4, [cq+16* 5]
+    vbroadcasti128 m11, [cq+16*15]
+    vbroadcasti128 m5, [cq+16*11]
+    shufpd m10, m4, 0x0c ; 1 5
+    shufpd m11, m5, 0x0c ; 15 11
+    vbroadcasti128 m5, [cq+16* 9]
+    vbroadcasti128 m4, [cq+16*13]
+    shufpd m5, m4, 0x0c ; 9 13
+    vbroadcasti128 m6, [cq+16* 7]
+    vbroadcasti128 m4, [cq+16* 3]
+    shufpd m6, m4, 0x0c ; 7 3
+    call .pass1_main2
+    pcmpeqd m4, m4
+    REPX {psubd x, m4}, m0, m1, m2, m3
+    call .pass1_main3
+    REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
+    jmp tx2q
+.pass2:
+    call .transpose_4x16_packed
+    lea r6, [deint_shuf+128]
+    call m(idct_16x4_internal_8bpc).main
+.end:
+    vpbroadcastd m4, [pw_2048]
+    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+    vpbroadcastd m5, [pixel_10bpc_max]
+.end2:
+    paddw m0, [dstq+strideq*0]
+    paddw m1, [dstq+strideq*1]
+.end3:
+    lea r6, [dstq+strideq*2]
+    paddw m2, [r6 +strideq*0]
+    paddw m3, [r6 +strideq*1]
+    pxor m4, m4
+    REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
+    REPX {pmaxsw x, m4}, m0, m1, m2, m3
+    REPX {pminsw x, m5}, m0, m1, m2, m3
+    mova [dstq+strideq*0], m0
+    mova [dstq+strideq*1], m1
+    mova [r6 +strideq*0], m2
+    mova [r6 +strideq*1], m3
+    RET
+ALIGN function_align
+.pass1_main:
+    vpbroadcastd m7, [pd_2048]
+    call m(idct_8x4_internal_10bpc).main
+    psubd m3, m0, m4 ; idct8 out7 out6
+    paddd m0, m4 ; idct8 out0 out1
+    paddd m1, m2, m5 ; idct8 out3 out2
+    psubd m2, m5 ; idct8 out4 out5
+    ret
+ALIGN function_align
+.pass1_main2:
+    ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
+    ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
+    vbroadcasti128 m12, [pd_3784_m3784]
+    psubd m4, m10, m5
+    paddd m10, m5 ; t8 t11
+    psignd m4, m12 ; t9 t10
+    psubd m5, m11, m6
+    paddd m11, m6 ; t15 t12
+    psignd m5, m12 ; t14 t13
+    vpbroadcastd m6, [pd_1567]
+    vpbroadcastd m13, [pd_3784]
+    REPX {pmaxsd x, m8}, m5, m4
+    REPX {pminsd x, m9}, m5, m4
+    pmulld m12, m5
+    pmulld m5, m6
+    vbroadcasti128 m6, [pd_1567_m1567]
+    pmulld m13, m4
+    pmulld m4, m6
+    REPX {pmaxsd x, m8}, m10, m11, m0, m1
+    REPX {pminsd x, m9}, m10, m11, m0, m1
+    paddd m12, m7
+    paddd m5, m7
+    paddd m4, m12
+    psubd m5, m13
+    psrad m4, 12 ; t14a t10a
+    psrad m5, 12 ; t9a t13a
+    vpbroadcastd m12, [pd_2896]
+    punpckhqdq m6, m11, m5
+    punpcklqdq m11, m4
+    punpckhqdq m4, m10, m4
+    punpcklqdq m10, m5
+    psubd m5, m11, m6 ; t12a t13
+    paddd m11, m6 ; t15a t14
+    psubd m6, m10, m4 ; t11a t10
+    paddd m10, m4 ; t8a t9
+    REPX {pmaxsd x, m8}, m5, m6
+    REPX {pminsd x, m9}, m5, m6
+    pmulld m5, m12
+    pmulld m6, m12
+    REPX {pmaxsd x, m8}, m2, m3, m11, m10
+    REPX {pminsd x, m9}, m2, m3, m11, m10
+    ret
+ALIGN function_align
+.pass1_main3:
+    paddd m5, m7
+    psubd m4, m5, m6
+    paddd m5, m6
+    psrad m4, 12 ; t11 t10a
+    psrad m5, 12 ; t12 t13a
+    psubd m7, m0, m11 ; out15 out14
+    paddd m0, m11 ; out0 out1
+    psubd m6, m1, m5 ; out12 out13
+    paddd m1, m5 ; out3 out2
+    psubd m5, m2, m4 ; out11 out10
+    paddd m2, m4 ; out4 out5
+    psubd m4, m3, m10 ; out8 out9
+    paddd m3, m10 ; out7 out6
+    REPX {pshufd x, x, q1032}, m1, m3, m5, m7
+    ret
+ALIGN function_align
+.transpose_4x16_packed:
+    vbroadcasti128 m8, [deint_shuf]
+    packssdw m0, m1
+    packssdw m2, m3
+    packssdw m4, m5
+    packssdw m6, m7
+    REPX {pshufb x, m8}, m0, m2, m4, m6
+    punpckhqdq m1, m0, m2
+    punpcklqdq m0, m2
+    punpckhqdq m2, m4, m6
+    punpcklqdq m4, m6
+    vperm2i128 m3, m1, m2, 0x31
+    vinserti128 m1, xm2, 1
+    vperm2i128 m2, m0, m4, 0x31
+    vinserti128 m0, xm4, 1
+    ret
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
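+; the 4-point AV1 ADST uses the sinpi constants 1321, 2482, 3344 and
+; 3803; since 1321 + 2482 == 3803, .main below derives pd_3803 with a
+; single paddd instead of loading another constant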
+cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    call m(iadst_4x16_internal_10bpc).main
+    psrad m11, 11 ; pd_1
+    REPX {paddd x, m11}, m0, m1, m2, m3
+    paddd m4, m5, m11
+    paddd m5, m6, m11
+    paddd m6, m7, m11
+    paddd m7, m8, m11
+.pass1_end:
+    REPX {pshufd x, x, q1032}, m0, m2, m4, m6
+    REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
+    jmp tx2q
+.pass2:
+    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+    lea r6, [deint_shuf+128]
+    call m(iadst_16x4_internal_8bpc).main
+    jmp m(idct_16x4_internal_10bpc).end
+ALIGN function_align
+.main:
+    vpbroadcastd m6, [pd_1321]
+    mova m0, [cq+32*0]
+    mova m1, [cq+32*1]
+    vpbroadcastd m7, [pd_2482]
+    mova m2, [cq+32*6]
+    mova m3, [cq+32*7]
+    pmulld m4, m0, m6
+    pmulld m5, m1, m6 ; 1321*in0
+    pmulld m9, m2, m7
+    pmulld m8, m3, m7 ; 2482*in3
+    paddd m4, m9
+    paddd m8, m5 ; 1321*in0 + 2482*in3
+    pmulld m5, m0, m7
+    pmulld m9, m1, m7 ; 2482*in0
+    paddd m0, m2
+    paddd m1, m3 ; in0 + in3
+    paddd m7, m6 ; pd_3803
+    pmulld m2, m7
+    pmulld m3, m7 ; 3803*in3
+    psubd m5, m2
+    psubd m9, m3 ; 2482*in0 - 3803*in3
+    mova m2, [cq+32*4]
+    pmulld m10, m7, m2
+    pmulld m3, m6, m2
+    psubd m2, m0
+    mova m0, [cq+32*5]
+    pmulld m7, m0 ; 3803*in2
+    pmulld m6, m0 ; 1321*in2
+    psubd m0, m1 ; in2 - in0 - in3
+    vpbroadcastd m1, [pd_m3344]
+    paddd m4, m10
+    paddd m7, m8 ; t0
+    psubd m5, m3
+    psubd m9, m6 ; t1
+    pmulld m2, m1
+    pmulld m0, m1 ; t2
+    pmulld m3, m1, [cq+32*2]
+    pmulld m1, [cq+32*3] ; -t3
+    ret
+ALIGN function_align
+.main_end:
+    ; expects: m6 = rnd
+    paddd m5, m6
+    paddd m9, m6
+    paddd m10, m4, m5
+    paddd m4, m6
+    paddd m8, m7, m6
+    paddd m7, m9
+    psubd m4, m3 ; out0 (unshifted)
+    psubd m5, m3 ; out1 (unshifted)
+    paddd m2, m6 ; out2 (unshifted)
+    paddd m3, m10 ; out3 (unshifted)
+    psubd m8, m1 ; out4 (unshifted)
+    psubd m9, m1 ; out5 (unshifted)
+    paddd m6, m0 ; out6 (unshifted)
+    paddd m7, m1 ; out7 (unshifted)
+    ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    call m(iadst_4x16_internal_10bpc).main
+    psrad m11, 11 ; pd_1
+    paddd m4, m3, m11
+    paddd m3, m5, m11
+    paddd m5, m2, m11
+    paddd m2, m6, m11
+    paddd m6, m1, m11
+    paddd m1, m7, m11
+    paddd m7, m0, m11
+    paddd m0, m8, m11
+    jmp m(iadst_16x4_internal_10bpc).pass1_end
+.pass2:
+    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+    lea r6, [deint_shuf+128]
+    call m(iadst_16x4_internal_8bpc).main
+    vpbroadcastd m4, [pw_2048]
+    pmulhrsw m5, m3, m4
+    pmulhrsw m6, m2, m4
+    pmulhrsw m2, m1, m4
+    pmulhrsw m3, m0, m4
+    paddw m0, m5, [dstq+strideq*0]
+    paddw m1, m6, [dstq+strideq*1]
+    vpbroadcastd m5, [pixel_10bpc_max]
+    jmp m(idct_16x4_internal_10bpc).end3
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [pd_5793]
+    vpermq m0, [cq+32*0], q3120 ; 0 1
+    vpermq m1, [cq+32*1], q3120 ; 2 3
+    vpermq m2, [cq+32*2], q3120 ; 4 5
+    vpermq m3, [cq+32*3], q3120 ; 6 7
+    vpermq m4, [cq+32*4], q3120 ; 8 9
+    vpermq m5, [cq+32*5], q3120 ; a b
+    vpermq m6, [cq+32*6], q3120 ; c d
+    vpermq m7, [cq+32*7], q3120 ; e f
+    vpbroadcastd m9, [pd_3072]
+    REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
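+    ; x*5793 + 3072, then >>12: a rounded sqrt(2) scaling in .12 fixed
+    ; point (5793 ~= 4096*sqrt(2)) - the scale-only identity transform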
+    REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7
+    jmp tx2q
+.pass2:
+    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+    vpbroadcastd m7, [pw_1697x8]
+    pmulhrsw m4, m7, m0
+    pmulhrsw m5, m7, m1
+    pmulhrsw m6, m7, m2
+    pmulhrsw m7, m3
+    paddsw m0, m4
+    paddsw m1, m5
+    paddsw m2, m6
+    paddsw m3, m7
+    jmp m(idct_16x4_internal_10bpc).end
+
+INV_TXFM_16X4_FN dct, dct, 12
+INV_TXFM_16X4_FN dct, identity, 12
+INV_TXFM_16X4_FN dct, adst, 12
+INV_TXFM_16X4_FN dct, flipadst, 12
+
+cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [clip_20b_min]
+    vpbroadcastd m9, [clip_20b_max]
+    jmp m(idct_16x4_internal_10bpc).pass1
+.pass2:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    ; deinterleave
+    REPX {pshufd x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
+    ; transpose
+    punpcklqdq m8, m0, m1
+    punpckhqdq m0, m1
+    punpcklqdq m9, m2, m3
+    punpckhqdq m2, m3
+    punpcklqdq m10, m4, m5
+    punpckhqdq m4, m5
+    punpcklqdq m11, m6, m7
+    punpckhqdq m6, m7
+    vperm2i128 m3, m0, m2, 0x31 ; out6
+    vperm2i128 m1, m0, m2, 0x20 ; out2
+    vperm2i128 m7, m4, m6, 0x31 ; out7
+    vperm2i128 m5, m4, m6, 0x20 ; out3
+    vperm2i128 m13, m10, m11, 0x31 ; out5
+    vperm2i128 m12, m10, m11, 0x20 ; out1
+    vperm2i128 m11, m8, m9, 0x31 ; out4
+    vperm2i128 m10, m8, m9, 0x20 ; out0
+    call m(idct_4x16_internal_10bpc).pass1_main
+    pmulld m0, m6, m10
+    pmulld m2, m6, m11
+    pmulld m4, m6, m12
+    pmulld m6, m13
+    vpbroadcastd m10, [pd_17408]
+    call m(idct_4x16_internal_10bpc).pass1_main2
+    REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7
+    packssdw m0, m4
+    packssdw m1, m5
+    packssdw m2, m6
+    packssdw m3, m7
+    vpbroadcastd m5, [pixel_12bpc_max]
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+    jmp m(idct_16x4_internal_10bpc).end2
+
+INV_TXFM_16X4_FN adst, dct, 12
+INV_TXFM_16X4_FN adst, adst, 12
+INV_TXFM_16X4_FN adst, flipadst, 12
+INV_TXFM_16X4_FN adst, identity, 12
+
+cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iadst_16x4_internal_10bpc).pass1
+.pass2:
+    call .pass2_main
+    REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+    jmp m(idct_16x4_internal_10bpc).end2
+ALIGN function_align
+.pass2_main:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m6, m7
+    pmaxsd m8, m4, m12
+    pmaxsd m9, m5, m12
+    REPX {pminsd x, m13}, m0, m1, m2, m3
+    call m(iadst_8x4_internal_12bpc).transpose_4x8
+    mova [cq+32*0], m0
+    mova [cq+32*2], m1
+    mova [cq+32*4], m2
+    mova [cq+32*6], m3
+    pminsd m0, m8, m13
+    pminsd m1, m9, m13
+    pminsd m2, m6, m13
+    pminsd m3, m7, m13
+    call m(iadst_8x4_internal_12bpc).transpose_4x8
+    mova [cq+32*1], m0
+    mova [cq+32*3], m1
+    mova [cq+32*5], m2
+    mova [cq+32*7], m3
+    call m(iadst_16x4_internal_10bpc).main
+    vpbroadcastd m6, [pd_2048]
+    call m(iadst_16x4_internal_10bpc).main_end
+    psrad m0, m4, 15
+    psrad m1, m5, 15
+    psrad m2, 15
+    psrad m3, 15
+    psrad m4, m8, 15
+    psrad m5, m9, 15
+    psrad m6, 15
+    psrad m7, 15
+    packssdw m0, m4
+    packssdw m1, m5
+    packssdw m2, m6
+    packssdw m3, m7
+    vpbroadcastd m4, [pw_16384]
+    vpbroadcastd m5, [pixel_12bpc_max]
+    ret
+
+INV_TXFM_16X4_FN flipadst, dct, 12
+INV_TXFM_16X4_FN flipadst, adst, 12
+INV_TXFM_16X4_FN flipadst, flipadst, 12
+INV_TXFM_16X4_FN flipadst, identity, 12
+
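+; flipadst reuses the adst computation and only reverses the order of
+; the output rows (note the reversed vpermq destinations in .pass2)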
+cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_20b_min]
+    vpbroadcastd m13, [clip_20b_max]
+    jmp m(iflipadst_16x4_internal_10bpc).pass1
+.pass2:
+    call m(iadst_16x4_internal_12bpc).pass2_main
+    vpermq m7, m0, q3120
+    vpermq m6, m1, q3120
+    vpermq m1, m2, q3120
+    vpermq m0, m3, q3120
+    pmulhrsw m0, m4
+    pmulhrsw m1, m4
+    pmulhrsw m2, m6, m4
+    pmulhrsw m3, m7, m4
+    jmp m(idct_16x4_internal_10bpc).end2
+
+INV_TXFM_16X4_FN identity, dct, 12
+INV_TXFM_16X4_FN identity, adst, 12
+INV_TXFM_16X4_FN identity, flipadst, 12
+INV_TXFM_16X4_FN identity, identity, 12
+
+cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2
+    vpbroadcastd m8, [pd_1697]
+    vpermq m0, [cq+32*0], q3120 ; 0 1
+    vpermq m1, [cq+32*1], q3120 ; 2 3
+    vpermq m2, [cq+32*2], q3120 ; 4 5
+    vpermq m3, [cq+32*3], q3120 ; 6 7
+    vpbroadcastd m9, [pd_3072]
+    pmulld m4, m8, m0
+    pmulld m5, m8, m1
+    pmulld m6, m8, m2
+    pmulld m7, m8, m3
+    vpermq m10, [cq+32*4], q3120 ; 8 9
+    vpermq m11, [cq+32*5], q3120 ; a b
+    vpermq m12, [cq+32*6], q3120 ; c d
+    vpermq m13, [cq+32*7], q3120 ; e f
+    REPX {paddd x, m9}, m4, m5, m6, m7
+    REPX {psrad x, 12}, m4, m5, m6, m7
+    paddd m0, m4
+    pmulld m4, m8, m10
+    paddd m1, m5
+    pmulld m5, m8, m11
+    paddd m2, m6
+    pmulld m6, m8, m12
+    paddd m3, m7
+    pmulld m7, m8, m13
+    REPX {paddd x, m9}, m4, m5, m6, m7
+    REPX {psrad x, 12}, m4, m5, m6, m7
+    paddd m4, m10
+    paddd m5, m11
+    paddd m6, m12
+    paddd m7, m13
+    jmp tx2q
+.pass2:
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
+    vpbroadcastd m8, [pd_5793]
+    vpbroadcastd m9, [pd_2048]
+    REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 15}, m0, m1, m2, m3, m4, m5, m6, m7
+    call m(idct_16x4_internal_10bpc).transpose_4x16_packed
+    vpbroadcastd m4, [pw_16384]
+    REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+    vpbroadcastd m5, [pixel_12bpc_max]
+    jmp m(idct_16x4_internal_10bpc).end2
+
+%macro INV_TXFM_16X8_FN 2-3 10 ; type1, type2, bitdepth
+    INV_TXFM_FN %1, %2, 0, 16x8, %3
+%ifidn %1_%2, dct_dct
+    imul r6d, [cq], 181
+    vpbroadcastd m3, [dconly_%3bpc]
+    mov [cq], eobd ; 0
+    or r3d, 8
+    add r6d, 128
+    sar r6d, 8
+    imul r6d, 181
+    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2
+%endif
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+.pass1:
+    vpbroadcastd m14, [pd_2896]
+    pmulld m0, m14, [cq+32* 1]
+    pmulld m1, m14, [cq+32* 3]
+    pmulld m2, m14, [cq+32* 5]
+    pmulld m3, m14, [cq+32* 7]
+    pmulld m4, m14, [cq+32* 9]
+    pmulld m5, m14, [cq+32*11]
+    pmulld m6, m14, [cq+32*13]
+    pmulld m7, m14, [cq+32*15]
+    vpbroadcastd m11, [pd_2048]
+    lea r6, [rsp+32*4]
+    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+    pmulld m0, m14, [cq+32* 0]
+    pmulld m1, m14, [cq+32* 2]
+    pmulld m2, m14, [cq+32* 4]
+    pmulld m3, m14, [cq+32* 6]
+    pmulld m4, m14, [cq+32* 8]
+    pmulld m5, m14, [cq+32*10]
+    pmulld m6, m14, [cq+32*12]
+    pmulld m7, m14, [cq+32*14]
+    call m(idct_8x8_internal_10bpc).main_rect2
+    call m(idct_8x16_internal_10bpc).main_evenhalf
+    psrld m11, 11 ; pd_1
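+    ; pd_2048 >> 11 regenerates the constant 1 without another load;
+    ; adding it ahead of the >>1 gives rounded first-pass outputs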
+    REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
+    call .pass1_rotations
+    REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
+                       m8, m9, m10, m11, m12, m13, m14, m15
+    jmp tx2q
+.pass2:
+    call .transpose
+    call m(idct_16x8_internal_8bpc).main
+    vpbroadcastd m10, [pw_2048]
+.end:
+    pmulhrsw m0, m10
+    pmulhrsw m1, m10
+    pmulhrsw m2, m10
+    pmulhrsw m3, m10
+    call .write_16x4_start
+.end2:
+    pmulhrsw m0, m4, m10
+    pmulhrsw m1, m5, m10
+    pmulhrsw m2, m6, m10
+    pmulhrsw m3, m7, m10
+    call .write_16x4_zero
+    RET
+ALIGN function_align
+.pass1_rotations:
+    mova m14, [r6-32*4]
+    mova m13, [r6-32*3]
+    mova m12, [r6-32*2]
+    mova m11, [r6-32*1]
+    mova m10, [r6+32*0]
+    mova m9, [r6+32*1]
+    mova m8, [r6+32*2]
+    psubd m15, m0, m14 ; out15
+    paddd m0, m14 ; out0
+    psubd m14, m1, m13 ; out14
+    paddd m1, m13 ; out1
+    psubd m13, m2, m12 ; out13
+    paddd m2, m12 ; out2
+    psubd m12, m3, m11 ; out12
+    paddd m3, m11 ; out3
+    psubd m11, m4, m10 ; out11
+    paddd m4, m10 ; out4
+    psubd m10, m5, m9 ; out10
+    paddd m5, m9 ; out5
+    psubd m9, m6, m8 ; out9
+    paddd m6, m8 ; out6
+    psubd m8, m7, [r6+32*3] ; out8
+    paddd m7, [r6+32*3] ; out7
+    ret
+ALIGN function_align
+.transpose:
+    lea r6, [deint_shuf+128]
+.transpose2:
+    packssdw m0, m8
+    packssdw m1, m9
+    packssdw m2, m10
+    packssdw m3, m11
+    packssdw m4, m12
+    packssdw m5, m13
+    packssdw m6, m14
+    packssdw m7, m15
+.transpose3:
+    punpckhwd m8, m0, m1
+    punpcklwd m0, m1
+    punpcklwd m1, m2, m3
+    punpckhwd m2, m3
+    punpckhwd m3, m4, m5
+    punpcklwd m4, m5
+    punpckhwd m5, m6, m7
+    punpcklwd m6, m7
+    punpckhdq m7, m4, m6
+    punpckldq m4, m6
+    punpckldq m6, m8, m2
+    punpckhdq m8, m2
+    punpckhdq m2, m0, m1
+    punpckldq m0, m1
+    punpckhdq m1, m3, m5
+    punpckldq m3, m5
+    punpcklqdq m5, m6, m3
+    punpckhqdq m6, m3
+    punpckhqdq m3, m2, m7
+    punpcklqdq m2, m7
+    punpcklqdq m7, m8, m1
+    punpckhqdq m8, m1
+    punpckhqdq m1, m0, m4
+    punpcklqdq m0, m4
+    vperm2i128 m4, m0, m5, 0x31
+    vinserti128 m0, xm5, 1
+    vperm2i128 m5, m1, m6, 0x31
+    vinserti128 m1, xm6, 1
+    vperm2i128 m6, m2, m7, 0x31
+    vinserti128 m2, xm7, 1
+    vperm2i128 m7, m3, m8, 0x31
+    vinserti128 m3, xm8, 1
+    ret
+ALIGN function_align
+.write_16x4_start:
+    vpbroadcastd m9, [pixel_10bpc_max]
+    lea r3, [strideq*3]
+    pxor m8, m8
+.write_16x4_zero:
+    REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7
+    add cq, 32*8
+.write_16x4:
+    paddw m0, [dstq+strideq*0]
+    paddw m1, [dstq+strideq*1]
+    paddw m2, [dstq+strideq*2]
+    paddw m3, [dstq+r3 ]
+    REPX {pmaxsw x, m8}, m0, m1, m2, m3
+    REPX {pminsw x, m9}, m0, m1, m2, m3
+    mova [dstq+strideq*0], m0
+    mova [dstq+strideq*1], m1
+    mova [dstq+strideq*2], m2
+    mova [dstq+r3 ], m3
+    lea dstq, [dstq+strideq*4]
+    ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    vpbroadcastd m13, [clip_18b_min]
+    vpbroadcastd m14, [clip_18b_max]
+.pass1:
+    lea r6, [rsp+32*4]
+    call .main
+    vpbroadcastd m14, [pd_3072]
+    psrld m15, 11 ; pd_1
+    psubd m13, m14, m15 ; pd_3071
+    call .pass1_rotations
+.pass1_end:
+    REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15
+    REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11
+    jmp tx2q
+.pass2:
+    call m(idct_16x8_internal_10bpc).transpose
+    call m(iadst_16x8_internal_8bpc).main
+    call m(iadst_16x8_internal_8bpc).main_pass2_end
+    vpbroadcastd m10, [pw_2048]
+    pxor m11, m11
+    psubw m11, m10
+    pmulhrsw m0, m10
+    pmulhrsw m1, m11
+    pmulhrsw m2, m10
+    pmulhrsw m3, m11
+    call m(idct_16x8_internal_10bpc).write_16x4_start
+    pmulhrsw m0, m4, m10
+    pmulhrsw m1, m5, m11
+    pmulhrsw m2, m6, m10
+    pmulhrsw m3, m7, m11
+    call m(idct_16x8_internal_10bpc).write_16x4_zero
+    RET
+ALIGN function_align
+.pass1_rotations:
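+    ; even outputs just get the rounding bias added; odd outputs are
+    ; built as (bias - x), folding the ADST's sign flips into the
+    ; rounding adds before the shift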
+    paddd m0, m15
+    psubd m1, m15, m1
+    paddd m2, m15
+    psubd m3, m15, m3
+    paddd m4, m14
+    psubd m5, m13, m5
+    paddd m6, m14
+    psubd m7, m13, m7
+    paddd m8, m14, m9
+    psubd m9, m13, m10
+    paddd m10, m14, m11
+    psubd m11, m13, m12
+    paddd m12, m15, [r6-32*1]
+    psubd m13, m15, [r6-32*2]
+    paddd m14, m15, [r6-32*3]
+    psubd m15, [r6-32*4]
+    ret
+ALIGN function_align
+.main:
+    ; expects: m13 = clip_min m14 = clip_max
+    vpbroadcastd m15, [pd_2896]
+    pmulld m0, m15, [cq+32* 2]
+    pmulld m1, m15, [cq+32*13]
+    pmulld m2, m15, [cq+32* 6]
+    pmulld m3, m15, [cq+32* 9]
+    pmulld m4, m15, [cq+32*10]
+    pmulld m5, m15, [cq+32* 5]
+    pmulld m6, m15, [cq+32*14]
+    pmulld m7, m15, [cq+32* 1]
+    vpbroadcastd m12, [pd_2048]
+    REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+    call .main_part1
+    pmulld m0, m15, [cq+32* 0]
+    pmulld m1, m15, [cq+32*15]
+    pmulld m2, m15, [cq+32* 4]
+    pmulld m3, m15, [cq+32*11]
+    pmulld m4, m15, [cq+32* 8]
+    pmulld m5, m15, [cq+32* 7]
+    pmulld m6, m15, [cq+32*12]
+    pmulld m7, m15, [cq+32* 3]
+    REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+    REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
+.main_part2:
+    ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091
+    ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703
+    ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751
+    ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380
+    psubd m8, m0, m4 ; t8a
+    paddd m0, m4 ; t0a
+    psubd m4, m1, m5 ; t9a
+    paddd m1, m5 ; t1a
+    psubd m5, m2, m6 ; t12a
+    paddd m2, m6 ; t4a
+    psubd m6, m3, m7 ; t13a
+    paddd m7, m3 ; t5a
+    REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+    REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
+    vpbroadcastd m11, [pd_4017]
+    vpbroadcastd m10, [pd_799]
+    ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
+    ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
+    psubd m3, m0, m2 ; t4
+    paddd m0, m2 ; t0
+    psubd m2, m1, m7 ; t5
+    paddd m1, m7 ; t1
+    psubd m7, m4, m6 ; t12a
+    paddd m4, m6 ; t8a
+    psubd m6, m8, m5 ; t13a
+    paddd m5, m8 ; t9a
+    REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+    REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5
+    vpbroadcastd m11, [pd_3784]
+    vpbroadcastd m10, [pd_1567]
+    ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11
+    ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11
+    pminsd m10, m14, [r6-32*4] ; t2
+    pminsd m8, m14, [r6-32*3] ; t3
+    psubd m9, m0, m10 ; t2a
+    paddd m0, m10 ; out0
+    psubd m10, m1, m8 ; t3a
+    paddd m1, m8 ; -out15
+    pmaxsd m9, m13
+    pmaxsd m10, m13
+    pminsd m9, m14
+    pminsd m10, m14
+    mova [r6-32*4], m1
+    mova m11, [r6-32*1] ; t7a
+    mova m1, [r6-32*2] ; t6a
+    psubd m8, m3, m11 ; t7
+    paddd m11, m3 ; out12
+    paddd m3, m2, m1 ; -out3
+    psubd m2, m1 ; t6
+    pmaxsd m8, m13
+    pmaxsd m2, m13
+    pminsd m8, m14
+    pminsd m2, m14
+    mova [r6-32*1], m11
+    mova [r6-32*3], m2
+    mova m1, [r6+32*3] ; t15
+    mova m2, [r6+32*2] ; t14
+    paddd m12, m7, m1 ; -out13
+    psubd m7, m1 ; t15a
+    psubd m11, m6, m2 ; t14a
+    paddd m2, m6 ; out2
+    pmaxsd m7, m13
+    pmaxsd m11, m13
+    pminsd m7, m14
+    pminsd m11, m14
+    mova [r6-32*2], m12
+    pminsd m1, m14, [r6+32*0] ; t10a
+    pminsd m12, m14, [r6+32*1] ; t11a
+    psubd m6, m4, m1 ; t10
+    paddd m1, m4 ; -out1
+    psubd m4, m5, m12 ; t11
+    paddd m5, m12 ; out14
+    vpbroadcastd m12, [pd_1448]
+    pmaxsd m6, m13
+    pmaxsd m4, m13
+    pminsd m6, m14
+    pminsd m4, m14
+    REPX {pmulld x, m12}, m9, m10, m8, m7, m11, m6, m4
+    pmulld m12, [r6-32*3] ; t6
+    mova [r6-32*3], m5
+    paddd m5, m11, m7 ; -out5 (unshifted)
+    psubd m11, m7 ; out10 (unshifted)
+    paddd m7, m9, m10 ; -out7 (unshifted)
+    psubd m9, m10 ; out8 (unshifted)
+    psubd m10, m6, m4 ; -out9 (unshifted)
+    paddd m6, m4 ; out6 (unshifted)
+    paddd m4, m12, m8 ; out4 (unshifted)
+    psubd m12, m8 ; -out11 (unshifted)
+    ret
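+    ; main_part1/main_part2 split the 16-point ADST in two; the outputs
+    ; tagged '(unshifted)' above still carry the x1448 (~2048/sqrt(2))
+    ; factor, which the callers remove with a biased shift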
+.main_part1:
+    ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973
+    ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290
+    ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106
+    ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601
+    psubd m8, m0, m4 ; t10a
+    paddd m0, m4 ; t2a
+    psubd m4, m1, m5 ; t11a
+    paddd m1, m5 ; t3a
+    psubd m5, m2, m6 ; t14a
+    paddd m2, m6 ; t6a
+    psubd m6, m3, m7 ; t15a
+    paddd m7, m3 ; t7a
+    REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
+    REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
+    vpbroadcastd m11, [pd_2276]
+    vpbroadcastd m10, [pd_3406]
+    ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
+    ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
+    psubd m3, m0, m2 ; t6
+    paddd m0, m2 ; t2
+    psubd m2, m1, m7 ; t7
+    paddd m1, m7 ; t3
+    psubd m7, m4, m6 ; t14a
+    paddd m4, m6 ; t10a
+    psubd m6, m8, m5 ; t15a
+    paddd m5, m8 ; t11a
+    REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
+    REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later
+    vpbroadcastd m11, [pd_1567]
+    vpbroadcastd m10, [pd_3784]
+    ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11
+    ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11
+    mova [r6-32*4], m0
+    mova [r6-32*3], m1
+    mova [r6+32*0], m4
+    mova [r6+32*1], m5
+    mova [r6-32*2], m2
+    mova [r6-32*1], m3
+    mova [r6+32*2], m6
+    mova [r6+32*3], m7
+    ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+    vpbroadcastd m13, [clip_18b_min]
+    vpbroadcastd m14, [clip_18b_max]
+.pass1:
+    lea r6, [rsp+32*4]
+    call m(iadst_16x8_internal_10bpc).main
+    vpbroadcastd m14, [pd_3072]
+    psrld m15, 11
+    psubd m13, m14, m15
+    call .pass1_rotations
+    jmp m(iadst_16x8_internal_10bpc).pass1_end
+.pass2:
+    call m(idct_16x8_internal_10bpc).transpose
+    call m(iadst_16x8_internal_8bpc).main
+    call m(iadst_16x8_internal_8bpc).main_pass2_end
+    vpbroadcastd m10, [pw_2048]
+    pxor m11, m11
+    psubw m11, m10
+    mova m12, m0
+    pmulhrsw m0, m7, m11
+    mova m7, m1
+    pmulhrsw m1, m6, m10
+    mova m6, m2
+    pmulhrsw m2, m5, m11
+    mova m5, m3
+    pmulhrsw m3, m4, m10
+    call m(idct_16x8_internal_10bpc).write_16x4_start
+    pmulhrsw m0, m5, m11
+    pmulhrsw m1, m6, m10
+    pmulhrsw m2, m7, m11
+    pmulhrsw m3, m12, m10
+    call m(idct_16x8_internal_10bpc).write_16x4_zero
+    RET
+ALIGN function_align
+.pass1_rotations:
+    psubd m8, m13, m7
+    paddd m7, m14, m9
+    paddd m9, m14, m6
+    psubd m6, m13, m10
+    psubd m10, m13, m5
+    paddd m5, m14, m11
+    paddd m11, m14, m4
+    psubd m4, m13, m12
+    psubd m12, m15, m3
+    paddd m3, m15, [r6-32*1]
+    paddd m13, m15, m2
+    psubd m2, m15, [r6-32*2]
+    psubd m14, m15, m1
+    mova m1, m15
+    paddd m15, m0
+    psubd m0, m1, [r6-32*4]
+    paddd m1, [r6-32*3]
+    ret
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
+.pass1:
+    vpbroadcastd m15, [pd_2896]
+    pmulld m0, m15, [cq+32* 0]
+    pmulld m1, m15, [cq+32* 1]
+    pmulld m2, m15, [cq+32* 2]
+    pmulld m3, m15, [cq+32* 3]
+    pmulld m4, m15, [cq+32* 4]
+    pmulld m5, m15, [cq+32* 5]
+    pmulld m6, m15, [cq+32* 6]
+    pmulld m7, m15, [cq+32* 7]
+    pmulld m8, m15, [cq+32* 8]
+    pmulld m9, m15, [cq+32* 9]
+    pmulld m10, m15, [cq+32*10]
+    pmulld m11, m15, [cq+32*11]
+    pmulld m12, m15, [cq+32*12]
+    pmulld m13, m15, [cq+32*13]
+    pmulld m14, m15, [cq+32*14]
+    pmulld m15, [cq+32*15]
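+    ; all sixteen ymm registers hold row data, so one register is
+    ; bounced through the stack whenever a broadcast constant is needed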
m15, [cq+32*15] + mova [rsp], m7 + vpbroadcastd m7, [pd_2048] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + mova [rsp], m15 + vpbroadcastd m15, [pd_5793] + REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14 + pmulld m15, [rsp] + mova [rsp], m7 + vpbroadcastd m7, [pd_3072] + REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [rsp] + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_16x8_internal_10bpc).transpose + vpbroadcastd m10, [pw_4096] + jmp m(idct_16x8_internal_10bpc).end + +INV_TXFM_16X8_FN dct, dct, 12 +INV_TXFM_16X8_FN dct, identity, 12 +INV_TXFM_16X8_FN dct, adst, 12 +INV_TXFM_16X8_FN dct, flipadst, 12 + +cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct_16x8_internal_10bpc).pass1 +.pass2: + call .pass2_main + RET +ALIGN function_align +.pass2_main: + call m(idct_8x16_internal_12bpc).transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m11, [pd_2048] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_12bpc).round_shift4 + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + pmaxsd m0, m12, [cq+32*0] + pmaxsd m1, m12, [cq+32*1] + pmaxsd m2, m12, [cq+32*2] + pmaxsd m3, m12, [cq+32*3] + pmaxsd m4, m12, [cq+32*4] + pmaxsd m5, m12, [cq+32*5] + pmaxsd m6, m12, [cq+32*6] + pmaxsd m7, m12, [cq+32*7] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x8_internal_12bpc).round_shift4 +.end: + packssdw m0, [cq+32* 8] + packssdw m1, [cq+32* 9] + packssdw m2, [cq+32*10] + packssdw m3, [cq+32*11] + packssdw m4, [cq+32*12] + packssdw m5, [cq+32*13] + packssdw m6, [cq+32*14] + packssdw m7, [cq+32*15] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + call .write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpermq m0, m4, q3120 + vpermq m1, m5, q3120 + vpermq m2, m6, q3120 + vpermq m3, m7, q3120 + jmp m(idct_16x8_internal_10bpc).write_16x4_zero +ALIGN function_align +.write_16x4_start: + vpbroadcastd m9, [pixel_12bpc_max] + lea r3, [strideq*3] + pxor m8, m8 + ret + +INV_TXFM_16X8_FN adst, dct, 12 +INV_TXFM_16X8_FN adst, adst, 12 +INV_TXFM_16X8_FN adst, flipadst, 12 +INV_TXFM_16X8_FN adst, identity, 12 + +cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iadst_16x8_internal_10bpc).pass1 +.pass2: + call .pass2_main + call m(idct_16x8_internal_12bpc).end + RET +ALIGN function_align +.pass2_main: + call m(idct_8x16_internal_12bpc).transpose + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m11, [pd_2048] + REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_8x8_internal_12bpc).pass2_main2 + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova 
[cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + pmaxsd m0, m12, [cq+32*0] + pmaxsd m1, m12, [cq+32*1] + pmaxsd m2, m12, [cq+32*2] + pmaxsd m3, m12, [cq+32*3] + pmaxsd m4, m12, [cq+32*4] + pmaxsd m5, m12, [cq+32*5] + pmaxsd m6, m12, [cq+32*6] + pmaxsd m7, m12, [cq+32*7] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(iadst_8x8_internal_12bpc).pass2_main2 + ret + +INV_TXFM_16X8_FN flipadst, dct, 12 +INV_TXFM_16X8_FN flipadst, adst, 12 +INV_TXFM_16X8_FN flipadst, flipadst, 12 +INV_TXFM_16X8_FN flipadst, identity, 12 + +cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iflipadst_16x8_internal_10bpc).pass1 +.pass2: + call m(iadst_16x8_internal_12bpc).pass2_main + packssdw m13, m0, [cq+32* 8] + packssdw m12, m1, [cq+32* 9] + packssdw m11, m2, [cq+32*10] + packssdw m10, m3, [cq+32*11] + packssdw m3, m4, [cq+32*12] + packssdw m2, m5, [cq+32*13] + packssdw m1, m6, [cq+32*14] + packssdw m0, m7, [cq+32*15] + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpermq m0, m10, q3120 + vpermq m1, m11, q3120 + vpermq m2, m12, q3120 + vpermq m3, m13, q3120 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET + +INV_TXFM_16X8_FN identity, dct, 12 +INV_TXFM_16X8_FN identity, adst, 12 +INV_TXFM_16X8_FN identity, flipadst, 12 +INV_TXFM_16X8_FN identity, identity, 12 + +cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 + jmp m(iidentity_16x8_internal_10bpc).pass1 +.pass2: + call m(idct_16x8_internal_10bpc).transpose2 + vpbroadcastd m10, [pw_4096] + pmulhrsw m0, m10 + pmulhrsw m1, m10 + pmulhrsw m2, m10 + pmulhrsw m3, m10 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + jmp m(idct_16x8_internal_10bpc).end2 + +%macro INV_TXFM_16X16_FN 2-4 0,10 ; type1, type2, eob_offset, bitdepth + INV_TXFM_FN %1, %2, %3, 16x16, %4 +%ifidn %1_%2, dct_dct + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_%4bpc] + mov [cq], eobd ; 0 + or r3d, 16 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3 +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, identity, 28 +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst + +cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] +.pass1: + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + mova m10, [r6-32*4] + mova m9, [r6-32*3] + mova m8, [r6-32*2] + psubd m15, m0, m10 ; out15 + paddd m0, m10 ; out0 + psubd m10, m1, m9 ; out14 + paddd m1, m9 ; out1 + psubd m9, m2, m8 ; out13 + paddd m2, m8 ; out2 + REPX {psrad x, 2}, m0, m1, m2 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova m2, [r6-32*1] + mova m1, [r6+32*0] + mova m0, [r6+32*1] + REPX {psrad x, 2}, m9, m10, m15 + psubd m8, m3, m2 ; out12 + paddd m3, m2 ; out3 + psubd m2, m4, m1 ; out11 + paddd m4, m1 ; out4 + psubd m1, m5, m0 ; out10 + paddd m5, m0 ; out5 + REPX {psrad x, 2}, m3, m4, m5 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova m4, [r6+32*2] + mova m3, [r6+32*3] + REPX {psrad x, 2}, m1, m2, m8 + psubd m5, m6, m4 ; out9 + paddd m6, m4 ; out6 + psubd m4, m7, m3 ; out8 + paddd m7, m3 ; out7 + REPX {psrad x, 2}, m6, m7, m4, m5 + mova 
[r6+32*2], m6 + mova [r6+32*3], m7 + add r6, 32*8 + mova [r6-32*4], m4 + mova [r6-32*3], m5 + mova [r6-32*2], m1 + mova [r6-32*1], m2 + mova [r6+32*0], m8 + mova [r6+32*1], m9 + mova [r6+32*2], m10 + mova [r6+32*3], m15 +.fast: + add r6, 32*8 + call .main + mova m14, [r6-32*4] + mova m13, [r6-32*3] + mova m12, [r6-32*2] + mova m11, [r6-32*1] + mova m10, [r6+32*0] + mova m9, [r6+32*1] + mova m8, [r6+32*2] + psubd m15, m0, m14 ; out15 + paddd m0, m14 ; out0 + psubd m14, m1, m13 ; out14 + paddd m1, m13 ; out1 + psubd m13, m2, m12 ; out13 + paddd m2, m12 ; out2 + psubd m12, m3, m11 ; out12 + paddd m3, m11 ; out3 + psubd m11, m4, m10 ; out11 + paddd m4, m10 ; out4 + psubd m10, m5, m9 ; out10 + paddd m5, m9 ; out5 + psubd m9, m6, m8 ; out9 + paddd m6, m8 ; out6 + psubd m8, m7, [r6+32*3] ; out8 + paddd m7, [r6+32*3] ; out7 + sub r6, 32*8 + REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call .transpose + lea r6, [pw_5+128] + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + mova m1, [rsp+32*1] +.end: + call .write_16x16 + RET +ALIGN function_align +.write_16x16: + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [pw_2048] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct_16x8_internal_10bpc).write_16x4_start +.write_16x16_2: + pmulhrsw m0, m12, m4 + pmulhrsw m1, m12, m5 + pmulhrsw m2, m12, m6 + pmulhrsw m3, m12, m7 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*0] + pmulhrsw m1, m12, [rsp+gprsize+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m12, m11 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+gprsize+32*2] + pmulhrsw m1, m12, m13 + pmulhrsw m2, m12, m14 + pmulhrsw m3, m12, m15 + jmp m(idct_16x8_internal_10bpc).write_16x4_zero +ALIGN function_align +.transpose: + test eobd, eobd + jl .transpose_fast + packssdw m8, [r6-32*4] + packssdw m9, [r6-32*3] + packssdw m10, [r6-32*2] + packssdw m11, [r6-32*1] + packssdw m12, [r6+32*0] + packssdw m13, [r6+32*1] + packssdw m14, [r6+32*2] + packssdw m15, [r6+32*3] + sub r6, 32*8 + packssdw m0, [r6-32*4] + packssdw m1, [r6-32*3] + packssdw m2, [r6-32*2] + packssdw m3, [r6-32*1] + packssdw m4, [r6+32*0] + packssdw m5, [r6+32*1] + packssdw m6, [r6+32*2] + packssdw m7, [r6+32*3] + mova [r6], m8 + punpckhwd m8, m0, m1 + punpcklwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpckhwd m3, m6, m7 + punpcklwd m6, m7 + punpcklwd m7, m4, m5 + punpckhwd m4, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m1 + punpckldq m0, m1 + punpckhdq m1, m7, m6 + punpckldq m7, m6 + punpckhdq m6, m4, m3 + punpckldq m4, m3 + punpckhqdq m3, m2, m1 + punpcklqdq m2, m1 + punpckhqdq m1, m0, m7 + punpcklqdq m0, m7 + punpcklqdq m7, m8, m6 + punpckhqdq m8, m6 + punpckhqdq m6, m5, m4 + punpcklqdq m5, m4 + mova m4, [r6] + mova [r6], m8 + punpcklwd m8, m4, m9 + punpckhwd m4, m9 + punpcklwd m9, m10, m11 + punpckhwd m10, m11 + punpckhwd m11, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m12, m13 + punpcklwd m12, m13 + punpckldq m13, m4, m10 + punpckhdq m4, m10 + punpckhdq m10, m8, m9 + punpckldq m8, m9 + punpckhdq m9, m12, m14 + punpckldq m12, m14 + punpckhdq m14, m15, m11 + punpckldq m15, m11 + punpckhqdq m11, m10, m9 + punpcklqdq m10, m9 + punpckhqdq m9, m8, m12 + punpcklqdq m8, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m4, m14 + punpcklqdq m14, m4, m14 + vperm2i128 m4, m0, m8, 0x31 + vinserti128 m0, 
xm8, 1 + vinserti128 m8, m5, xm12, 1 + vperm2i128 m12, m5, 0x13 + vperm2i128 m5, m1, m9, 0x31 + vinserti128 m1, xm9, 1 + vinserti128 m9, m6, xm13, 1 + vperm2i128 m13, m6, 0x13 + vperm2i128 m6, m2, m10, 0x31 + vinserti128 m2, xm10, 1 + vinserti128 m10, m7, xm14, 1 + vperm2i128 m14, m7, 0x13 + vperm2i128 m7, m3, m11, 0x31 + vinserti128 m3, xm11, 1 + mova xm11, [r6] + vinserti128 m11, xm15, 1 + vinserti128 m15, [r6+16], 0 + ret +.transpose_fast: + call m(idct_16x8_internal_10bpc).transpose2 + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + ret +ALIGN function_align +.main: + mova m0, [cq+64* 1] + mova m1, [cq+64* 3] + mova m2, [cq+64* 5] + mova m3, [cq+64* 7] + mova m4, [cq+64* 9] + mova m5, [cq+64*11] + mova m6, [cq+64*13] + mova m7, [cq+64*15] + call m(idct_8x16_internal_10bpc).main_oddhalf + mova m0, [cq+64* 0] + mova m1, [cq+64* 2] + mova m2, [cq+64* 4] + mova m3, [cq+64* 6] + mova m4, [cq+64* 8] + mova m5, [cq+64*10] + mova m6, [cq+64*12] + mova m7, [cq+64*14] + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + psrld m10, m11, 10 ; pd_2 + REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + vpbroadcastd m15, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call .main + sub cq, 32 + vpbroadcastd m8, [pd_5120] + paddd m4, m8 + paddd m6, m8 + paddd m9, m8 + paddd m11, m8 + vpbroadcastd m8, [pd_5119] + psubd m5, m8, m5 + psubd m7, m8, m7 + psubd m10, m8, m10 + psubd m12, m8, m12 + REPX {psrad x, 13}, m4, m5, m6, m7, m9, m10, m11, m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + psrld m4, m15, 10 ; pd_2 + paddd m0, m4 + psubd m1, m4, m1 + paddd m2, m4 + psubd m3, m4, m3 + psubd m7, m4, [r6-32*4] + paddd m6, m4, [r6-32*3] + psubd m5, m4, [r6-32*2] + paddd m4, [r6-32*1] + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + add r6, 32*8 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova [r6-32*2], m11 + mova [r6-32*1], m12 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 +.fast: + add r6, 32*8 + call .main + vpbroadcastd m14, [pd_5120] + vpbroadcastd m13, [pd_5119] + psrld m15, 10 ; pd_2 + paddd m0, m15 + psubd m1, m15, m1 + paddd m2, m15 + psubd m3, m15, m3 + paddd m4, m14 + psubd m5, m13, m5 + paddd m6, m14 + psubd m7, m13, m7 + paddd m8, m14, m9 + psubd m9, m13, m10 + paddd m10, m14, m11 + psubd m11, m13, m12 + paddd m12, m15, [r6-32*1] + psubd m13, m15, [r6-32*2] + paddd m14, m15, [r6-32*3] + psubd m15, [r6-32*4] +.pass1_end: + REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11 + sub r6, 32*8 + jmp tx2q +.pass2: + call m(idct_16x16_internal_10bpc).transpose + lea r6, [pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end + mova [rsp+32*0], m8 + mova [rsp+32*2], m12 + mova [rsp+32*3], m13 + vpbroadcastd m12, [pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m12 + pmulhrsw m1, m13, [rsp+32*1] + mova [rsp+32*1], m9 + pmulhrsw m2, m12 + pmulhrsw m3, m13 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m12, m4 + pmulhrsw m1, m13, m5 + pmulhrsw m2, m12, m6 
+ pmulhrsw m3, m13, m7 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*0] + pmulhrsw m1, m13, [rsp+32*1] + pmulhrsw m2, m12, m10 + pmulhrsw m3, m13, m11 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m12, [rsp+32*2] + pmulhrsw m1, m13, [rsp+32*3] + pmulhrsw m2, m12, m14 + pmulhrsw m3, m13, m15 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET +ALIGN function_align +.main: + mova m0, [cq+64* 2] + mova m1, [cq+64*13] + mova m2, [cq+64* 6] + mova m3, [cq+64* 9] + mova m4, [cq+64*10] + mova m5, [cq+64* 5] + mova m6, [cq+64*14] + mova m7, [cq+64* 1] + vpbroadcastd m12, [pd_2048] + call m(iadst_16x8_internal_10bpc).main_part1 + mova m0, [cq+64* 0] + mova m1, [cq+64*15] + mova m2, [cq+64* 4] + mova m3, [cq+64*11] + mova m4, [cq+64* 8] + mova m5, [cq+64* 7] + mova m6, [cq+64*12] + mova m7, [cq+64* 3] + jmp m(iadst_16x8_internal_10bpc).main_part2 + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] +.pass1: + vpbroadcastd m15, [pd_2896] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + add cq, 32 + call m(iadst_16x16_internal_10bpc).main + sub cq, 32 + vpbroadcastd m8, [pd_5120] + paddd m11, m8 + paddd m9, m8 + paddd m6, m8 + paddd m4, m8 + vpbroadcastd m8, [pd_5119] + psubd m12, m8, m12 + psubd m10, m8, m10 + psubd m7, m8, m7 + psubd m5, m8, m5 + REPX {psrad x, 13}, m12, m11, m10, m9, m7, m6, m5, m4 + mova [r6+32*0], m12 + mova [r6+32*1], m11 + mova [r6+32*2], m10 + mova [r6+32*3], m9 + psrld m9, m15, 10 ; pd_2 + psubd m3, m9, m3 + paddd m2, m9 + psubd m1, m9, m1 + paddd m0, m9 + psubd m12, m9, [r6-32*4] + paddd m11, m9, [r6-32*3] + psubd m10, m9, [r6-32*2] + paddd m9, [r6-32*1] + REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0 + mova [r6-32*4], m12 + mova [r6-32*3], m11 + mova [r6-32*2], m10 + mova [r6-32*1], m9 + add r6, 32*8 + mova [r6-32*4], m7 + mova [r6-32*3], m6 + mova [r6-32*2], m5 + mova [r6-32*1], m4 + mova [r6+32*0], m3 + mova [r6+32*1], m2 + mova [r6+32*2], m1 + mova [r6+32*3], m0 +.fast: + add r6, 32*8 + call m(iadst_16x16_internal_10bpc).main + vpbroadcastd m14, [pd_5120] + vpbroadcastd m13, [pd_5119] + psrld m15, 10 ; pd_2 + psubd m8, m13, m7 + paddd m7, m14, m9 + paddd m9, m14, m6 + psubd m6, m13, m10 + psubd m10, m13, m5 + paddd m5, m14, m11 + paddd m11, m14, m4 + psubd m4, m13, m12 + psubd m12, m15, m3 + paddd m3, m15, [r6-32*1] + paddd m13, m15, m2 + psubd m2, m15, [r6-32*2] + psubd m14, m15, m1 + mova m1, m15 + paddd m15, m0 + psubd m0, m1, [r6-32*4] + paddd m1, [r6-32*3] + jmp m(iadst_16x16_internal_10bpc).pass1_end +.pass2: + call m(idct_16x16_internal_10bpc).transpose + lea r6, [pw_5+128] + mova [rsp], m15 + call m(iadst_16x16_internal_8bpc).main + call m(iadst_16x16_internal_8bpc).main_pass2_end + mova [rsp+32*3], m3 + mova [rsp+32*2], m2 + mova [rsp+32*0], m0 + mova m2, m13 + mova m3, m12 + vpbroadcastd m12, [pw_2048] + pxor m13, m13 + psubw m13, m12 + pmulhrsw m0, m13, m15 + pmulhrsw m1, m12, m14 + pmulhrsw m2, m13 + pmulhrsw m3, m12 + mova m14, m8 + mova m15, m9 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m13, m11 + pmulhrsw m1, m12, m10 + pmulhrsw m2, m13, m15 + pmulhrsw m3, m12, m14 + call m(idct_16x8_internal_10bpc).write_16x4_zero + pmulhrsw m0, m13, m7 + pmulhrsw m1, m12, m6 + pmulhrsw m2, m13, m5 + pmulhrsw m3, m12, m4 + call m(idct_16x8_internal_10bpc).write_16x4_zero + 
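+ ; last 4 rows: outputs spilled to [rsp+32*0..3] earlier in this pass, written with the same alternating -/+pw_2048 rounding as the rows above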
pmulhrsw m0, m13, [rsp+32*3] + pmulhrsw m1, m12, [rsp+32*2] + pmulhrsw m2, m13, [rsp+32*1] + pmulhrsw m3, m12, [rsp+32*0] + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET + +INV_TXFM_16X16_FN identity, dct, -92 +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m15, [pd_5793] + vpbroadcastd m7, [pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + pmulld m0, m15, [cq+r3+32*33] + pmulld m1, m15, [cq+r3+32*35] + pmulld m2, m15, [cq+r3+32*37] + pmulld m3, m15, [cq+r3+32*39] + add r6, 32*4 + REPX {paddd x, m7}, m0, m1, m2, m3 + REPX {psrad x, 13}, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + pmulld m0, m15, [cq+64* 0] + pmulld m1, m15, [cq+64* 1] + pmulld m2, m15, [cq+64* 2] + pmulld m3, m15, [cq+64* 3] + pmulld m4, m15, [cq+64* 4] + pmulld m5, m15, [cq+64* 5] + pmulld m6, m15, [cq+64* 6] + pmulld m8, m15, [cq+64* 7] + mova [cq], m8 + pmulld m8, m15, [cq+64* 8] + pmulld m9, m15, [cq+64* 9] + pmulld m10, m15, [cq+64*10] + pmulld m11, m15, [cq+64*11] + pmulld m12, m15, [cq+64*12] + pmulld m13, m15, [cq+64*13] + pmulld m14, m15, [cq+64*14] + pmulld m15, [cq+64*15] + REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \ + m8, m9, m10, m11, m12, m13, m14, m15 + paddd m7, [cq] + REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + jmp tx2q +.pass2: + call m(idct_16x16_internal_10bpc).transpose + + mova [cq+32*0], m15 + mova [cq+32*1], m0 + vpbroadcastd m15, [pw_1697x16] + + REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \ + 8, 9, 10, 11, 12, 13, 14 + mova m0, [cq+32*1] + mova [cq+32*1], m1 + IDTX16 0, 1, 15 + mova m1, [cq+32*0] + pmulhrsw m15, m1 + paddsw m1, m1 + paddsw m15, m1 + mova m1, [cq+32*1] + jmp m(idct_16x16_internal_10bpc).end + +INV_TXFM_16X16_FN dct, dct, 0, 12 +INV_TXFM_16X16_FN dct, identity, 28, 12 +INV_TXFM_16X16_FN dct, adst, 0, 12 +INV_TXFM_16X16_FN dct, flipadst, 0, 12 + +cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + jmp m(idct_16x16_internal_10bpc).pass1 +.pass2: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + mova [cq+32*12], m12 + mova [cq+32*13], m13 + mova [cq+32*14], m14 + mova [cq+32*15], m15 + call .pass2_main + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] + mov r5, r6 + add r6, 32*16 + call .pass2_main + jmp m(iadst_16x16_internal_12bpc).end +ALIGN function_align +.write_16x16: + mova [rsp+gprsize+32*0], m8 + mova [rsp+gprsize+32*1], m9 + mova [rsp+gprsize+32*2], m12 + vpbroadcastd m12, [pw_16384] + pmulhrsw m0, m12 + pmulhrsw m1, m12 + pmulhrsw m2, m12 + pmulhrsw m3, m12 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + jmp m(idct_16x16_internal_10bpc).write_16x16_2 +ALIGN function_align +.pass2_main: + call m(idct_8x8_internal_12bpc).transpose_8x8 
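+ ; after the dword transpose: m0/m2/m4/m6 are stashed in the coef buffer and m1/m3/m5/m7 are clipped to the 18-bit intermediate range before the idct16 odd half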
+ mova [cq+32* 0], m0 + mova [cq+32* 1], m2 + mova [cq+32* 2], m4 + mova [cq+32* 3], m6 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, m1 + pmaxsd m1, m12, m3 + pmaxsd m2, m12, m5 + pmaxsd m3, m12, m7 + REPX {pminsd x, m13}, m0, m1, m2, m3 + test eobd, eobd + jge .pass2_slow + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + jmp .pass2_fast +.pass2_slow: + sub r6, 32*8 + mova m8, [r6-32*4] + mova m4, [r6-32*3] + mova m10, [r6-32*2] + mova m5, [r6-32*1] + mova m12, [r6+32*0] + mova m6, [r6+32*1] + mova m14, [r6+32*2] + mova m7, [r6+32*3] + TRANSPOSE_8X8_DWORD 8, 4, 10, 5, 12, 6, 14, 7, 9, 11, 13, 15 + mova [cq+32* 4], m8 + mova [cq+32* 5], m10 + mova [cq+32* 6], m12 + mova [cq+32* 7], m14 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + REPX {pmaxsd x, m12}, m4, m5, m6, m7 + REPX {pminsd x, m13}, m4, m5, m6, m7 +.pass2_fast: + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + call m(idct_8x16_internal_10bpc).main_oddhalf + pmaxsd m0, m12, [cq+32* 0] + pmaxsd m1, m12, [cq+32* 1] + pmaxsd m2, m12, [cq+32* 2] + pmaxsd m3, m12, [cq+32* 3] + REPX {pminsd x, m13}, m0, m1, m2, m3 + test eobd, eobd + jge .pass2_slow2 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + jmp .pass2_fast2 +.pass2_slow2: + pmaxsd m4, m12, [cq+32* 4] + pmaxsd m5, m12, [cq+32* 5] + pmaxsd m6, m12, [cq+32* 6] + pmaxsd m7, m12, [cq+32* 7] + REPX {pminsd x, m13}, m4, m5, m6, m7 +.pass2_fast2: + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + psrad m11, 8 ; pd_8 + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_16x8_internal_10bpc).pass1_rotations + REPX {psrad x, 4}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m8, m9, m10, m11, m12, m13, m14, m15 + ret + +INV_TXFM_16X16_FN adst, dct, 0, 12 +INV_TXFM_16X16_FN adst, adst, 0, 12 +INV_TXFM_16X16_FN adst, flipadst, 0, 12 + +cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iadst_16x16_internal_10bpc).pass1 +.pass2: + call .pass2_part1 + call m(iadst_16x8_internal_10bpc).pass1_rotations + call .pass2_part2 + call m(iadst_16x8_internal_10bpc).pass1_rotations +.pass2_part3: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 +.end: + packssdw m15, m14 + packssdw m14, m13, m12 + packssdw m13, m11, m10 + packssdw m12, m9, m8 + packssdw m11, m7, m6 + packssdw m10, m5, m4 + packssdw m7, m3, m2 + packssdw m6, m1, m0 + vpblendd m0, m6, [r5-32*4], 0x33 + vpblendd m1, m6, [r5-32*4], 0xcc + vpblendd m2, m7, [r5-32*3], 0x33 + vpblendd m3, m7, [r5-32*3], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_12bpc).write_16x4_start + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpblendd m0, m10, [r5-32*2], 0x33 + vpblendd m1, m10, [r5-32*2], 0xcc + vpblendd m2, m11, [r5-32*1], 0x33 + vpblendd m3, m11, [r5-32*1], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpblendd m0, m12, [r5+32*0], 0x33 + vpblendd m1, m12, [r5+32*0], 0xcc + vpblendd m2, m13, [r5+32*1], 0x33 + vpblendd m3, m13, [r5+32*1], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_10bpc).write_16x4_zero + vpblendd m0, m14, [r5+32*2], 0x33 + vpblendd m1, m14, [r5+32*2], 0xcc + vpblendd m2, m15, [r5+32*3], 
0x33 + vpblendd m3, m15, [r5+32*3], 0xcc + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + call m(idct_16x8_internal_10bpc).write_16x4_zero + RET +ALIGN function_align +.pass2_part1: + mova [cq+32* 8], m8 + mova [cq+32* 9], m9 + mova [cq+32*10], m10 + mova [cq+32*11], m11 + mova [cq+32*12], m12 + mova [cq+32*13], m13 + mova [cq+32*14], m14 + mova [cq+32*15], m15 +.pass2_main: + call m(idct_8x8_internal_12bpc).transpose_8x8 + mova [cq+32* 0], m0 + mova [cq+32* 1], m3 + mova [cq+32* 2], m4 + mova [cq+32* 3], m7 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + pmaxsd m0, m13, m2 + pmaxsd m2, m13, m6 + pmaxsd m5, m13, m5 + pmaxsd m7, m13, m1 + REPX {pminsd x, m14}, m0, m2, m5, m7 + test eobd, eobd + jge .pass2_slow + pxor m1, m1 + REPX {mova x, m1}, m3, m4, m6 + jmp .pass2_fast +.pass2_slow: + sub r6, 32*8 + mova m8, [r6-32*4] + mova m3, [r6-32*3] + mova m4, [r6-32*2] + mova m11, [r6-32*1] + mova m12, [r6+32*0] + mova m1, [r6+32*1] + mova m6, [r6+32*2] + mova m15, [r6+32*3] + TRANSPOSE_8X8_DWORD 8, 3, 4, 11, 12, 1, 6, 15, 13, 9, 10, 14 + mova [cq+32* 4], m8 + mova [cq+32* 5], m11 + mova [cq+32* 6], m12 + mova [cq+32* 7], m15 + vpbroadcastd m13, [clip_18b_min] + vpbroadcastd m14, [clip_18b_max] + REPX {pmaxsd x, m13}, m1, m3, m4, m6 + REPX {pminsd x, m14}, m1, m3, m4, m6 +.pass2_fast: + vpbroadcastd m12, [pd_2048] + vpbroadcastd m15, [pd_2896] + call m(iadst_16x8_internal_10bpc).main_part1 + pmaxsd m0, m13, [cq+32* 0] ; 0 + pmaxsd m7, m13, [cq+32* 1] ; 3 + pmaxsd m2, m13, [cq+32* 2] ; 4 + pmaxsd m5, m13, [cq+32* 3] ; 7 + REPX {pminsd x, m14}, m0, m2, m5, m7 + test eobd, eobd + jge .pass2_slow2 + pxor m1, m1 + REPX {mova x, m1}, m3, m4, m6 + jmp .pass2_fast2 +.pass2_slow2: + pmaxsd m4, m13, [cq+32* 4] ; 8 + pmaxsd m3, m13, [cq+32* 5] ; 11 + pmaxsd m6, m13, [cq+32* 6] ; 12 + pmaxsd m1, m13, [cq+32* 7] ; 15 + REPX {pminsd x, m14}, m1, m3, m4, m6 +.pass2_fast2: + call m(iadst_16x8_internal_10bpc).main_part2 + vpbroadcastd m14, [pd_17408] + psrld m15, 11 ; pd_1 + psubd m13, m14, m15 ; pd_17407 + pslld m15, 3 ; pd_8 + ret +ALIGN function_align +.pass2_part2: + REPX {psrad x, 4 }, m0, m1, m2, m3, m12, m13, m14, m15 + REPX {psrad x, 15}, m4, m5, m6, m7, m8, m9, m10, m11 + packssdw m0, m1 + packssdw m1, m2, m3 + packssdw m2, m4, m5 + packssdw m3, m6, m7 + packssdw m4, m8, m9 + packssdw m5, m10, m11 + packssdw m6, m12, m13 + packssdw m7, m14, m15 + mova [r6-32*4], m0 + mova [r6-32*3], m1 + mova [r6-32*2], m2 + mova [r6-32*1], m3 + mova [r6+32*0], m4 + mova [r6+32*1], m5 + mova [r6+32*2], m6 + mova [r6+32*3], m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] + mov r5, r6 + add r6, 32*16 + jmp .pass2_main + +INV_TXFM_16X16_FN flipadst, dct, 0, 12 +INV_TXFM_16X16_FN flipadst, adst, 0, 12 +INV_TXFM_16X16_FN flipadst, flipadst, 0, 12 + +cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m13, [clip_20b_min] + vpbroadcastd m14, [clip_20b_max] + jmp m(iflipadst_16x16_internal_10bpc).pass1 +.pass2: + call m(iadst_16x16_internal_12bpc).pass2_part1 + call m(iflipadst_16x8_internal_10bpc).pass1_rotations + call m(iadst_16x16_internal_12bpc).pass2_part2 + call m(iflipadst_16x8_internal_10bpc).pass1_rotations + jmp m(iadst_16x16_internal_12bpc).pass2_part3 + +INV_TXFM_16X16_FN identity, dct, -92, 12 +INV_TXFM_16X16_FN identity, identity, 0, 12 + +%macro IDTX16_12BPC 1 ; src + 
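+ ; m%1 = (m%1 + ((m%1*1697 + 5120) >> 12)) >> 1 (caller sets m7=pd_1697, m15=pd_5120); ~2*sqrt(2) identity16 scaling with the pass-1 >>2 and rounding folded in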
pmulld m6, m7, m%1 + paddd m6, m15 + psrad m6, 12 + paddd m6, m%1 + psrad m%1, m6, 1 +%endmacro + +cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2 + vpbroadcastd m7, [pd_1697] + vpbroadcastd m15, [pd_5120] + lea r6, [rsp+32*4] + sub eobd, 36 + jl .fast + mov r3, -32*8*4 +.righthalf: + mova m10, [cq+r3+32*33] + mova m11, [cq+r3+32*35] + mova m12, [cq+r3+32*37] + mova m13, [cq+r3+32*39] + add r6, 32*4 + pmulld m0, m7, m10 + pmulld m1, m7, m11 + pmulld m2, m7, m12 + pmulld m3, m7, m13 + REPX {paddd x, m15}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 + paddd m0, m10 + paddd m1, m11 + paddd m2, m12 + paddd m3, m13 + REPX {psrad x, 1 }, m0, m1, m2, m3 + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + add r3, 32*8 + jl .righthalf +.fast: + mova m0, [cq+64* 0] + mova m1, [cq+64* 1] + mova m2, [cq+64* 2] + mova m3, [cq+64* 3] + mova m4, [cq+64* 4] + mova m5, [cq+64* 5] + mova m8, [cq+64* 6] + mova m9, [cq+64* 7] + REPX {IDTX16_12BPC x}, 0, 1, 2, 3, 4, 5, 8, 9 + mova [cq+64*0], m8 + mova [cq+64*1], m9 + mova m8, [cq+64* 8] + mova m9, [cq+64* 9] + mova m10, [cq+64*10] + mova m11, [cq+64*11] + mova m12, [cq+64*12] + mova m13, [cq+64*13] + mova m14, [cq+64*14] + REPX {IDTX16_12BPC x}, 8, 9, 10, 11, 12, 13, 14 + mova m6, [cq+64*15] + pmulld m7, m6 + paddd m7, m15 + psrad m7, 12 + paddd m7, m6 + mova m6, [cq+64*0] + psrad m15, m7, 1 + mova m7, [cq+64*1] + jmp tx2q +.pass2: + call m(iidentity_8x16_internal_12bpc).pass2_main + call m(idct_16x16_internal_10bpc).transpose_fast + test eobd, eobd + jl .pass2_fast + mova [cq+32* 8], m0 + mova [cq+32* 9], m1 + mova [cq+32*10], m2 + mova [cq+32*11], m3 + mova [cq+32*12], m4 + mova [cq+32*13], m5 + mova [cq+32*14], m6 + mova [cq+32*15], m7 + mova m8, [r6-32*4] + mova m9, [r6-32*3] + mova m10, [r6-32*2] + mova m11, [r6-32*1] + mova m12, [r6+32*0] + mova m13, [r6+32*1] + mova m14, [r6+32*2] + mova m15, [r6+32*3] + sub r6, 32*8 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + mova m2, [r6-32*2] + mova m3, [r6-32*1] + mova m4, [r6+32*0] + mova m5, [r6+32*1] + mova m6, [r6+32*2] + mova m7, [r6+32*3] + call m(iidentity_8x16_internal_12bpc).pass2_main + call m(idct_16x8_internal_10bpc).transpose2 + mova m8, m0 + mova m9, m1 + mova m10, m2 + mova m11, m3 + mova m12, m4 + mova m13, m5 + mova m14, m6 + mova m15, m7 + mova m0, [cq+32* 8] + mova m1, [cq+32* 9] + mova m2, [cq+32*10] + mova m3, [cq+32*11] + mova m4, [cq+32*12] + mova m5, [cq+32*13] + mova m6, [cq+32*14] + mova m7, [cq+32*15] +.pass2_fast: + call m(idct_16x16_internal_12bpc).write_16x16 + RET + +%macro IDCT32_END 6-7 1 ; in/out1, out2, tmp[1-3], shift, pack + mova m%4, [r6+32*(%1-4)] + mova m%2, [r5+32*(3-%1)] + mova m%5, [r4+32*(%1-4)] + psubd m%3, m%1, m%4 ; idct16 out15 - n + paddd m%1, m%4 ; idct16 out0 + n + pmaxsd m%1, m12 + pmaxsd m%3, m12 + pminsd m%1, m13 + pminsd m%3, m13 + paddd m%1, m11 + paddd m%3, m11 + psubd m%4, m%1, m%2 ; out31 - n + paddd m%1, m%2 ; out0 + n + paddd m%2, m%3, m%5 ; out15 - n + psubd m%3, m%5 ; out16 + n + REPX {psrad x, %6}, m%1, m%3, m%2, m%4 +%if %7 & 1 + packssdw m%1, m%3 ; out0 + n, out16 + n + packssdw m%2, m%4 ; out15 - n, out31 - n +%endif +%endmacro + +cglobal inv_txfm_add_dct_dct_8x32_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vbroadcasti128 m14, [idct32_shuf] + mov r4, cq + call .pass1_main + mova [rsp+32*0], m2 
+ mova [rsp+32*1], m3 + cmp eobd, 43 + jge .eob43 + pxor m4, m4 + REPX {mova x, m4}, [rsp+32*2], m2, m3, m11 + jmp .pass1_end_fast +.eob43: + lea r6, [rsp+32*8] + mova [r6-32*4], m0 + mova [r6-32*3], m1 + call .pass1_main + mova [rsp+32*2], m2 + cmp eobd, 107 + jge .eob107 + mova m11, m3 + mova m2, m0 + mova m3, m1 + mova m0, [r6-32*4] + mova m1, [r6-32*3] + pxor m4, m4 +.pass1_end_fast: + vpbroadcastd m10, [pw_2048] + lea r6, [deint_shuf+128] + REPX {mova x, m4}, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast + jmp .end +.eob107: + mova [rsp+32*3], m3 + mova [r6-32*2], m0 + mova [r6-32*1], m1 + call .pass1_main + cmp eobd, 171 + jge .eob171 + pshufd m12, m2, q1032 + pshufd m13, m3, q1032 + mova m4, m0 + mova m5, m1 + pxor m6, m6 + REPX {mova x, m6}, m7, m14, m15 + jmp .pass1_end +.eob171: + mova [r6+32*0], m0 + mova [r6+32*1], m1 + mova [r6+32*2], m2 + mova [r6+32*3], m3 + call .pass1_main + pshufd m12, [r6+32*2], q1032 ; out19 out17 + pshufd m13, [r6+32*3], q1032 ; out23 out21 + mova m4, [r6+32*0] ; out16 out18 + mova m5, [r6+32*1] ; out20 out22 + pshufd m14, m2, q1032 ; out27 out25 + pshufd m15, m3, q1032 ; out31 out29 + mova m6, m0 ; out24 out26 + mova m7, m1 ; out28 out30 +.pass1_end: + mova m0, [r6-32*4] ; out0 out2 + mova m1, [r6-32*3] ; out4 out6 + mova m2, [r6-32*2] ; out8 out10 + mova m3, [r6-32*1] ; out12 out14 + lea r6, [deint_shuf+128] + mova m11, [rsp+32*3] ; out13 out15 + vpbroadcastd m10, [pw_2048] + call m(inv_txfm_add_dct_dct_8x32_8bpc).main +.end: ; [rsp+0*32] = m12 + vpbroadcastd m12, [pw_2048] + mov cq, r4 + mova [rsp+32*1], m8 + mova [rsp+32*2], m9 + mova [rsp+32*3], m10 + mova [rsp+32*4], m11 + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4_start + vpermq m0, m2, q3120 + vpermq m1, m3, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m5, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m6, q3120 + vpermq m1, m7, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [rsp+32*1], q3120 + vpermq m1, [rsp+32*2], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [rsp+32*3], q3120 + vpermq m1, [rsp+32*4], q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [rsp+32*0], q3120 + vpermq m1, m13, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m14, q3120 + vpermq m1, m15, q2031 + pmulhrsw m0, m12 + pmulhrsw m1, m12 + call m(idct_8x8_internal_10bpc).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_10bpc] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +ALIGN function_align +.pass1_main_part1: + mova m0, [cq+128*0] + mova m1, [cq+128*1] + mova m2, [cq+128*2] + mova m3, [cq+128*3] + mova m4, [cq+128*4] + mova m5, [cq+128*5] + mova m6, [cq+128*6] + mova m7, [cq+128*7] + call m(idct_8x8_internal_10bpc).main + psrld m1, m11, 10 ; pd_2 + REPX {paddd x, m1}, m0, m6, m5, m3 + paddd m1, m6, m7 ; out1 + psubd m6, m7 ; out6 + psubd m7, m0, m9 ; out7 + paddd m0, m9 ; out0 + paddd m2, m5, m4 ; out2 + psubd m5, m4 ; out5 + psubd m4, m3, m8 ; out4 + paddd m3, m8 ; out3 + REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7 + ret +ALIGN function_align +.pass1_main: + call .pass1_main_part1 + add 
cq, 32 + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + packssdw m6, m7 + pshufb m0, m14 + pshufb m2, m14 + pshufb m4, m14 + pshufb m6, m14 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + vperm2i128 m1, m0, m2, 0x31 ; 4 6 + vinserti128 m0, xm2, 1 ; 0 2 + vinserti128 m2, m3, xm4, 1 ; 1 3 + vperm2i128 m3, m4, 0x31 ; 5 7 + ret +.main_oddhalf_part1_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part1_fast: ; lower half zero + vpbroadcastd m7, [pd_4091] + vpbroadcastd m8, [pd_201] + vpbroadcastd m6, [pd_m1380] + vpbroadcastd m9, [pd_3857] + vpbroadcastd m5, [pd_3703] + vpbroadcastd m10, [pd_1751] + vpbroadcastd m4, [pd_m2751] + vpbroadcastd m15, [pd_3035] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part1_fast2 +.main_oddhalf_part1_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31 + ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a + ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a +.main_oddhalf_part1_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t17 + paddd m0, m4 ; t16 + psubd m4, m6, m2 ; t18 + paddd m6, m2 ; t19 + psubd m2, m1, m5 ; t29 + paddd m1, m5 ; t28 + psubd m5, m7, m3 ; t30 + paddd m7, m3 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [pd_4017] + vpbroadcastd m10, [pd_799] + ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a + ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 2 ; t29a, t18a + psubd m3, m0, m6 ; t19a + paddd m0, m6 ; t16a + psubd m6, m7, m1 ; t28a + paddd m7, m1 ; t31a + psubd m1, m5, m4 ; t18 + paddd m5, m4 ; t17 + psubd m4, m8, m2 ; t29 + paddd m8, m2 ; t30 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a + ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28 + mova [r6-32*4], m0 + mova [r6-32*3], m5 + mova [r6-32*2], m4 + mova [r6-32*1], m6 + mova [r6+32*0], m3 + mova [r6+32*1], m1 + mova [r6+32*2], m8 + mova [r6+32*3], m7 + ret +.main_oddhalf_part2_fast_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3 + REPX {psrad x, 12 }, m0, m1, m2, m3 +.main_oddhalf_part2_fast: ; lower half zero + vpbroadcastd m7, [pd_m601] + vpbroadcastd m8, [pd_4052] + vpbroadcastd m6, [pd_3973] + vpbroadcastd m9, [pd_995] + vpbroadcastd m5, [pd_m2106] + vpbroadcastd m10, [pd_3513] + vpbroadcastd m4, [pd_3290] + vpbroadcastd m15, [pd_2440] + pmulld m7, m0 + pmulld m0, m8 + pmulld m6, m1 + pmulld m1, m9 + pmulld m5, m2 + pmulld m2, m10 + pmulld m4, m3 + pmulld m3, m15 + jmp .main_oddhalf_part2_fast2 +.main_oddhalf_part2_rect2: + REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7 + REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7 +.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29 + ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a + ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a + ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2D 3, 4, 
8, 9, 10, _, 2440, 3290 ; t22a, t25a +.main_oddhalf_part2_fast2: + REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3 + REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3 + psubd m8, m0, m4 ; t25 + paddd m0, m4 ; t24 + psubd m4, m6, m2 ; t26 + paddd m6, m2 ; t27 + psubd m2, m1, m5 ; t21 + paddd m1, m5 ; t20 + psubd m5, m7, m3 ; t22 + paddd m7, m3 ; t23 + REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7 + REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7 + vpbroadcastd m15, [pd_2276] + vpbroadcastd m10, [pd_3406] + ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a + ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 2 ; t25a, t22a + psubd m3, m0, m6 ; t27a + paddd m0, m6 ; t24a + psubd m6, m7, m1 ; t20a + paddd m7, m1 ; t23a + psubd m1, m5, m4 ; t21 + paddd m5, m4 ; t22 + psubd m4, m8, m2 ; t26 + paddd m8, m2 ; t25 + REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8 + REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8 + vpbroadcastd m15, [pd_3784] + vpbroadcastd m10, [pd_1567] + ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 2 ; t26a, t21a + ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 2 ; t27, t20 + mova m9, [r6-32*4] ; t16a + mova m10, [r6-32*3] ; t17 + psubd m2, m9, m7 ; t23 + paddd m9, m7 ; t16 + psubd m7, m10, m5 ; t22a + paddd m10, m5 ; t17a + REPX {pmaxsd x, m12}, m9, m10, m2, m7 + REPX {pminsd x, m13}, m9, m10, m2, m7 + mova [r6-32*4], m9 + mova [r6-32*3], m10 + mova m9, [r6-32*2] ; t18a + mova m10, [r6-32*1] ; t19 + psubd m5, m9, m1 ; t21 + paddd m9, m1 ; t18 + psubd m1, m10, m6 ; t20a + paddd m10, m6 ; t19a + REPX {pmaxsd x, m12}, m9, m10, m5, m1 + REPX {pminsd x, m13}, m9, m10, m5, m1 + mova [r6-32*2], m9 + mova [r6-32*1], m10 + mova m9, [r6+32*0] ; t28 + mova m10, [r6+32*1] ; t29a + psubd m6, m9, m3 ; t27a + paddd m9, m3 ; t28a + psubd m3, m10, m4 ; t26 + paddd m10, m4 ; t29 + REPX {pmaxsd x, m12}, m9, m10, m6, m3 + REPX {pminsd x, m13}, m9, m10, m6, m3 + REPX {pmulld x, m14}, m6, m3, m1, m5 + paddd m6, m11 + paddd m3, m11 + psubd m4, m6, m1 ; t20 + paddd m6, m1 ; t27 + psubd m1, m3, m5 ; t21a + paddd m3, m5 ; t26a + REPX {psrad x, 12 }, m4, m1, m3, m6 + mova [r6+32*0], m4 + mova [r6+32*1], m1 + mova m4, [r6+32*2] ; t30 + mova m1, [r6+32*3] ; t31a + psubd m5, m4, m8 ; t25a + paddd m4, m8 ; t30a + psubd m8, m1, m0 ; t24 + paddd m1, m0 ; t31 + REPX {pmaxsd x, m12}, m8, m5, m4, m1 + REPX {pminsd x, m13}, m8, m5, m4, m1 + REPX {pmulld x, m14}, m5, m8, m7, m2 + paddd m5, m11 + paddd m8, m11 + psubd m0, m5, m7 ; t22 + paddd m5, m7 ; t25 + psubd m7, m8, m2 ; t23a + paddd m2, m8 ; t24a + REPX {psrad x, 12 }, m0, m7, m2, m5 + mova [r6+32*2], m0 + mova [r6+32*3], m7 + mov r4, r6 + add r6, 32*8 + mova [r6-32*4], m2 + mova [r6-32*3], m5 + mova [r6-32*2], m3 + mova [r6-32*1], m6 + mova [r6+32*0], m9 + mova [r6+32*1], m10 + mova [r6+32*2], m4 + mova [r6+32*3], m1 + mov r5, r6 + add r6, 32*8 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; pd_2 + IDCT32_END 0, 15, 8, 9, 10, 2 + IDCT32_END 1, 14, 8, 9, 10, 2 + punpckhwd m8, m0, m1 ; 16 17 + punpcklwd m0, m1 ; 0 1 + punpcklwd m1, m14, m15 ; 14 15 + punpckhwd m14, m15 ; 30 31 + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 2 + IDCT32_END 3, 14, 8, 9, 10, 2 + punpckhwd m8, m2, m3 ; 18 19 + punpcklwd m2, m3 ; 2 3 + punpcklwd m3, m14, m15 ; 12 13 + punpckhwd m14, m15 ; 28 29 + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 2 + IDCT32_END 5, 14, 8, 9, 10, 2 + punpckhwd m8, m4, m5 ; 20 21 + punpcklwd m4, m5 ; 4 5 + punpcklwd m5, m14, m15 ; 10 11 + punpckhwd m14, m15 ; 26 27 + mova [r5-32*1], m8 + 
mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 2 + IDCT32_END 7, 14, 8, 9, 10, 2 + punpckhwd m8, m6, m7 ; 22 23 + punpcklwd m6, m7 ; 6 7 + punpcklwd m7, m14, m15 ; 8 9 + punpckhwd m14, m15 ; 24 25 + mova [r5-32*3], m8 + mova [r5-32*4], m14 +.transpose: + punpckhdq m15, m3, m1 + punpckldq m3, m1 + punpckhdq m1, m4, m6 + punpckldq m4, m6 + punpckhdq m6, m0, m2 + punpckldq m0, m2 + punpckhdq m2, m7, m5 + punpckldq m7, m5 + punpcklqdq m5, m2, m15 + punpckhqdq m2, m15 + punpckhqdq m15, m7, m3 + punpcklqdq m7, m3 + punpckhqdq m3, m6, m1 + punpcklqdq m6, m1 + punpckhqdq m1, m0, m4 + punpcklqdq m0, m4 + vperm2i128 m4, m0, m7, 0x31 + vinserti128 m0, xm7, 1 + vperm2i128 m7, m3, m2, 0x31 + vinserti128 m3, xm2, 1 + vinserti128 m2, m6, xm5, 1 + vperm2i128 m6, m5, 0x31 + vperm2i128 m5, m1, m15, 0x31 + vinserti128 m1, xm15, 1 + ret + +cglobal inv_txfm_add_identity_identity_8x32_10bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_5] + pxor m6, m6 + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192 + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: + mova m0, [cq+128*0] + packssdw m0, [cq+128*1] + mova m1, [cq+128*2] + packssdw m1, [cq+128*3] + mova m2, [cq+128*4] + packssdw m2, [cq+128*5] + mova m3, [cq+128*6] + packssdw m3, [cq+128*7] + REPX {paddsw x, m5}, m0, m1, m2, m3 + REPX {psraw x, 3 }, m0, m1, m2, m3 + call .main_zero + add cq, 32 + lea dstq, [dstq+strideq*8] + sub eobd, 64 + jge .loop + RET +ALIGN function_align +.main_zero: + REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 +.main: + punpckhwd m4, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m2, m3 + punpcklwd m2, m3 + punpckhwd m3, m0, m4 + punpcklwd m0, m4 + punpckhwd m4, m2, m1 + punpcklwd m2, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + mova xm4, [dstq+strideq*0] + vinserti128 m4, [dstq+strideq*4], 1 + paddw m0, m4 + mova xm4, [dstq+strideq*1] + vinserti128 m4, [dstq+r5 ], 1 + paddw m1, m4 + mova xm4, [dstq+strideq*2] + vinserti128 m4, [dstq+r6*2 ], 1 + paddw m2, m4 + mova xm4, [dstq+r6 ] + vinserti128 m4, [dstq+r4 ], 1 + paddw m3, m4 + REPX {pmaxsw x, m6}, m0, m1, m2, m3 + REPX {pminsw x, m7}, m0, m1, m2, m3 + mova [dstq+strideq*0], xm0 + vextracti128 [dstq+strideq*4], m0, 1 + mova [dstq+strideq*1], xm1 + vextracti128 [dstq+r5 ], m1, 1 + mova [dstq+strideq*2], xm2 + vextracti128 [dstq+r6*2 ], m2, 1 + mova [dstq+r6 ], xm3 + vextracti128 [dstq+r4 ], m3, 1 + ret + +cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + mov r4, cq + lea r6, [rsp+32*4] + call .pass1_main + cmp eobd, 43 + jge .eob43 + jmp .pass2_fast +.eob43: + call .pass1_main + cmp eobd, 107 + jge .eob107 +.pass2_fast: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast + pmaxsd m0, m12, 
[cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + call m(idct_8x16_internal_10bpc).main_oddhalf_fast + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + REPX {pminsd x, m13}, m0, m1, m2, m3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + jmp .pass2_end +.eob107: + call .pass1_main + cmp eobd, 171 + jge .eob171 + jmp .pass2 +.eob171: + call .pass1_main +.pass2: + mov cq, r4 + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + pmaxsd m0, m12, [cq+128*1+ 0] + pmaxsd m1, m12, [cq+128*7+ 0] + pmaxsd m2, m12, [cq+128*1+32] + pmaxsd m3, m12, [cq+128*7+32] + pmaxsd m4, m12, [cq+128*1+64] + pmaxsd m5, m12, [cq+128*7+64] + pmaxsd m6, m12, [cq+128*1+96] + pmaxsd m7, m12, [cq+128*7+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 + pmaxsd m0, m12, [cq+128*3+ 0] + pmaxsd m1, m12, [cq+128*5+ 0] + pmaxsd m2, m12, [cq+128*3+32] + pmaxsd m3, m12, [cq+128*5+32] + pmaxsd m4, m12, [cq+128*3+64] + pmaxsd m5, m12, [cq+128*5+64] + pmaxsd m6, m12, [cq+128*3+96] + pmaxsd m7, m12, [cq+128*5+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 + pmaxsd m0, m12, [cq+128*2+ 0] + pmaxsd m1, m12, [cq+128*6+ 0] + pmaxsd m2, m12, [cq+128*2+32] + pmaxsd m3, m12, [cq+128*6+32] + pmaxsd m4, m12, [cq+128*2+64] + pmaxsd m5, m12, [cq+128*6+64] + pmaxsd m6, m12, [cq+128*2+96] + pmaxsd m7, m12, [cq+128*6+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x16_internal_10bpc).main_oddhalf + pmaxsd m0, m12, [cq+128*0+ 0] + pmaxsd m1, m12, [cq+128*4+ 0] + pmaxsd m2, m12, [cq+128*0+32] + pmaxsd m3, m12, [cq+128*4+32] + pmaxsd m4, m12, [cq+128*0+64] + pmaxsd m5, m12, [cq+128*4+64] + pmaxsd m6, m12, [cq+128*0+96] + pmaxsd m7, m12, [cq+128*4+96] + REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf +.pass2_end: + psrld m11, 8 ; pd_8 + IDCT32_END 0, 15, 8, 9, 10, 4 + IDCT32_END 1, 14, 8, 9, 10, 4 + punpckhqdq m8, m0, m1 ; 16 17 (interleaved) + punpcklqdq m0, m1 ; 0 1 (interleaved) + punpcklqdq m1, m14, m15 ; 14 15 (interleaved) + punpckhqdq m14, m15 ; 30 31 (interleaved) + mova [r5+32*3], m8 + mova [r5+32*2], m14 + IDCT32_END 2, 15, 8, 9, 10, 4 + IDCT32_END 3, 14, 8, 9, 10, 4 + punpckhqdq m8, m2, m3 ; 18 19 (interleaved) + punpcklqdq m2, m3 ; 2 3 (interleaved) + punpcklqdq m3, m14, m15 ; 12 13 (interleaved) + punpckhqdq m14, m15 ; 28 29 (interleaved) + mova [r5+32*1], m8 + mova [r5+32*0], m14 + IDCT32_END 4, 15, 8, 9, 10, 4 + IDCT32_END 5, 14, 8, 9, 10, 4 + punpckhqdq m8, m4, m5 ; 20 21 (interleaved) + punpcklqdq m4, m5 ; 4 5 (interleaved) + punpcklqdq m5, m14, m15 ; 10 11 (interleaved) + punpckhqdq m14, m15 ; 26 27 (interleaved) + mova [r5-32*1], m8 + mova [r5-32*2], m14 + IDCT32_END 6, 15, 8, 9, 10, 4 + IDCT32_END 7, 14, 8, 9, 10, 4 + punpckhqdq m8, m6, m7 ; 22 23 (interleaved) + punpcklqdq m6, m7 ; 6 7 (interleaved) + punpcklqdq m7, m14, m15 ; 8 9 (interleaved) + punpckhqdq m14, m15 ; 24 25 (interleaved) + mova [r5-32*3], m8 + mova [r5-32*4], m14 + mova m15, m1 +.end: + vpermq m0, m0, q3120 + vpermq m1, m2, q3120 + call m(idct_8x8_internal_12bpc).write_8x4_start + call 
m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m4, q3120 + vpermq m1, m6, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m7, q3120 + vpermq m1, m5, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, m3, q3120 + vpermq m1, m15, q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5+32*3], q3120 + vpermq m1, [r5+32*1], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5-32*1], q3120 + vpermq m1, [r5-32*3], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5-32*4], q3120 + vpermq m1, [r5-32*2], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + vpermq m0, [r5+32*0], q3120 + vpermq m1, [r5+32*2], q3120 + call m(idct_8x8_internal_10bpc).write_8x4 + RET +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m2, [dconly_12bpc] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 640 + sar r6d, 10 + jmp m(inv_txfm_add_dct_dct_8x8_10bpc).dconly3 +ALIGN function_align +.pass1_main: + call m(inv_txfm_add_dct_dct_8x32_10bpc).pass1_main_part1 + TRANSPOSE_8X8_DWORD 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15 + mova [cq+128*0], m0 + mova [cq+128*1], m1 + mova [cq+128*2], m2 + mova [cq+128*3], m3 + mova [cq+128*4], m4 + mova [cq+128*5], m5 + mova [cq+128*6], m6 + mova [cq+128*7], m7 + add cq, 32 + ret +ALIGN function_align +.main_end: + psrld m11, 10 ; pd_2 + IDCT32_END 0, 15, 8, 9, 10, 2, 0 + mova [cq+32*16], m8 + mova [cq+32*31], m9 + IDCT32_END 1, 14, 8, 9, 10, 2, 0 + mova [cq+32*17], m8 + mova [cq+32*30], m9 + mova [cq+32*14], m14 + IDCT32_END 2, 14, 8, 9, 10, 2, 0 + mova [cq+32*18], m8 + mova [cq+32*29], m9 + mova [cq+32*13], m14 + IDCT32_END 3, 14, 8, 9, 10, 2, 0 + mova [cq+32*19], m8 + mova [cq+32*28], m9 + mova [cq+32*12], m14 + IDCT32_END 4, 14, 8, 9, 10, 2, 0 + mova [cq+32*20], m8 + mova [cq+32*27], m9 + mova [cq+32* 0], m0 + mova [cq+32* 1], m1 + mova [cq+32* 2], m2 + IDCT32_END 5, 10, 0, 1, 2, 2, 0 + mova [cq+32*21], m0 + mova [cq+32*26], m1 + IDCT32_END 6, 9, 0, 1, 2, 2, 0 + mova [cq+32*22], m0 + mova [cq+32*25], m1 + IDCT32_END 7, 8, 0, 1, 2, 2, 0 + mova [cq+32*23], m0 + mova [cq+32*24], m1 + mova m0, [cq+32* 0] + mova m1, [cq+32* 1] + mova m2, [cq+32* 2] + mova m11, m14 + mova m12, [cq+32*12] + mova m13, [cq+32*13] + mova m14, [cq+32*14] + ret + +cglobal inv_txfm_add_identity_identity_8x32_12bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_8x32_10bpc).pass1 + +cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] + mov [cq], eobd ; 0 + or r3d, 8 +.dconly: + add r6d, 640 + sar r6d, 10 +.dconly2: + imul r6d, 181 + add r6d, 2176 + sar r6d, 12 + movd xm0, r6d + paddsw xm0, xm3 + vpbroadcastw m0, xm0 +.dconly_loop: + paddsw m1, m0, [dstq+32*0] + paddsw m2, m0, [dstq+32*1] + psubusw m1, m3 + psubusw m2, m3 + mova [dstq+32*0], m1 + mova [dstq+32*1], m2 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + call .pass1 + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end + lea r6, [deint_shuf+128] + vpbroadcastd m11, [pw_2048] + mov r4, dstq + call .pass2 + mova m0, [r5+32*3] ; 16 17 + mova m1, [r5+32*2] ; 30 31 + mova m2, [r5+32*1] ; 18 19 + mova m3, [r5+32*0] ; 28 29 + mova m4, [r5-32*1] ; 20 21 + mova m5, [r5-32*2] ; 26 27 + mova m6, [r5-32*3] ; 22 23 + mova m7, [r5-32*4] ; 24 25 + call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose + lea dstq, 
[r4+32] + call .pass2 + RET +ALIGN function_align +.pass2: + call m(idct_16x8_internal_8bpc).main + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + call m(idct_16x8_internal_10bpc).write_16x4_start + pmulhrsw m0, m11, m4 + pmulhrsw m1, m11, m5 + pmulhrsw m2, m11, m6 + pmulhrsw m3, m11, m7 + jmp m(idct_16x8_internal_10bpc).write_16x4_zero +ALIGN function_align +.pass1: + mova m0, [cq+32* 1] + mova m1, [cq+32* 7] + mova m2, [cq+32* 9] + mova m3, [cq+32*15] + mova m4, [cq+32*17] + mova m5, [cq+32*23] + mova m6, [cq+32*25] + mova m7, [cq+32*31] + vpbroadcastd m11, [pd_2048] + vpbroadcastd m14, [pd_2896] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1 + mova m0, [cq+32* 3] + mova m1, [cq+32* 5] + mova m2, [cq+32*11] + mova m3, [cq+32*13] + mova m4, [cq+32*19] + mova m5, [cq+32*21] + mova m6, [cq+32*27] + mova m7, [cq+32*29] + call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2 + mova m0, [cq+32* 2] + mova m1, [cq+32* 6] + mova m2, [cq+32*10] + mova m3, [cq+32*14] + mova m4, [cq+32*18] + mova m5, [cq+32*22] + mova m6, [cq+32*26] + mova m7, [cq+32*30] + call m(idct_8x16_internal_10bpc).main_oddhalf + mova m0, [cq+32* 0] + mova m1, [cq+32* 4] + mova m2, [cq+32* 8] + mova m3, [cq+32*12] + mova m4, [cq+32*16] + mova m5, [cq+32*20] + mova m6, [cq+32*24] + mova m7, [cq+32*28] + call m(idct_8x8_internal_10bpc).main + call m(idct_8x16_internal_10bpc).main_evenhalf + ret + +cglobal inv_txfm_add_identity_identity_32x8_10bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_10bpc_max] +.pass1: + vpbroadcastd m5, [pw_4096] + pxor m6, m6 + mov r6d, eobd + add eobb, 21 + cmovc eobd, r6d + lea r6, [strideq*3] + lea r5, [strideq*5] + lea r4, [strideq+r6*2] ; strideq*7 +.loop: + mova m0, [cq+32*0] + packssdw m0, [cq+32*1] + mova m1, [cq+32*2] + packssdw m1, [cq+32*3] + REPX {mova [cq+32*x], m6}, 0, 1, 2, 3 + add cq, 32*8 + mova m2, [cq-32*4] + packssdw m2, [cq-32*3] + mova m3, [cq-32*2] + packssdw m3, [cq-32*1] + REPX {pmulhrsw x, m5}, m0, m1, m2, m3 + REPX {mova [cq+32*x], m6}, -4, -3, -2, -1 + call m(inv_txfm_add_identity_identity_8x32_10bpc).main + add dstq, 16 + sub eobd, 64 + jge .loop + RET + +cglobal inv_txfm_add_dct_dct_32x8_12bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jnz .full + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_12bpc] + mov [cq], eobd ; 0 + or r3d, 8 + jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly +.full: + PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob + lea r6, [rsp+32*4] + vpbroadcastd m12, [clip_20b_min] + vpbroadcastd m13, [clip_20b_max] + call m(inv_txfm_add_dct_dct_32x8_10bpc).pass1 + call m(inv_txfm_add_dct_dct_8x32_12bpc).main_end + mov r4, dstq + call m(idct_16x8_internal_12bpc).pass2_main + mova m0, [cq+32* 0] ; 16 + mova m1, [cq+32* 1] ; 17 + mova m2, [cq+32* 2] ; 18 + mova m3, [cq+32* 3] ; 19 + mova m4, [cq+32* 4] ; 20 + mova m5, [cq+32* 5] ; 21 + mova m6, [cq+32* 6] ; 22 + mova m7, [cq+32* 7] ; 23 + mova m8, [cq+32* 8] ; 24 + mova m9, [cq+32* 9] ; 25 + mova m10, [cq+32*10] ; 26 + mova m11, [cq+32*11] ; 27 + mova m12, [cq+32*12] ; 28 + mova m13, [cq+32*13] ; 29 + mova m14, [cq+32*14] ; 30 + mova m15, [cq+32*15] ; 31 + lea dstq, [r4+32] + call m(idct_16x8_internal_12bpc).pass2_main + RET + +cglobal inv_txfm_add_identity_identity_32x8_12bpc, 4, 7, 8, dst, stride, c, eob + vpbroadcastd m7, [pixel_12bpc_max] + jmp m(inv_txfm_add_identity_identity_32x8_10bpc).pass1 + +%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2] + mova m%4, [%2] + paddsw m%3, m%1, m%4 + psubsw m%1, m%4 +%if %1 == 0 + pxor m6, m6 +%endif + pmulhrsw m%3, m15 + pmulhrsw 
m%1, m15 + paddw m%3, [dstq+%5] + paddw m%1, [r2+%6] + pmaxsw m%3, m6 + pmaxsw m%1, m6 + pminsw m%3, m7 + pminsw m%1, m7 + mova [dstq+%5], m%3 + mova [r2+%6], m%1 +%endmacro + +cglobal inv_txfm_add_dct_dct_16x32_10bpc, 4, 7, 0, dst, stride, c, eob + test eobd, eobd + jz .dconly + PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob +%undef cmp + vpbroadcastd m11, [pd_2048] + vpbroadcastd m12, [clip_18b_min] + vpbroadcastd m13, [clip_18b_max] + vpbroadcastd m14, [pd_2896] + lea r6, [rsp+32*16] + lea r4, [r6+32*8] + lea r5, [r6+32*16] + call .main + sub eobd, 44 + jge .eob44 + vperm2i128 m2, m0, m3, 0x31 ; 5 + vinserti128 m0, xm3, 1 ; 1 + vperm2i128 m3, m1, m4, 0x31 ; 7 + vinserti128 m1, xm4, 1 ; 3 + pxor m4, m4 + REPX {mova x, m4}, m5, m6, m7 + REPX {mova [r6+32*x], m4}, 0, 1, 2, 3 + jmp .fast +.dconly: + imul r6d, [cq], 181 + vpbroadcastd m3, [dconly_10bpc] + mov [cq], eobd ; 0 + or r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly2 +.eob44: + mova [r4+16*0], xm0 + mova [r4+16*1], xm3 + mova [r4+16*2], xm1 + mova [r4+16*3], xm4 + vextracti128 [r4+16*4], m0, 1 + vextracti128 [r4+16*5], m3, 1 + vextracti128 [r4+16*6], m1, 1 + vextracti128 [r4+16*7], m4, 1 + call .main + sub eobd, 107 + jge .eob151 + vperm2i128 m7, m1, m4, 0x31 ; 15 + vinserti128 m5, m1, xm4, 1 ; 11 + vperm2i128 m6, m0, m3, 0x31 ; 13 + vinserti128 m4, m0, xm3, 1 ; 9 + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] +.fast: + lea r6, [pw_5+128] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + pxor m8, m8 + REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 + jmp .idct16 +.eob151: + mova [r4-16*8], xm0 + mova [r4-16*7], xm3 + mova [r4-16*6], xm1 + mova [r4-16*5], xm4 + vextracti128 [r4-16*4], m0, 1 + vextracti128 [r4-16*3], m3, 1 + vextracti128 [r4-16*2], m1, 1 + vextracti128 [r4-16*1], m4, 1 + call .main + sub eobd, 128 + jge .eob279 + vperm2i128 m10, m0, m3, 0x31 ; 21 + vinserti128 m8, m0, xm3, 1 ; 17 + vperm2i128 m11, m1, m4, 0x31 ; 23 + vinserti128 m9, m1, xm4, 1 ; 19 + pxor m12, m12 + REPX {mova x, m12}, m13, m14, m15 + REPX {mova [r6+32*x], m12}, 0, 1, 2, 3 + jmp .full +.eob279: + mova [r5+16*0], xm0 + mova [r5+16*1], xm3 + mova [r5+16*2], xm1 + mova [r5+16*3], xm4 + vextracti128 [r5+16*4], m0, 1 + vextracti128 [r5+16*5], m3, 1 + vextracti128 [r5+16*6], m1, 1 + vextracti128 [r5+16*7], m4, 1 + call .main + vperm2i128 m14, m0, m3, 0x31 ; 29 + vinserti128 m12, m0, xm3, 1 ; 25 + vperm2i128 m15, m1, m4, 0x31 ; 31 + vinserti128 m13, m1, xm4, 1 ; 27 + mova m8, [r5+32*0] + mova m9, [r5+32*1] + mova m10, [r5+32*2] + mova m11, [r5+32*3] +.full: + mova m0, [r4+32*0] + mova m1, [r4+32*1] + mova m2, [r4+32*2] + mova m3, [r4+32*3] + mova m4, [r4-32*4] + mova m5, [r4-32*3] + mova m6, [r4-32*2] + mova m7, [r4-32*1] + lea r6, [pw_5 + 128] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + lea r3, [rsp+32*8] + mova m8, [r3+32*0] + mova m9, [r3+32*1] + mova m10, [r3+32*2] + mova m11, [r3+32*3] + mova m12, [r3-32*4] + mova m13, [r3-32*3] + mova m14, [r3-32*2] + mova m15, [r3-32*1] +.idct16: + lea r3, [rsp+32*16] + mova m0, [r3+32*0] + mova m1, [r3+32*1] + mova m2, [r3+32*2] + mova m3, [r3+32*3] + mova m4, [r3-32*4] + mova m5, [r3-32*3] + mova m6, [r3-32*2] + mova m7, [r3-32*1] + mova [rsp], m15 + call m(idct_16x16_internal_8bpc).main + imul r2, strideq, 19 + lea r3, [strideq*3] + add r2, dstq + call .pass2_end + RET +ALIGN function_align +.main: + pmulld m0, m14, [cq+128* 1] + pmulld m1, m14, [cq+128* 3] + pmulld m2, m14, [cq+128* 5] + pmulld m3, 
+ALIGN function_align
+.main:
+    pmulld m0, m14, [cq+128* 1]
+    pmulld m1, m14, [cq+128* 3]
+    pmulld m2, m14, [cq+128* 5]
+    pmulld m3, m14, [cq+128* 7]
+    pmulld m4, m14, [cq+128* 9]
+    pmulld m5, m14, [cq+128*11]
+    pmulld m6, m14, [cq+128*13]
+    pmulld m7, m14, [cq+128*15]
+    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+    pmulld m0, m14, [cq+128* 0]
+    pmulld m1, m14, [cq+128* 2]
+    pmulld m2, m14, [cq+128* 4]
+    pmulld m3, m14, [cq+128* 6]
+    pmulld m4, m14, [cq+128* 8]
+    pmulld m5, m14, [cq+128*10]
+    pmulld m6, m14, [cq+128*12]
+    pmulld m7, m14, [cq+128*14]
+    call m(idct_8x8_internal_10bpc).main_rect2
+    call m(idct_8x16_internal_10bpc).main_evenhalf
+    psrld m15, m11, 11 ; pd_1
+    mova m8, [r6-32*4]
+    mova m9, [r6-32*3]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+    psubd m10, m0, m8 ; out15
+    paddd m0, m8 ; out0
+    mova m8, [r6-32*2]
+    paddd m15, m1, m9 ; out1
+    psubd m1, m9 ; out14
+    mova m9, [r6-32*1]
+    REPX {psrad x, 1}, m0, m15, m10, m1
+    packssdw m0, m15
+    packssdw m1, m10
+    psubd m10, m2, m8 ; out13
+    paddd m2, m8 ; out2
+    mova m8, [r6+32*0]
+    paddd m15, m3, m9 ; out3
+    psubd m3, m9 ; out12
+    mova m9, [r6+32*1]
+    REPX {psrad x, 1}, m2, m15, m10, m3
+    packssdw m2, m15
+    packssdw m3, m10
+    psubd m10, m4, m8 ; out11
+    paddd m4, m8 ; out4
+    mova m8, [r6+32*2]
+    paddd m15, m5, m9 ; out5
+    psubd m5, m9 ; out10
+    mova m9, [r6+32*3]
+    REPX {psrad x, 1}, m4, m10, m15, m5
+    packssdw m4, m15
+    packssdw m5, m10
+    psubd m10, m6, m8 ; out9
+    paddd m6, m8 ; out6
+    paddd m15, m7, m9 ; out7
+    psubd m7, m9 ; out8
+    REPX {psrad x, 1}, m6, m10, m15, m7
+    packssdw m6, m15
+    packssdw m7, m10
+    punpckhwd m8, m0, m2
+    punpcklwd m0, m2
+    punpckhwd m2, m3, m1
+    punpcklwd m3, m1
+    punpckhwd m1, m4, m6
+    punpcklwd m4, m6
+    punpcklwd m6, m7, m5
+    punpckhwd m7, m5
+    pxor m5, m5
+    mov r7d, 128*13
+.main_zero_loop:
+    mova [cq+r7-128*1], m5
+    mova [cq+r7+128*0], m5
+    mova [cq+r7+128*1], m5
+    mova [cq+r7+128*2], m5
+    sub r7d, 128*4
+    jg .main_zero_loop
+    add cq, 32
+    punpcklwd m5, m3, m2
+    punpckhwd m3, m2
+    punpcklwd m2, m4, m1
+    punpckhwd m4, m1
+    punpckhwd m1, m0, m8
+    punpcklwd m0, m8
+    punpckhwd m8, m6, m7
+    punpcklwd m6, m7
+    punpcklqdq m7, m1, m4
+    punpckhqdq m1, m4
+    punpckhqdq m4, m8, m3
+    punpcklqdq m8, m3
+    punpckhqdq m3, m6, m5
+    punpcklqdq m6, m5
+    punpcklqdq m5, m0, m2
+    punpckhqdq m0, m2
+    mova [r6+16*0], xm5
+    mova [r6+16*1], xm6
+    mova [r6+16*2], xm7
+    mova [r6+16*3], xm8
+    vextracti128 [r6+16*4], m5, 1
+    vextracti128 [r6+16*5], m6, 1
+    vextracti128 [r6+16*6], m7, 1
+    vextracti128 [r6+16*7], m8, 1
+    sub r6, 32*4
+    ret
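+; .pass2_end adds one half of the finished 32-row column transform to the
+; destination: each IDCT32_PASS2_END writes a row near the top through dstq
+; and its mirror near the bottom through r2 (dstq + 19*stride), applying
+; pw_2048 rounding and clamping to the valid pixel range.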
+ALIGN function_align
+.pass2_end:
+    mova [rsp+gprsize+32*0], m6
+    mova [rsp+gprsize+32*2], m7
+    mova [rsp+gprsize+32*3], m15
+    vpbroadcastd m15, [pw_2048]
+    vpbroadcastd m7, [pixel_10bpc_max]
+    IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4
+    IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8
+    IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4
+    IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0
+    add dstq, strideq
+    sub r2, strideq
+    mova m1, [rsp+gprsize+32*1]
+    IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4
+    IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8
+    IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4
+    IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0
+    add dstq, strideq
+    sub r2, strideq
+    mova m1, [rsp+gprsize+32*0]
+    IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4
+    IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8
+    IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4
+    IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0
+    add dstq, strideq
+    sub r2, strideq
+    mova m1, [rsp+gprsize+32*2]
+    mova m2, [rsp+gprsize+32*3]
+    IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4
+    IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8
+    IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4
+    IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0
+    ret
+
+cglobal inv_txfm_add_identity_identity_16x32_10bpc, 4, 7, 12, dst, stride, c, eob
+    vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+    vpbroadcastd m8, [pw_2896x8]
+    vpbroadcastd m9, [pw_1697x16]
+    vpbroadcastd m11, [pw_8192]
+    lea r6, [strideq*5]
+    pxor m6, m6
+    paddw m10, m11, m11 ; pw_16384
+    mov r5, dstq
+    call .main
+    sub eobd, 36
+    jl .ret
+    add cq, 128*8
+    lea dstq, [r5+16]
+    call .main
+    sub cq, 128*8-32
+    lea dstq, [r5+strideq*8]
+    mov r5, dstq
+    call .main
+    sub eobd, 107 ; eob < 143
+    jl .ret
+    add cq, 128*8
+    lea dstq, [r5+16]
+    call .main
+    sub cq, 128*8-32
+    lea dstq, [r5+strideq*8]
+    mov r5, dstq
+    call .main
+    sub eobd, 128 ; eob < 271
+    jl .ret
+    add cq, 128*8
+    lea dstq, [r5+16]
+    call .main
+    sub cq, 128*8-32
+    lea dstq, [r5+strideq*8]
+    mov r5, dstq
+    call .main
+    sub eobd, 128 ; eob < 399
+    jl .ret
+    add cq, 128*8
+    lea dstq, [r5+16]
+    call .main
+.ret:
+    RET
+ALIGN function_align
+.main:
+    mova m0, [cq+128*0]
+    packssdw m0, [cq+128*1]
+    mova m1, [cq+128*2]
+    packssdw m1, [cq+128*3]
+    mova m2, [cq+128*4]
+    packssdw m2, [cq+128*5]
+    mova m3, [cq+128*6]
+    packssdw m3, [cq+128*7]
+    REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+    REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3
+    REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+    REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+.main2:
+    punpckhwd m4, m0, m1
+    punpcklwd m0, m1
+    punpckhwd m1, m2, m3
+    punpcklwd m2, m3
+    punpckhwd m3, m0, m4
+    punpcklwd m0, m4
+    punpcklwd m4, m2, m1
+    punpckhwd m2, m1
+    punpckhqdq m1, m0, m4
+    punpcklqdq m0, m4
+    call m(iidentity_8x8_internal_10bpc).write_2x8x2
+    punpcklqdq m0, m3, m2
+    punpckhqdq m1, m3, m2
+    jmp m(iidentity_8x8_internal_10bpc).write_2x8x2
+
+cglobal inv_txfm_add_identity_identity_16x32_12bpc, 4, 7, 12, dst, stride, c, eob
+    vpbroadcastd m7, [pixel_12bpc_max]
+    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).pass1
+
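+; 32x16: pass 1 is the 32-point IDCT, run over 8 lanes per .main call (twice
+; when eob >= 36), then .transpose_16x16 rearranges 16x16 quadrants so pass 2
+; can reuse the 8bpc 16x16 kernel and .write_16x16.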
+cglobal inv_txfm_add_dct_dct_32x16_10bpc, 4, 7, 0, dst, stride, c, eob
+    test eobd, eobd
+    jz .dconly
+    PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob
+%undef cmp
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    lea r6, [rsp+32*4]
+    call .main
+    cmp eobd, 36
+    jge .full
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+    pxor m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
+    lea r6, [pw_5+128]
+    mov r7, dstq
+    call m(idct_16x16_internal_8bpc).main
+    call .write_16x16
+    mova m0, [r5+32*3]
+    mova m1, [r5+32*2]
+    mova m2, [r5+32*1]
+    mova m3, [r5+32*0]
+    mova m4, [r5-32*1]
+    mova m5, [r5-32*2]
+    mova m6, [r5-32*3]
+    mova m7, [r5-32*4]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+    pxor m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
+    jmp .end
+.dconly:
+    imul r6d, [cq], 181
+    vpbroadcastd m3, [dconly_10bpc]
+    mov [cq], eobd ; 0
+    or r3d, 16
+    add r6d, 128
+    sar r6d, 8
+    imul r6d, 181
+    add r6d, 384
+    sar r6d, 9
+    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
+.full:
+    add cq, 32
+    mova [r4+32*3], m0
+    mova [r4+32*2], m1
+    mova [r4+32*1], m2
+    mova [r4+32*0], m3
+    mova [r4-32*1], m4
+    mova [r4-32*2], m5
+    mova [r4-32*3], m6
+    mova [r4-32*4], m7
+    call .main
+    sub r4, 32*16 ; topleft 16x8
+    call .transpose_16x16
+    lea r6, [pw_5+128]
+    mov r7, dstq
+    call m(idct_16x16_internal_8bpc).main
+    call .write_16x16
+    mova m0, [r5+32*3]
+    mova m1, [r5+32*2]
+    mova m2, [r5+32*1]
+    mova m3, [r5+32*0]
+    mova m4, [r5-32*1]
+    mova m5, [r5-32*2]
+    mova m6, [r5-32*3]
+    mova m7, [r5-32*4]
+    add r4, 32*8 ; bottomleft 16x8
+    call .transpose_16x16
+.end:
+    lea dstq, [r7+32]
+    call m(idct_16x16_internal_8bpc).main
+    call .write_16x16
+    RET
+ALIGN function_align
+.transpose_16x16:
+    punpckhdq m8, m3, m1
+    punpckldq m3, m1
+    punpckhdq m1, m0, m2
+    punpckldq m0, m2
+    punpckhdq m2, m7, m5
+    punpckldq m7, m5
+    punpckhdq m5, m4, m6
+    punpckldq m4, m6
+    punpckhqdq m6, m0, m4
+    punpcklqdq m0, m4
+    punpckhqdq m4, m1, m5
+    punpcklqdq m1, m5
+    punpckhqdq m5, m7, m3
+    punpcklqdq m7, m3
+    punpckhqdq m3, m2, m8
+    punpcklqdq m2, m8
+    vinserti128 m8, m0, xm7, 1
+    vperm2i128 m12, m0, m7, 0x31
+    vinserti128 m9, m6, xm5, 1
+    vperm2i128 m13, m6, m5, 0x31
+    vinserti128 m10, m1, xm2, 1
+    vperm2i128 m14, m1, m2, 0x31
+    vinserti128 m11, m4, xm3, 1
+    vperm2i128 m15, m4, m3, 0x31
+    mova m0, [r4+32*3]
+    mova m1, [r4+32*2]
+    mova m2, [r4+32*1]
+    mova m3, [r4+32*0]
+    mova m4, [r4-32*1]
+    mova m5, [r4-32*2]
+    mova m6, [r4-32*3]
+    mova m7, [r4-32*4]
+    mova [rsp+gprsize], m15
+    jmp m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+ALIGN function_align
+.main:
+    vpbroadcastd m14, [pd_2896]
+    vpbroadcastd m11, [pd_2048]
+    pmulld m0, m14, [cq+64* 1]
+    pmulld m1, m14, [cq+64* 7]
+    pmulld m2, m14, [cq+64* 9]
+    pmulld m3, m14, [cq+64*15]
+    pmulld m4, m14, [cq+64*17]
+    pmulld m5, m14, [cq+64*23]
+    pmulld m6, m14, [cq+64*25]
+    pmulld m7, m14, [cq+64*31]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
+    pmulld m0, m14, [cq+64* 3]
+    pmulld m1, m14, [cq+64* 5]
+    pmulld m2, m14, [cq+64*11]
+    pmulld m3, m14, [cq+64*13]
+    pmulld m4, m14, [cq+64*19]
+    pmulld m5, m14, [cq+64*21]
+    pmulld m6, m14, [cq+64*27]
+    pmulld m7, m14, [cq+64*29]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
+    pmulld m0, m14, [cq+64* 2]
+    pmulld m1, m14, [cq+64* 6]
+    pmulld m2, m14, [cq+64*10]
+    pmulld m3, m14, [cq+64*14]
+    pmulld m4, m14, [cq+64*18]
+    pmulld m5, m14, [cq+64*22]
+    pmulld m6, m14, [cq+64*26]
+    pmulld m7, m14, [cq+64*30]
+    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+    pmulld m0, m14, [cq+64* 0]
+    pmulld m1, m14, [cq+64* 4]
+    pmulld m2, m14, [cq+64* 8]
+    pmulld m3, m14, [cq+64*12]
+    pmulld m4, m14, [cq+64*16]
+    pmulld m5, m14, [cq+64*20]
+    pmulld m6, m14, [cq+64*24]
+    pmulld m7, m14, [cq+64*28]
+    call m(idct_8x8_internal_10bpc).main_rect2
+    call m(idct_8x16_internal_10bpc).main_evenhalf
+    pxor m8, m8
+    mov r7d, 64*30
+.main_zero_loop:
+    mova [cq+r7-64*2], m8
+    mova [cq+r7-64*1], m8
+    mova [cq+r7+64*0], m8
+    mova [cq+r7+64*1], m8
+    sub r7d, 64*4
+    jg .main_zero_loop
+.main_end:
+    psrld m11, 11 ; pd_1
+    IDCT32_END 0, 15, 8, 9, 10, 1
+    IDCT32_END 1, 14, 8, 9, 10, 1
+    punpckhwd m8, m0, m1 ; 16 17
+    punpcklwd m0, m1 ; 0 1
+    punpcklwd m1, m14, m15 ; 14 15
+    punpckhwd m14, m15 ; 30 31
+    mova [r5+32*3], m8
+    mova [r5+32*2], m14
+    IDCT32_END 2, 15, 8, 9, 10, 1
+    IDCT32_END 3, 14, 8, 9, 10, 1
+    punpckhwd m8, m2, m3 ; 18 19
+    punpcklwd m2, m3 ; 2 3
+    punpcklwd m3, m14, m15 ; 12 13
+    punpckhwd m14, m15 ; 28 29
+    mova [r5+32*1], m8
+    mova [r5+32*0], m14
+    IDCT32_END 4, 15, 8, 9, 10, 1
+    IDCT32_END 5, 14, 8, 9, 10, 1
+    punpckhwd m8, m4, m5 ; 20 21
+    punpcklwd m4, m5 ; 4 5
+    punpcklwd m5, m14, m15 ; 10 11
+    punpckhwd m14, m15 ; 26 27
+    mova [r5-32*1], m8
+    mova [r5-32*2], m14
+    IDCT32_END 6, 15, 8, 9, 10, 1
+    IDCT32_END 7, 14, 8, 9, 10, 1
+    punpckhwd m8, m6, m7 ; 22 23
+    punpcklwd m6, m7 ; 6 7
+    punpcklwd m7, m14, m15 ; 8 9
+    punpckhwd m14, m15 ; 24 25
+    mova [r5-32*3], m8
+    mova [r5-32*4], m14
+    ret
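+; .write_16x16 rounds one 16x16 block of pass-2 output with pw_2048
+; (pmulhrsw, i.e. (x+8)>>4) and hands it to write_16x4 four rows at a time,
+; which adds to dst and clamps against the pixel maximum.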
+ALIGN function_align
+.write_16x16:
+    mova m1, [rsp+gprsize+32*1]
+    mova [rsp+gprsize+32*0], m8
+    mova [rsp+gprsize+32*1], m9
+    mova [rsp+gprsize+32*2], m12
+    vpbroadcastd m12, [pw_2048]
+    vpbroadcastd m9, [pixel_10bpc_max]
+    lea r3, [strideq*3]
+    pxor m8, m8
+    pmulhrsw m0, m12
+    pmulhrsw m1, m12
+    pmulhrsw m2, m12
+    pmulhrsw m3, m12
+    call m(idct_16x8_internal_10bpc).write_16x4
+    pmulhrsw m0, m12, m4
+    pmulhrsw m1, m12, m5
+    pmulhrsw m2, m12, m6
+    pmulhrsw m3, m12, m7
+    call m(idct_16x8_internal_10bpc).write_16x4
+    pmulhrsw m0, m12, [rsp+gprsize+32*0]
+    pmulhrsw m1, m12, [rsp+gprsize+32*1]
+    pmulhrsw m2, m12, m10
+    pmulhrsw m3, m12, m11
+    call m(idct_16x8_internal_10bpc).write_16x4
+    pmulhrsw m0, m12, [rsp+gprsize+32*2]
+    pmulhrsw m1, m12, m13
+    pmulhrsw m2, m12, m14
+    pmulhrsw m3, m12, m15
+    jmp m(idct_16x8_internal_10bpc).write_16x4
+
+cglobal inv_txfm_add_identity_identity_32x16_10bpc, 4, 7, 11, dst, stride, c, eob
+    vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+    vpbroadcastd m8, [pw_2896x8]
+    vpbroadcastd m9, [pw_1697x16]
+    vpbroadcastd m10, [pw_4096]
+    lea r6, [strideq*5]
+    pxor m6, m6
+    mov r5, dstq
+    call .main
+    sub eobd, 36
+    jl .ret
+    add cq, 32
+    lea dstq, [dstq+strideq*4]
+    call .main
+    add cq, 64*8-32
+    lea dstq, [r5+16*1]
+    call .main
+    sub eobd, 107 ; eob < 143
+    jl .ret
+    add cq, 32
+    lea dstq, [dstq+strideq*4]
+    call .main
+    add cq, 64*8-32
+    lea dstq, [r5+16*2]
+    call .main
+    sub eobd, 128 ; eob < 271
+    jl .ret
+    add cq, 32
+    lea dstq, [dstq+strideq*4]
+    call .main
+    add cq, 64*8-32
+    lea dstq, [r5+16*3]
+    call .main
+    sub eobd, 128 ; eob < 399
+    jl .ret
+    add cq, 32
+    lea dstq, [dstq+strideq*4]
+    call .main
+.ret:
+    RET
+ALIGN function_align
+.main:
+    mova m0, [cq+64*0]
+    packssdw m0, [cq+64*1]
+    mova m1, [cq+64*2]
+    packssdw m1, [cq+64*3]
+    mova m2, [cq+64*4]
+    packssdw m2, [cq+64*5]
+    mova m3, [cq+64*6]
+    packssdw m3, [cq+64*7]
+    REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
+    REPX {paddsw x, x }, m0, m1, m2, m3
+    REPX {IDTX16 x, 4, 9, _ }, 0, 1, 2, 3
+    REPX {pmulhrsw x, m10}, m0, m1, m2, m3
+    REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+    jmp m(inv_txfm_add_identity_identity_16x32_10bpc).main2
+
+cglobal inv_txfm_add_identity_identity_32x16_12bpc, 4, 7, 11, dst, stride, c, eob
+    vpbroadcastd m7, [pixel_12bpc_max]
+    jmp m(inv_txfm_add_identity_identity_32x16_10bpc).pass1
+
+cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst, stride, c, eob
+    test eobd, eobd
+    jz .dconly
+    PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob
+%undef cmp
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    lea r6, [rsp+32*7]
+    call .main
+    cmp eobd, 36
+    jl .fast
+    call .main
+    cmp eobd, 136
+    jl .fast
+    call .main
+    cmp eobd, 300
+    jl .fast
+    call .main
+    jmp .pass2
+.dconly:
+    imul r6d, [cq], 181
+    vpbroadcastd m3, [dconly_10bpc]
+    mov [cq], eobd ; 0
+    or r3d, 32
+    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly
+.fast:
+    lea r4, [rsp+32*71]
+    pxor m0, m0
+.fast_loop:
+    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+    add r6, 32*8
+    cmp r6, r4
+    jl .fast_loop
+.pass2:
+    lea r3, [rsp+32*3]
+    mov r4, r6
+    lea r5, [r6+32*8]
+    lea r6, [pw_5+128]
+    call .pass2_oddhalf
+    call .pass2_evenhalf
+    imul r2, strideq, 19
+    lea r3, [strideq*3]
+    add r2, dstq
+    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+    sub dstq, r3
+    lea r2, [r2+r3+32]
+    add dstq, 32
+    lea r3, [rsp+32*11]
+    call .pass2_oddhalf
+    call .pass2_evenhalf
+    lea r3, [strideq*3]
+    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+    RET
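+; One .main call handles an 8-column slice of the 32-point first pass: both
+; odd-half quarters, the even half, the shared 8x32 main_end rounding/pack
+; step, and an 8x32 transpose, parking results at r4/r5 for pass 2.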
+ALIGN function_align
+.main:
+    mova m0, [cq+128* 1]
+    mova m1, [cq+128* 7]
+    mova m2, [cq+128* 9]
+    mova m3, [cq+128*15]
+    mova m4, [cq+128*17]
+    mova m5, [cq+128*23]
+    mova m6, [cq+128*25]
+    mova m7, [cq+128*31]
+    vpbroadcastd m11, [pd_2048]
+    vpbroadcastd m14, [pd_2896]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1
+    mova m0, [cq+128* 3]
+    mova m1, [cq+128* 5]
+    mova m2, [cq+128*11]
+    mova m3, [cq+128*13]
+    mova m4, [cq+128*19]
+    mova m5, [cq+128*21]
+    mova m6, [cq+128*27]
+    mova m7, [cq+128*29]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2
+    mova m0, [cq+128* 2]
+    mova m1, [cq+128* 6]
+    mova m2, [cq+128*10]
+    mova m3, [cq+128*14]
+    mova m4, [cq+128*18]
+    mova m5, [cq+128*22]
+    mova m6, [cq+128*26]
+    mova m7, [cq+128*30]
+    call m(idct_8x16_internal_10bpc).main_oddhalf
+    mova m0, [cq+128* 0]
+    mova m1, [cq+128* 4]
+    mova m2, [cq+128* 8]
+    mova m3, [cq+128*12]
+    mova m4, [cq+128*16]
+    mova m5, [cq+128*20]
+    mova m6, [cq+128*24]
+    mova m7, [cq+128*28]
+    call m(idct_8x8_internal_10bpc).main
+    call m(idct_8x16_internal_10bpc).main_evenhalf
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_end
+    pxor m15, m15
+    mov r7d, 128*29
+.main_zero_loop:
+    mova [cq+r7-128*1], m15
+    mova [cq+r7+128*0], m15
+    mova [cq+r7+128*1], m15
+    mova [cq+r7+128*2], m15
+    sub r7d, 128*4
+    jg .main_zero_loop
+    add cq, 32
+    mova [r4-32*4], m0
+    mova [r4-32*3], m1
+    mova [r4-32*2], m2
+    mova [r4-32*1], m3
+    mova [r4+32*0], m4
+    mova [r4+32*1], m5
+    mova [r4+32*2], m6
+    mova [r4+32*3], m7
+    mova m0, [r5+32*3]
+    mova m1, [r5+32*2]
+    mova m2, [r5+32*1]
+    mova m3, [r5+32*0]
+    mova m4, [r5-32*1]
+    mova m5, [r5-32*2]
+    mova m6, [r5-32*3]
+    mova m7, [r5-32*4]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+    mova [r5-32*4], m0
+    mova [r5-32*3], m1
+    mova [r5-32*2], m2
+    mova [r5-32*1], m3
+    mova [r5+32*0], m4
+    mova [r5+32*1], m5
+    mova [r5+32*2], m6
+    mova [r5+32*3], m7
+    ret
+ALIGN function_align
+.pass2_oddhalf:
+    mova m0, [r3+32* 1] ; 1
+    mova m1, [r3+32* 3] ; 3
+    mova m2, [r3+32* 5] ; 5
+    mova m3, [r3+32* 7] ; 7
+    mova m4, [r3+32*17] ; 9
+    mova m5, [r3+32*19] ; 11
+    mova m6, [r3+32*21] ; 13
+    mova m7, [r3+32*23] ; 15
+    mova m8, [r3+32*33] ; 17
+    mova m9, [r3+32*35] ; 19
+    mova m10, [r3+32*37] ; 21
+    mova m11, [r3+32*39] ; 23
+    mova m12, [r3+32*49] ; 25
+    mova m13, [r3+32*51] ; 27
+    mova m14, [r3+32*53] ; 29
+    mova m15, [r3+32*55] ; 31
+    jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ALIGN function_align
+.pass2_evenhalf:
+    mova m0, [r3+32* 0] ; 0
+    mova m1, [r3+32* 2] ; 2
+    mova m2, [r3+32* 4] ; 4
+    mova m3, [r3+32* 6] ; 6
+    mova m4, [r3+32*16] ; 8
+    mova m5, [r3+32*18] ; 10
+    mova m6, [r3+32*20] ; 12
+    mova m7, [r3+32*22] ; 14
+    mova m8, [r3+32*32] ; 16
+    mova m9, [r3+32*34] ; 18
+    mova m10, [r3+32*36] ; 20
+    mova m11, [r3+32*38] ; 22
+    mova m12, [r3+32*48] ; 24
+    mova m13, [r3+32*50] ; 26
+    mova m14, [r3+32*52] ; 28
+    mova m15, [r3+32*54] ; 30
+    mova [rsp+gprsize], m15
+    jmp m(idct_16x16_internal_8bpc).main
+
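+; Identity 32x32 needs no butterflies, only a pw_8192 rescale: .main packs
+; eight dword rows to words, scales, and stores 8x8 word tiles; the eob
+; thresholds (36/136/300/535/755/911) bound how far along the coefficient
+; scan the nonzero blocks can reach.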
+cglobal inv_txfm_add_identity_identity_32x32_10bpc, 4, 8, 8, dst, stride, c, eob
+%undef cmp
+    vpbroadcastd m7, [pixel_10bpc_max]
+.pass1:
+    vpbroadcastd m5, [pw_8192]
+    pxor m6, m6
+    lea r6, [strideq*3]
+    lea r5, [strideq*5]
+    lea r4, [strideq+r6*2] ; strideq*7
+    call .main ; 0
+    cmp eobd, 36
+    jl .ret
+    add cq, 128*8 ; 0 1
+    mov r7, dstq ; 1
+    add dstq, 16
+    call .main
+    call .main2
+    cmp eobd, 136
+    jl .ret
+    add cq, 128*16-32 ; 0 1 2
+    lea dstq, [r7+16*2] ; 1 2
+    call .main ; 2
+    call .main2
+    call .main2
+    cmp eobd, 300
+    jl .ret
+    add cq, 128*24-64 ; 0 1 2 3
+    add r7, 16*3 ; 1 2 3
+    mov dstq, r7 ; 2 3
+    call .main ; 3
+    call .main2
+    call .main2
+    call .main2
+    cmp eobd, 535
+    jl .ret
+    add cq, 128*24-64 ; 0 1 2 3
+    lea dstq, [r7+strideq*8] ; 1 2 3 4
+    mov r7, dstq ; 2 3 4
+    call .main ; 3 4
+    call .main2
+    call .main2
+    cmp eobd, 755
+    jl .ret
+    add cq, 128*16-32 ; 0 1 2 3
+    lea dstq, [r7+strideq*8] ; 1 2 3 4
+    call .main ; 2 3 4 5
+    call .main2 ; 3 4 5
+    cmp eobd, 911
+    jl .ret
+    add cq, 128*8 ; 0 1 2 3
+    add dstq, 16 ; 1 2 3 4
+    call .main ; 2 3 4 5
+.ret: ; 3 4 5 6
+    RET
+ALIGN function_align
+.main2:
+    sub cq, 128*8-32
+    lea dstq, [dstq+strideq*8-16]
+.main:
+    mova m0, [cq+128*0]
+    packssdw m0, [cq+128*1]
+    mova m1, [cq+128*2]
+    packssdw m1, [cq+128*3]
+    mova m2, [cq+128*4]
+    packssdw m2, [cq+128*5]
+    mova m3, [cq+128*6]
+    packssdw m3, [cq+128*7]
+    REPX {pmulhrsw x, m5}, m0, m1, m2, m3
+    jmp m(inv_txfm_add_identity_identity_8x32_10bpc).main_zero
+
+cglobal inv_txfm_add_identity_identity_32x32_12bpc, 4, 8, 8, dst, stride, c, eob
+    vpbroadcastd m7, [pixel_12bpc_max]
+    jmp m(inv_txfm_add_identity_identity_32x32_10bpc).pass1
+
+%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
+%if %1 & 1
+    mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n
+    mova m%4, [r4-32*(14+%1)] ; idct32 out31-n
+%else
+    mova m%5, [r4-32*(45-%1)]
+    mova m%4, [r5-32*(20+%1)]
+%endif
+    paddsw m%6, m%5, m%4 ; idct32 out 0+n
+    psubsw m%5, m%4 ; idct32 out31-n
+    paddsw m%4, m%5, m%3 ; out31-n
+    psubsw m%5, m%3 ; out32+n
+    paddsw m%3, m%6, m%2 ; out 0+n
+    psubsw m%6, m%2 ; out63-n
+    REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3
+%if %1 & 1
+    %define %%d0 r2
+    %define %%d1 dstq
+%else
+    %define %%d0 dstq
+    %define %%d1 r2
+%endif
+    paddw m%3, [%%d0+%7 ]
+    paddw m%4, [%%d1+%8 ]
+    paddw m%5, [%%d0+%9 ]
+    paddw m%6, [%%d1+%10]
+    pxor m%2, m%2
+    REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6
+    vpbroadcastd m%2, [pixel_10bpc_max]
+    REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6
+    mova [%%d0+%7 ], m%3
+    mova [%%d1+%8 ], m%4
+    mova [%%d0+%9 ], m%5
+    mova [%%d1+%10], m%6
+%endmacro
+
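+; 16x64: pass 1 is a plain 16-point IDCT kept at two fractional bits (pd_2);
+; the 64-point pass 2 is assembled from the 8bpc idct16, the 16x32 odd half,
+; and two idct64 main_part1 calls whose outputs .main_part2_pass2 folds into
+; pixels with IDCT64_PART2_END.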
+cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst, stride, c, eob
+    test eobd, eobd
+    jz .dconly
+    PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob
+%undef cmp
+    vpbroadcastd m11, [pd_2048]
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    vpbroadcastd m14, [pd_2896]
+    lea r6, [rsp+32*6]
+    call .main
+    sub eobd, 44
+    jl .fast
+    call .main
+    sub eobd, 107
+    jl .fast
+    call .main
+    sub eobd, 128
+    jl .fast
+    call .main
+    jmp .pass2
+.dconly:
+    imul r6d, [cq], 181
+    vpbroadcastd m3, [dconly_10bpc]
+    mov [cq], eobd ; 0
+    or r3d, 64
+    add r6d, 640
+    sar r6d, 10
+    jmp m(inv_txfm_add_dct_dct_16x4_10bpc).dconly3
+.fast:
+    lea r4, [rsp+32*38]
+    pxor m0, m0
+.fast_loop:
+    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+    add r6, 32*8
+    cmp r6, r4
+    jl .fast_loop
+.pass2:
+    lea r6, [pw_5+128]
+    mova m0, [rsp+32* 2] ; in0
+    mova m1, [rsp+32* 6] ; in4
+    mova m2, [rsp+32*10] ; in8
+    mova m3, [rsp+32*14] ; in12
+    mova m4, [rsp+32*18] ; in16
+    mova m5, [rsp+32*22] ; in20
+    mova m6, [rsp+32*26] ; in24
+    mova m7, [rsp+32*30] ; in28
+    pxor m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova [rsp], m8
+    call m(idct_16x16_internal_8bpc).main
+    mova m1, [rsp+32*1]
+    lea r4, [rsp+32*38]
+    mova [r4-32*4], m0
+    mova [r4-32*3], m1
+    mova [r4-32*2], m2
+    mova [r4-32*1], m3
+    mova [r4+32*0], m4
+    mova [r4+32*1], m5
+    mova [r4+32*2], m6
+    mova [r4+32*3], m7
+    add r4, 32*8
+    mova [r4-32*4], m8
+    mova [r4-32*3], m9
+    mova [r4-32*2], m10
+    mova [r4-32*1], m11
+    mova [r4+32*0], m12
+    mova [r4+32*1], m13
+    mova [r4+32*2], m14
+    mova [r4+32*3], m15
+    mova m0, [rsp+32* 4] ; in2
+    mova m1, [rsp+32* 8] ; in6
+    mova m2, [rsp+32*12] ; in10
+    mova m3, [rsp+32*16] ; in14
+    mova m4, [rsp+32*20] ; in18
+    mova m5, [rsp+32*24] ; in22
+    mova m6, [rsp+32*28] ; in26
+    mova m7, [rsp+32*32] ; in30
+    lea r5, [r4+32*16]
+    add r4, 32*8
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    mova m0, [rsp+32* 3] ; in1
+    mova m1, [rsp+32*33] ; in31
+    mova m2, [rsp+32*19] ; in17
+    mova m3, [rsp+32*17] ; in15
+    mova m4, [rsp+32*11] ; in9
+    mova m5, [rsp+32*25] ; in23
+    mova m6, [rsp+32*27] ; in25
+    mova m7, [rsp+32* 9] ; in7
+    lea r6, [idct64_mul - 8]
+    add r4, 32*16
+    add r5, 32*32
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    mova m0, [rsp+32* 7] ; in5
+    mova m1, [rsp+32*29] ; in27
+    mova m2, [rsp+32*23] ; in21
+    mova m3, [rsp+32*13] ; in11
+    mova m4, [rsp+32*15] ; in13
+    mova m5, [rsp+32*21] ; in19
+    mova m6, [rsp+32*31] ; in29
+    mova m7, [rsp+32* 5] ; in3
+    add r6, 8
+    add r4, 32*8
+    sub r5, 32*8
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    lea r8, [strideq*4]
+    lea r9, [strideq*5]
+    lea r3, [r9+strideq*1] ; stride*6
+    lea r7, [r9+strideq*2] ; stride*7
+    call .main_part2_pass2
+    RET
+ALIGN function_align
+.main:
+    mova m0, [cq+128* 1]
+    mova m1, [cq+128* 3]
+    mova m2, [cq+128* 5]
+    mova m3, [cq+128* 7]
+    mova m4, [cq+128* 9]
+    mova m5, [cq+128*11]
+    mova m6, [cq+128*13]
+    mova m7, [cq+128*15]
+    call m(idct_8x16_internal_10bpc).main_oddhalf
+    mova m0, [cq+128* 0]
+    mova m1, [cq+128* 2]
+    mova m2, [cq+128* 4]
+    mova m3, [cq+128* 6]
+    mova m4, [cq+128* 8]
+    mova m5, [cq+128*10]
+    mova m6, [cq+128*12]
+    mova m7, [cq+128*14]
+    call m(idct_8x8_internal_10bpc).main
+    call m(idct_8x16_internal_10bpc).main_evenhalf
+    pxor m15, m15
+    mov r7d, 128*13
+.main_zero_loop:
+    mova [cq+r7-128*1], m15
+    mova [cq+r7+128*0], m15
+    mova [cq+r7+128*1], m15
+    mova [cq+r7+128*2], m15
+    sub r7d, 128*4
+    jg .main_zero_loop
+    add cq, 32
+    psrld m15, m11, 10 ; pd_2
+    mova m8, [r6-32*4]
+    mova m9, [r6+32*3]
+    REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
+    psubd m10, m0, m8 ; out15
+    paddd m0, m8 ; out0
+    mova m8, [r6-32*3]
+    psubd m15, m7, m9 ; out8
+    paddd m7, m9 ; out7
+    mova m9, [r6+32*2]
+    REPX {psrad x, 2}, m0, m15, m10, m7
+    packssdw m0, m15
+    packssdw m7, m10
+    psubd m10, m1, m8 ; out14
+    paddd m1, m8 ; out1
+    mova m8, [r6-32*2]
+    psubd m15, m6, m9 ; out9
+    paddd m6, m9 ; out6
+    mova m9, [r6+32*1]
+    REPX {psrad x, 2}, m1, m15, m10, m6
+    packssdw m1, m15
+    packssdw m6, m10
+    psubd m10, m2, m8 ; out13
+    paddd m2, m8 ; out2
+    mova m8, [r6-32*1]
+    psubd m15, m5, m9 ; out10
+    paddd m5, m9 ; out5
+    mova m9, [r6+32*0]
+    REPX {psrad x, 2}, m2, m15, m10, m5
+    packssdw m2, m15
+    packssdw m5, m10
+    psubd m10, m3, m8 ; out12
+    paddd m3, m8 ; out3
+    psubd m15, m4, m9 ; out11
+    paddd m4, m9 ; out4
+    REPX {psrad x, 2}, m3, m15, m10, m4
+    packssdw m3, m15
+    packssdw m4, m10
+    call m(idct_16x8_internal_10bpc).transpose3
+    mova [r6-32*4], m0
+    mova [r6-32*3], m1
+    mova [r6-32*2], m2
+    mova [r6-32*1], m3
+    mova [r6+32*0], m4
+    mova [r6+32*1], m5
+    mova [r6+32*2], m6
+    mova [r6+32*3], m7
+    add r6, 32*8
+    ret
+.main_part2_pass2:
+    vpbroadcastd m11, [pw_1567_3784]
+    vpbroadcastd m12, [pw_m3784_1567]
+    vpbroadcastd m13, [pw_2896_2896]
+    lea r6, [pw_5+128]
+    lea r2, [dstq+r7]
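+; Each loop iteration below combines one batch of idct16/idct32/idct64
+; partials (main_part2_internal) and stores finished rows in mirrored pairs:
+; rows in the top half through dstq walking down, their bottom-half mirrors
+; through r2 walking up, until r4 meets r5.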
+.main_part2_pass2_loop:
+    vpbroadcastd m14, [pw_m2896_2896]
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal
+    vpbroadcastd m14, [pw_2048]
+    IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8
+    IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8
+    IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
+    IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
+    add dstq, strideq
+    sub r2, strideq
+    cmp r4, r5
+    jne .main_part2_pass2_loop
+    ret
+ALIGN function_align
+.main_part1_rect2:
+    REPX {paddd x, m11}, m0, m1, m2, m3
+    REPX {psrad x, 12 }, m0, m1, m2, m3
+.main_part1: ; idct64 steps 1-5
+    ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+    ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+    ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+    ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+    vpbroadcastd m7, [r5+4*0]
+    vpbroadcastd m8, [r5+4*1]
+    vpbroadcastd m6, [r5+4*2]
+    vpbroadcastd m9, [r5+4*3]
+    vpbroadcastd m5, [r5+4*4]
+    vpbroadcastd m10, [r5+4*5]
+    vpbroadcastd m4, [r5+4*6]
+    vpbroadcastd m15, [r5+4*7]
+    pmulld m7, m0 ; t63a
+    pmulld m0, m8 ; t32a
+    pmulld m6, m1 ; t62a
+    pmulld m1, m9 ; t33a
+    pmulld m5, m2 ; t61a
+    pmulld m2, m10 ; t34a
+    pmulld m4, m3 ; t60a
+    pmulld m3, m15 ; t35a
+    vpbroadcastd m10, [r5+4*8]
+    vpbroadcastd m15, [r5+4*9]
+    REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
+    REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
+    psubd m8, m0, m1 ; t33
+    paddd m0, m1 ; t32
+    psubd m1, m7, m6 ; t62
+    paddd m7, m6 ; t63
+    psubd m6, m3, m2 ; t34
+    paddd m3, m2 ; t35
+    psubd m2, m4, m5 ; t61
+    paddd m4, m5 ; t60
+    REPX {pmaxsd x, m12}, m8, m1, m6, m2
+    REPX {pminsd x, m13}, m8, m1, m6, m2
+    ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
+    ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 2 ; t61a, t34a
+    REPX {pmaxsd x, m12}, m0, m3, m7, m4
+    REPX {pminsd x, m13}, m0, m3, m7, m4
+    vpbroadcastd m10, [r5+4*10]
+    vpbroadcastd m15, [r5+4*11]
+    psubd m5, m0, m3 ; t35a
+    paddd m0, m3 ; t32a
+    psubd m3, m7, m4 ; t60a
+    paddd m7, m4 ; t63a
+    psubd m4, m1, m6 ; t34
+    paddd m1, m6 ; t33
+    psubd m6, m8, m2 ; t61
+    paddd m8, m2 ; t62
+    REPX {pmaxsd x, m12}, m5, m3, m4, m6
+    REPX {pminsd x, m13}, m5, m3, m4, m6
+    ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
+    ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
+    REPX {pmaxsd x, m12}, m0, m7, m1, m8
+    REPX {pminsd x, m13}, m0, m7, m1, m8
+    add r5, 4*12
+    mova [r6-32*4], m0
+    mova [r6+32*3], m7
+    mova [r6-32*3], m1
+    mova [r6+32*2], m8
+    mova [r6-32*2], m6
+    mova [r6+32*1], m4
+    mova [r6-32*1], m3
+    mova [r6+32*0], m5
+    add r6, 32*8
+    ret
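+; Steps 6-9 of the idct64: each iteration below rotates the t39/t56 and
+; t40/t55 pairs by 1567/3784, then resolves the central t40/t47/t48/t55
+; butterflies with the 2896/2^12 (1/sqrt(2)) scale, with the r6 and r5
+; pointers walking toward each other.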
+.main_part2: ; idct64 steps 6-9
+    lea r5, [r6+32*3]
+    sub r6, 32*4
+    vpbroadcastd m10, [pd_1567]
+    vpbroadcastd m15, [pd_3784]
+.main_part2_loop:
+    mova m0, [r6-32*32] ; t32a
+    mova m1, [r5-32*24] ; t39a
+    mova m2, [r5-32*32] ; t63a
+    mova m3, [r6-32*24] ; t56a
+    mova m4, [r6-32*16] ; t40a
+    mova m5, [r5-32* 8] ; t47a
+    mova m6, [r5-32*16] ; t55a
+    mova m7, [r6-32* 8] ; t48a
+    psubd m8, m0, m1 ; t39
+    paddd m0, m1 ; t32
+    psubd m1, m2, m3 ; t56
+    paddd m2, m3 ; t63
+    psubd m3, m5, m4 ; t40
+    paddd m5, m4 ; t47
+    psubd m4, m7, m6 ; t55
+    paddd m7, m6 ; t48
+    REPX {pmaxsd x, m12}, m8, m1, m3, m4
+    REPX {pminsd x, m13}, m8, m1, m3, m4
+    ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
+    ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 2 ; t55a, t40a
+    REPX {pmaxsd x, m12}, m0, m2, m5, m7
+    REPX {pminsd x, m13}, m0, m5, m2, m7
+    psubd m6, m2, m7 ; t48a
+    paddd m2, m7 ; t63a
+    psubd m7, m0, m5 ; t47a
+    paddd m0, m5 ; t32a
+    psubd m5, m8, m4 ; t55
+    paddd m8, m4 ; t56
+    psubd m4, m1, m3 ; t40
+    paddd m1, m3 ; t39
+    REPX {pmaxsd x, m12}, m6, m7, m5, m4
+    REPX {pminsd x, m13}, m6, m7, m5, m4
+    REPX {pmulld x, m14}, m6, m7, m5, m4
+    REPX {pmaxsd x, m12}, m2, m0, m8, m1
+    REPX {pminsd x, m13}, m2, m0, m8, m1
+    paddd m6, m11
+    paddd m5, m11
+    psubd m3, m6, m7 ; t47
+    paddd m6, m7 ; t48
+    psubd m7, m5, m4 ; t40a
+    paddd m5, m4 ; t55a
+    REPX {psrad x, 12}, m3, m6, m7, m5
+    mova [r5-32* 8], m2
+    mova [r6-32*32], m0
+    mova [r6-32* 8], m8
+    mova [r5-32*32], m1
+    mova [r5-32*24], m3
+    mova [r6-32*16], m6
+    mova [r6-32*24], m7
+    mova [r5-32*16], m5
+    add r6, 32
+    sub r5, 32
+    cmp r6, r5
+    jl .main_part2_loop
+    ret
+
+cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst, stride, c, eob
+    test eobd, eobd
+    jz .dconly
+    PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob
+%undef cmp
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    lea r6, [rsp+32*6]
+    call .main
+    cmp eobd, 36
+    jl .fast
+    call .main
+    cmp eobd, 136
+    jl .fast
+    call .main
+    cmp eobd, 300
+    jl .fast
+    call .main
+    jmp .pass2
+.dconly:
+    imul r6d, [cq], 181
+    vpbroadcastd m3, [dconly_10bpc]
+    mov [cq], eobd ; 0
+    or r3d, 64
+    add r6d, 128
+    sar r6d, 8
+    imul r6d, 181
+    add r6d, 384
+    sar r6d, 9
+    jmp m(inv_txfm_add_dct_dct_32x8_10bpc).dconly2
+.fast:
+    lea r4, [rsp+32*70]
+    pxor m0, m0
+.fast_loop:
+    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+    add r6, 32*8
+    cmp r6, r4
+    jl .fast_loop
+.pass2:
+    lea r6, [pw_5+128]
+    mov r10, rsp
+    lea r8, [strideq*4]
+    lea r9, [strideq*5]
+    lea r3, [r9+strideq*1] ; stride*6
+    lea r7, [r9+strideq*2] ; stride*7
+.pass2_loop:
+    mova m0, [r10+32* 2] ; in0
+    mova m1, [r10+32* 6] ; in4
+    mova m2, [r10+32*18] ; in8
+    mova m3, [r10+32*22] ; in12
+    mova m4, [r10+32*34] ; in16
+    mova m5, [r10+32*38] ; in20
+    mova m6, [r10+32*50] ; in24
+    mova m7, [r10+32*54] ; in28
+    pxor m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova [rsp], m8
+    call m(idct_16x16_internal_8bpc).main
+    mova m1, [rsp+32*1]
+    lea r4, [rsp+32*70]
+    mova [r4-32*4], m0
+    mova [r4-32*3], m1
+    mova [r4-32*2], m2
+    mova [r4-32*1], m3
+    mova [r4+32*0], m4
+    mova [r4+32*1], m5
+    mova [r4+32*2], m6
+    mova [r4+32*3], m7
+    add r4, 32*8
+    mova [r4-32*4], m8
+    mova [r4-32*3], m9
+    mova [r4-32*2], m10
+    mova [r4-32*1], m11
+    mova [r4+32*0], m12
+    mova [r4+32*1], m13
+    mova [r4+32*2], m14
+    mova [r4+32*3], m15
+    mova m0, [r10+32* 4] ; in2
+    mova m1, [r10+32* 8] ; in6
+    mova m2, [r10+32*20] ; in10
+    mova m3, [r10+32*24] ; in14
+    mova m4, [r10+32*36] ; in18
+    mova m5, [r10+32*40] ; in22
+    mova m6, [r10+32*52] ; in26
+    mova m7, [r10+32*56] ; in30
+    lea r5, [r4+32*16]
+    add r4, 32*8
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    mova m0, [r10+32* 3] ; in1
+    mova m1, [r10+32*57] ; in31
+    mova m2, [r10+32*35] ; in17
+    mova m3, [r10+32*25] ; in15
+    mova m4, [r10+32*19] ; in9
+    mova m5, [r10+32*41] ; in23
+    mova m6, [r10+32*51] ; in25
+    mova m7, [r10+32* 9] ; in7
+    lea r6, [idct64_mul - 8]
+    add r4, 32*16
+    add r5, 32*32
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    mova m0, [r10+32* 7] ; in5
+    mova m1, [r10+32*53] ; in27
+    mova m2, [r10+32*39] ; in21
+    mova m3, [r10+32*21] ; in11
+    mova m4, [r10+32*23] ; in13
+    mova m5, [r10+32*37] ; in19
+    mova m6, [r10+32*55] ; in29
+    mova m7, [r10+32* 5] ; in3
+    add r6, 8
+    add r4, 32*8
+    sub r5, 32*8
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
+    add r10, 32*8
+    sub r4, 32*98 ; rsp+32*16
+    sub dstq, r8
+    add dstq, 32
+    cmp r10, r4
+    jl .pass2_loop
+    RET
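+; 32x64 first pass: the same 32-point slice as 32x16, but with rect2 input
+; scaling (pd_2896) and the coefficient buffer cleared before the even half;
+; both 8x32 halves end up transposed at r4/r5 for the 64-point second pass.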
+ALIGN function_align
+.main:
+    vpbroadcastd m14, [pd_2896]
+    vpbroadcastd m11, [pd_2048]
+    pmulld m0, m14, [cq+128* 1]
+    pmulld m1, m14, [cq+128* 7]
+    pmulld m2, m14, [cq+128* 9]
+    pmulld m3, m14, [cq+128*15]
+    pmulld m4, m14, [cq+128*17]
+    pmulld m5, m14, [cq+128*23]
+    pmulld m6, m14, [cq+128*25]
+    pmulld m7, m14, [cq+128*31]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_rect2
+    pmulld m0, m14, [cq+128* 3]
+    pmulld m1, m14, [cq+128* 5]
+    pmulld m2, m14, [cq+128*11]
+    pmulld m3, m14, [cq+128*13]
+    pmulld m4, m14, [cq+128*19]
+    pmulld m5, m14, [cq+128*21]
+    pmulld m6, m14, [cq+128*27]
+    pmulld m7, m14, [cq+128*29]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_rect2
+    pmulld m0, m14, [cq+128* 2]
+    pmulld m1, m14, [cq+128* 6]
+    pmulld m2, m14, [cq+128*10]
+    pmulld m3, m14, [cq+128*14]
+    pmulld m4, m14, [cq+128*18]
+    pmulld m5, m14, [cq+128*22]
+    pmulld m6, m14, [cq+128*26]
+    pmulld m7, m14, [cq+128*30]
+    call m(idct_8x16_internal_10bpc).main_oddhalf_rect2
+    pmulld m0, m14, [cq+128* 0]
+    pmulld m1, m14, [cq+128* 4]
+    pmulld m2, m14, [cq+128* 8]
+    pmulld m3, m14, [cq+128*12]
+    pmulld m4, m14, [cq+128*16]
+    pmulld m5, m14, [cq+128*20]
+    pmulld m6, m14, [cq+128*24]
+    pmulld m7, m14, [cq+128*28]
+    pxor m15, m15
+    mov r7d, 128*29
+.main_zero_loop:
+    mova [cq+r7-128*1], m15
+    mova [cq+r7+128*0], m15
+    mova [cq+r7+128*1], m15
+    mova [cq+r7+128*2], m15
+    sub r7d, 128*4
+    jg .main_zero_loop
+    add cq, 32
+    call m(idct_8x8_internal_10bpc).main_rect2
+    call m(idct_8x16_internal_10bpc).main_evenhalf
+    call m(inv_txfm_add_dct_dct_32x16_10bpc).main_end
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+    mova [r4-32*4], m0
+    mova [r4-32*3], m1
+    mova [r4-32*2], m2
+    mova [r4-32*1], m3
+    mova [r4+32*0], m4
+    mova [r4+32*1], m5
+    mova [r4+32*2], m6
+    mova [r4+32*3], m7
+    mova m0, [r5+32*3]
+    mova m1, [r5+32*2]
+    mova m2, [r5+32*1]
+    mova m3, [r5+32*0]
+    mova m4, [r5-32*1]
+    mova m5, [r5-32*2]
+    mova m6, [r5-32*3]
+    mova m7, [r5-32*4]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).transpose
+    mova [r5-32*4], m0
+    mova [r5-32*3], m1
+    mova [r5-32*2], m2
+    mova [r5-32*1], m3
+    mova [r5+32*0], m4
+    mova [r5+32*1], m5
+    mova [r5+32*2], m6
+    mova [r5+32*3], m7
+    ret
+
+cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst, stride, c, eob
+    test eobd, eobd
+    jnz .normal
+    imul r6d, [cq], 181
+    mov [cq], eobd ; 0
+    or r3d, 16
+.dconly:
+    add r6d, 640
+    sar r6d, 10
+.dconly2:
+    vpbroadcastd m5, [dconly_10bpc]
+    imul r6d, 181
+    add r6d, 2176
+    sar r6d, 12
+    movd xm0, r6d
+    paddsw xm0, xm5
+    vpbroadcastw m0, xm0
+.dconly_loop:
+    paddsw m1, m0, [dstq+32*0]
+    paddsw m2, m0, [dstq+32*1]
+    paddsw m3, m0, [dstq+32*2]
+    paddsw m4, m0, [dstq+32*3]
+    REPX {psubusw x, m5}, m1, m2, m3, m4
+    mova [dstq+32*0], m1
+    mova [dstq+32*1], m2
+    mova [dstq+32*2], m3
+    mova [dstq+32*3], m4
+    add dstq, strideq
+    dec r3d
+    jg .dconly_loop
+    RET
+.normal:
+    PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob
+%undef cmp
+    vpbroadcastd m11, [pd_2048]
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    vpbroadcastd m14, [pd_2896]
+    lea r6, [rsp+32*4]
+    call .main
+    call .shift_transpose
+    cmp eobd, 36
+    jl .fast
+    call .main
+    call .shift_transpose
+    jmp .pass2
+.fast:
+    pxor m0, m0
+    mov r3d, 4
+.fast_loop:
+    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+    add r6, 32*8
+    dec r3d
+    jg .fast_loop
+.pass2:
+    lea r7, [r6-32*64]
+    lea r4, [r6-32*32]
+    lea r6, [pw_5+128]
+    mov r5, dstq
+.pass2_loop:
+    mova m0, [r7-32*4]
+    mova m1, [r7-32*3]
+    mova m2, [r7-32*2]
+    mova m3, [r7-32*1]
+    mova m4, [r7+32*0]
+    mova m5, [r7+32*1]
+    mova m6, [r7+32*2]
+    mova m7, [r7+32*3]
+    add r7, 32*32
+    mova m8, [r7-32*4]
+    mova m9, [r7-32*3]
+    mova m10, [r7-32*2]
+    mova m11, [r7-32*1]
+    mova m12, [r7+32*0]
+    mova m13, [r7+32*1]
+    mova m14, [r7+32*2]
+    mova m15, [r7+32*3]
+    sub r7, 32*24
+    mova [rsp], m15
+    call m(idct_16x16_internal_8bpc).main
+    mova m1, [rsp+32*1]
+    call m(inv_txfm_add_dct_dct_32x16_10bpc).write_16x16
+    add r5, 32
+    mov dstq, r5
+    cmp r7, r4
+    jl .pass2_loop
+    RET
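+; 64x16 first pass: four main_part1 calls build the odd 32 outputs of the
+; idct64, main_part2 finishes them, the _fast helpers supply the sparser
+; even half, and .main_end_loop folds the idct8/16/32/64 partials into all
+; 64 outputs for one 8-column slice.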
+ALIGN function_align
+.main:
+    lea r5, [idct64_mul_16bpc]
+    mova m0, [cq+64* 1]
+    mova m1, [cq+64*31]
+    mova m2, [cq+64*17]
+    mova m3, [cq+64*15]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+    mova m0, [cq+64* 7]
+    mova m1, [cq+64*25]
+    mova m2, [cq+64*23]
+    mova m3, [cq+64* 9]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+    mova m0, [cq+64* 5]
+    mova m1, [cq+64*27]
+    mova m2, [cq+64*21]
+    mova m3, [cq+64*11]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+    mova m0, [cq+64* 3]
+    mova m1, [cq+64*29]
+    mova m2, [cq+64*19]
+    mova m3, [cq+64*13]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+    mova m0, [cq+64* 2]
+    mova m1, [cq+64*14]
+    mova m2, [cq+64*18]
+    mova m3, [cq+64*30]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+    mova m0, [cq+64* 6]
+    mova m1, [cq+64*10]
+    mova m2, [cq+64*22]
+    mova m3, [cq+64*26]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+    mova m0, [cq+64* 4]
+    mova m1, [cq+64*12]
+    mova m2, [cq+64*20]
+    mova m3, [cq+64*28]
+    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+    mova m0, [cq+64* 0]
+    mova m1, [cq+64* 8]
+    mova m2, [cq+64*16]
+    mova m3, [cq+64*24]
+    pxor m15, m15
+    mov r7d, 64*30
+.main_zero_loop:
+    mova [cq+r7-64*2], m15
+    mova [cq+r7-64*1], m15
+    mova [cq+r7+64*0], m15
+    mova [cq+r7+64*1], m15
+    sub r7d, 64*4
+    jg .main_zero_loop
+.main_end:
+    psrld m15, m11, 10 ; pd_2
+.main_end2:
+    add cq, 32
+    pxor m4, m4
+    REPX {mova x, m4}, m5, m6, m7
+    call m(idct_8x8_internal_10bpc).main
+    add r6, 32*8
+    call m(idct_8x16_internal_10bpc).main_evenhalf
+    mova [r6+32*2], m1
+    mova [r6+32*1], m2
+    mova [r6+32*0], m3
+    mova [r6-32*1], m4
+    mova [r6-32*2], m5
+    mova [r6-32*3], m6
+    mova [r6-32*4], m7
+    jmp .main_end_loop_start
+.main_end_loop:
+    mova m0, [r6+32* 3] ; idct8 0 + n
+.main_end_loop_start:
+    mova m1, [r5+32* 4] ; idct16 15 - n
+    mova m2, [r5-32*12] ; idct32 16 + n
+    mova m3, [r6-32*13] ; idct32 31 - n
+    mova m4, [r6-32*29] ; idct64 63 - n
+    mova m5, [r5-32*28] ; idct64 48 + n
+    mova m6, [r6-32*45] ; idct64 47 - n
+    mova m7, [r5-32*44] ; idct64 32 + n
+    paddd m8, m0, m1 ; idct16 out0 + n
+    psubd m0, m1 ; idct16 out15 - n
+    REPX {pmaxsd x, m12}, m8, m0
+    REPX {pminsd x, m13}, m8, m0
+    paddd m1, m8, m3 ; idct32 out0 + n
+    psubd m8, m3 ; idct32 out31 - n
+    paddd m3, m0, m2 ; idct32 out15 - n
+    psubd m0, m2 ; idct32 out16 + n
+    REPX {pmaxsd x, m12}, m1, m8, m3, m0
+    REPX {pminsd x, m13}, m1, m3, m8, m0
+    REPX {paddd x, m15}, m1, m3, m0, m8
+    paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
+    psubd m1, m4 ; idct64 out63 - n (unshifted)
+    paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
+    psubd m3, m5 ; idct64 out48 + n (unshifted)
+    paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
+    psubd m0, m6 ; idct64 out47 - n (unshifted)
+    paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
+    psubd m8, m7 ; idct64 out32 + n (unshifted)
+    mova [r5-32*44], m2
+    mova [r6+32* 3], m1
+    mova [r6-32*45], m4
+    mova [r5+32* 4], m3
+    mova [r5-32*28], m5
+    mova [r6-32*13], m0
+    mova [r6-32*29], m6
+    mova [r5-32*12], m8
+    add r5, 32
+    sub r6, 32
+    cmp r5, r6
+    jl .main_end_loop
+    ret
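+; .shift_transpose drops the pass-1 fractional bits (psrad by the macro
+; argument), packs dwords to words, and transposes the slice via transpose3
+; so the second pass can read it row-wise.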
+.shift_transpose:
+%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift
+    sub r6, 32*48
+    mov r5, r6
+%%loop:
+    mova m0, [r6-32* 4]
+    mova m4, [r6+32* 4]
+    mova m1, [r6-32* 3]
+    mova m5, [r6+32* 5]
+    mova m2, [r6-32* 2]
+    mova m6, [r6+32* 6]
+    mova m3, [r6-32* 1]
+    mova m7, [r6+32* 7]
+    REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7
+    packssdw m0, m4
+    packssdw m1, m5
+    packssdw m2, m6
+    packssdw m3, m7
+    mova m4, [r6+32* 0]
+    mova m6, [r6+32* 8]
+    mova m5, [r6+32* 1]
+    mova m7, [r6+32* 9]
+    REPX {psrad x, %1}, m4, m6, m5, m7
+    packssdw m4, m6
+    packssdw m5, m7
+    mova m6, [r6+32* 2]
+    mova m8, [r6+32*10]
+    mova m7, [r6+32* 3]
+    mova m9, [r6+32*11]
+    REPX {psrad x, %1}, m6, m8, m7, m9
+    packssdw m6, m8
+    packssdw m7, m9
+    call m(idct_16x8_internal_10bpc).transpose3
+    mova [r5-32*4], m0
+    mova [r5-32*3], m1
+    mova [r5-32*2], m2
+    mova [r5-32*1], m3
+    mova [r5+32*0], m4
+    mova [r5+32*1], m5
+    mova [r5+32*2], m6
+    mova [r5+32*3], m7
+    add r6, 32*16
+    add r5, 32*8
+    cmp r5, r4
+    jl %%loop
+    mov r6, r4
+%endmacro
+    IDCT64_SHIFT_TRANSPOSE 2
+    ret
+
+cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst, stride, c, eob
+    test eobd, eobd
+    jz .dconly
+    PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob
+%undef cmp
+    vpbroadcastd m11, [pd_2048]
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    vpbroadcastd m14, [pd_2896]
+    lea r6, [rsp+32*7]
+    call .main
+    cmp eobd, 36
+    jl .fast
+    call .main
+    cmp eobd, 136
+    jl .fast
+    call .main
+    cmp eobd, 300
+    jl .fast
+    call .main
+    jmp .pass2
+.dconly:
+    imul r6d, [cq], 181
+    mov [cq], eobd ; 0
+    or r3d, 32
+    add r6d, 128
+    sar r6d, 8
+    imul r6d, 181
+    add r6d, 384
+    sar r6d, 9
+    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly2
+.fast:
+    pxor m0, m0
+    lea r4, [rsp+32*135]
+.fast_loop:
+    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+    add r6, 32*8
+    cmp r6, r4
+    jl .fast_loop
+.pass2:
+    lea r7, [r6-32*32]
+    lea r5, [r6+32*8]
+    lea r6, [pw_5+128]
+    imul r2, strideq, 19
+    lea r3, [strideq*3]
+    add r2, dstq
+.pass2_loop:
+    mova m0, [r7-32*99]
+    mova m1, [r7-32*97]
+    mova m2, [r7-32*95]
+    mova m3, [r7-32*93]
+    mova m4, [r7-32*67]
+    mova m5, [r7-32*65]
+    mova m6, [r7-32*63]
+    mova m7, [r7-32*61]
+    mova m8, [r7-32*35]
+    mova m9, [r7-32*33]
+    mova m10, [r7-32*31]
+    mova m11, [r7-32*29]
+    mova m12, [r7-32* 3]
+    mova m13, [r7-32* 1]
+    mova m14, [r7+32* 1]
+    mova m15, [r7+32* 3]
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+    mova m0, [r7-32*100]
+    mova m1, [r7-32*98]
+    mova m2, [r7-32*96]
+    mova m3, [r7-32*94]
+    mova m4, [r7-32*68]
+    mova m5, [r7-32*66]
+    mova m6, [r7-32*64]
+    mova m7, [r7-32*62]
+    mova m8, [r7-32*36]
+    mova m9, [r7-32*34]
+    mova m10, [r7-32*32]
+    mova m11, [r7-32*30]
+    mova m12, [r7-32* 4]
+    mova m13, [r7-32* 2]
+    mova m14, [r7+32* 0]
+    mova m15, [r7+32* 2]
+    add r7, 32*8
+    mova [rsp], m15
+    call m(idct_16x16_internal_8bpc).main
+    call m(inv_txfm_add_dct_dct_16x32_10bpc).pass2_end
+    sub dstq, r3
+    lea r2, [r2+r3+32]
+    add dstq, 32
+    cmp r7, r4
+    jl .pass2_loop
+    RET
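+; 64x32 first pass = the 64x16 pipeline with rect2-scaled inputs (pd_2896)
+; and a 1-bit instead of 2-bit downshift, hence IDCT64_SHIFT_TRANSPOSE 1.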
+ALIGN function_align
+.main:
+    lea r5, [idct64_mul_16bpc]
+    pmulld m0, m14, [cq+128* 1]
+    pmulld m1, m14, [cq+128*31]
+    pmulld m2, m14, [cq+128*17]
+    pmulld m3, m14, [cq+128*15]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+    pmulld m0, m14, [cq+128* 7]
+    pmulld m1, m14, [cq+128*25]
+    pmulld m2, m14, [cq+128*23]
+    pmulld m3, m14, [cq+128* 9]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+    pmulld m0, m14, [cq+128* 5]
+    pmulld m1, m14, [cq+128*27]
+    pmulld m2, m14, [cq+128*21]
+    pmulld m3, m14, [cq+128*11]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+    pmulld m0, m14, [cq+128* 3]
+    pmulld m1, m14, [cq+128*29]
+    pmulld m2, m14, [cq+128*19]
+    pmulld m3, m14, [cq+128*13]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1_rect2
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+    pmulld m0, m14, [cq+128* 2]
+    pmulld m1, m14, [cq+128*14]
+    pmulld m2, m14, [cq+128*18]
+    pmulld m3, m14, [cq+128*30]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast_rect2
+    pmulld m0, m14, [cq+128* 6]
+    pmulld m1, m14, [cq+128*10]
+    pmulld m2, m14, [cq+128*22]
+    pmulld m3, m14, [cq+128*26]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast_rect2
+    pmulld m0, m14, [cq+128* 4]
+    pmulld m1, m14, [cq+128*12]
+    pmulld m2, m14, [cq+128*20]
+    pmulld m3, m14, [cq+128*28]
+    call m(idct_8x16_internal_10bpc).main_oddhalf_fast_rect2
+    pmulld m0, m14, [cq+128* 0]
+    pmulld m1, m14, [cq+128* 8]
+    pmulld m2, m14, [cq+128*16]
+    pmulld m3, m14, [cq+128*24]
+    pxor m15, m15
+    mov r7d, 128*29
+.main_zero_loop:
+    mova [cq+r7-128*1], m15
+    mova [cq+r7+128*0], m15
+    mova [cq+r7+128*1], m15
+    mova [cq+r7+128*2], m15
+    sub r7d, 128*4
+    jg .main_zero_loop
+    psrld m15, m11, 11 ; pd_1
+    REPX {paddd x, m11}, m0, m1, m2, m3
+    REPX {psrad x, 12 }, m0, m1, m2, m3
+    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end2
+    IDCT64_SHIFT_TRANSPOSE 1
+    ret
+
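+; 64x64: pass 1 reuses the 64x16 .main + .shift_transpose pair; pass 2 runs
+; the same 64-point column construction as 32x64 over four 16-column strips.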
+cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst, stride, c, eob
+    test eobd, eobd
+    jz .dconly
+    PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob
+%undef cmp
+    vpbroadcastd m11, [pd_2048]
+    vpbroadcastd m12, [clip_18b_min]
+    vpbroadcastd m13, [clip_18b_max]
+    vpbroadcastd m14, [pd_2896]
+    lea r6, [rsp+32*7]
+    call .main
+    cmp eobd, 36
+    jl .fast
+    call .main
+    cmp eobd, 136
+    jl .fast
+    call .main
+    cmp eobd, 300
+    jl .fast
+    call .main
+    jmp .pass2
+.dconly:
+    imul r6d, [cq], 181
+    mov [cq], eobd ; 0
+    or r3d, 64
+    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).dconly
+.fast:
+    pxor m0, m0
+    lea r4, [rsp+32*135]
+.fast_loop:
+    REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
+    add r6, 32*8
+    cmp r6, r4
+    jl .fast_loop
+.pass2:
+    lea r10, [r6-32*32]
+    lea r6, [pw_5+128]
+    lea r8, [strideq*4]
+    lea r9, [strideq*5]
+    lea r3, [r9+strideq*1] ; stride*6
+    lea r7, [r9+strideq*2] ; stride*7
+.pass2_loop:
+    mova m0, [r10-32*100] ; in0
+    mova m1, [r10-32*96] ; in4
+    mova m2, [r10-32*68] ; in8
+    mova m3, [r10-32*64] ; in12
+    mova m4, [r10-32*36] ; in16
+    mova m5, [r10-32*32] ; in20
+    mova m6, [r10-32* 4] ; in24
+    mova m7, [r10+32* 0] ; in28
+    pxor m8, m8
+    REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
+    mova [rsp], m8
+    call m(idct_16x16_internal_8bpc).main
+    mova m1, [rsp+32*1]
+    mova [r4-32*4], m0
+    mova [r4-32*3], m1
+    mova [r4-32*2], m2
+    mova [r4-32*1], m3
+    mova [r4+32*0], m4
+    mova [r4+32*1], m5
+    mova [r4+32*2], m6
+    mova [r4+32*3], m7
+    add r4, 32*8
+    mova [r4-32*4], m8
+    mova [r4-32*3], m9
+    mova [r4-32*2], m10
+    mova [r4-32*1], m11
+    mova [r4+32*0], m12
+    mova [r4+32*1], m13
+    mova [r4+32*2], m14
+    mova [r4+32*3], m15
+    mova m0, [r10-32*98] ; in2
+    mova m1, [r10-32*94] ; in6
+    mova m2, [r10-32*66] ; in10
+    mova m3, [r10-32*62] ; in14
+    mova m4, [r10-32*34] ; in18
+    mova m5, [r10-32*30] ; in22
+    mova m6, [r10-32* 2] ; in26
+    mova m7, [r10+32* 2] ; in30
+    lea r5, [r4+32*16]
+    add r4, 32*8
+    call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+    mova m0, [r10-32*99] ; in1
+    mova m1, [r10+32* 3] ; in31
+    mova m2, [r10-32*35] ; in17
+    mova m3, [r10-32*61] ; in15
+    mova m4, [r10-32*67] ; in9
+    mova m5, [r10-32*29] ; in23
+    mova m6, [r10-32* 3] ; in25
+    mova m7, [r10-32*93] ; in7
+    lea r6, [idct64_mul - 8]
+    add r4, 32*16
+    add r5, 32*32
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    mova m0, [r10-32*95] ; in5
+    mova m1, [r10-32* 1] ; in27
+    mova m2, [r10-32*31] ; in21
+    mova m3, [r10-32*65] ; in11
+    mova m4, [r10-32*63] ; in13
+    mova m5, [r10-32*33] ; in19
+    mova m6, [r10+32* 1] ; in29
+    mova m7, [r10-32*97] ; in3
+    add r6, 8
+    add r4, 32*8
+    sub r5, 32*8
+    call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2_pass2
+    add r10, 32*8
+    sub dstq, r8
+    sub r4, 32*44
+    add dstq, 32
+    cmp r10, r4
+    jl .pass2_loop
+    RET
+ALIGN function_align
+.main:
+    lea r5, [idct64_mul_16bpc]
+    mova m0, [cq+128* 1]
+    mova m1, [cq+128*31]
+    mova m2, [cq+128*17]
+    mova m3, [cq+128*15]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+    mova m0, [cq+128* 7]
+    mova m1, [cq+128*25]
+    mova m2, [cq+128*23]
+    mova m3, [cq+128* 9]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+    mova m0, [cq+128* 5]
+    mova m1, [cq+128*27]
+    mova m2, [cq+128*21]
+    mova m3, [cq+128*11]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+    mova m0, [cq+128* 3]
+    mova m1, [cq+128*29]
+    mova m2, [cq+128*19]
+    mova m3, [cq+128*13]
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part1
+    call m(inv_txfm_add_dct_dct_16x64_10bpc).main_part2
+    mova m0, [cq+128* 2]
+    mova m1, [cq+128*14]
+    mova m2, [cq+128*18]
+    mova m3, [cq+128*30]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part1_fast
+    mova m0, [cq+128* 6]
+    mova m1, [cq+128*10]
+    mova m2, [cq+128*22]
+    mova m3, [cq+128*26]
+    call m(inv_txfm_add_dct_dct_8x32_10bpc).main_oddhalf_part2_fast
+    mova m0, [cq+128* 4]
+    mova m1, [cq+128*12]
+    mova m2, [cq+128*20]
+    mova m3, [cq+128*28]
+    call m(idct_8x16_internal_10bpc).main_oddhalf_fast
+    mova m0, [cq+128* 0]
+    mova m1, [cq+128* 8]
+    mova m2, [cq+128*16]
+    mova m3, [cq+128*24]
+    pxor m15, m15
+    mov r7d, 128*29
+.main_zero_loop:
+    mova [cq+r7-128*1], m15
+    mova [cq+r7+128*0], m15
+    mova [cq+r7+128*1], m15
+    mova [cq+r7+128*2], m15
+    sub r7d, 128*4
+    jg .main_zero_loop
+    call m(inv_txfm_add_dct_dct_64x16_10bpc).main_end
+    jmp m(inv_txfm_add_dct_dct_64x16_10bpc).shift_transpose
+
+%endif ; ARCH_X86_64
--
cgit v1.2.3