diff options
Diffstat (limited to 'third_party/dav1d/src/x86/itx_avx512.asm')
-rw-r--r-- | third_party/dav1d/src/x86/itx_avx512.asm | 7324 |
1 files changed, 7324 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/itx_avx512.asm b/third_party/dav1d/src/x86/itx_avx512.asm new file mode 100644 index 0000000000..a3d4ebdfb4 --- /dev/null +++ b/third_party/dav1d/src/x86/itx_avx512.asm @@ -0,0 +1,7324 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "config.asm" +%include "ext/x86/x86inc.asm" + +%if ARCH_X86_64 + +SECTION_RODATA 64 +const \ +int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 + db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 + db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 + db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 +int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51 + db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59 + db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55 + db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63 +int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39 + db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47 + db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55 + db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63 +dup16_perm: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7 + db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15 + db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23 + db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31 +idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23 + db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55 + db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31 + db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63 +idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9 + db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17 + db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51 + db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35 +idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35 + db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21 + db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51 + db 54, 55, 10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37 +end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 16, 40, 17, 56, 18, 44, 19, 60 + db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61 + db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63 + db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62 + +; packed 4-bit qword shuffle indices +permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262 + dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373 + dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb + dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea +permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604 + dq 0xc824352d56128751, 0xd906171e74301e15 + dq 0x6271604b03472d62, 0x735342782165b426 + dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37 +permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486 + dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597 + dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e + dq 0x5115049dd9045b79, 0x733726bffb263d1f +permD: dq 0x0cda098800041504, 0x0edb09b2028c3726 + dq 0x0f11fa9c01150415, 0x0988f326039d2637 + dq 0x05640f1108269d8c, 0x05290edb0aaebfae + dq 0x0005000509378c9d, 0xffffffff0bbfaebf + +pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11 +gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13 +gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10 +gather8d: dd 0, 3, 1, 2, 8, 11, 9, 10 + +int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7 +int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15 +int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7 +deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 +int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0 + +pb_32: times 4 db 32 +pw_2048: times 2 dw 2048 +pw_4096: times 2 dw 4096 +pw_8192: times 2 dw 8192 +pw_16384: times 2 dw 16384 +pw_1697x16: times 2 dw 1697*16 +pw_1697x8: times 2 dw 1697*8 +pw_2896x8: times 2 dw 2896*8 +pd_2048: dd 2048 + +%define pw_5 (permD+52) +%define pd_m1 (permD+60) +%define pw_3803_1321 (permD+44) +%define pw_2482_3803 (permD+12) +%define pw_2440_3290 (permD+ 4) +%define pw_m3290_2440 (permD+28) +%define pw_3857_1380 (permD+36) +%define pw_m1380_3857 (permD+20) + +pw_8192_m8192: dw 8192, -8192 +pw_m8192_8192: dw -8192, 8192 +pw_16384_m16384: dw 16384, -16384 +pw_m16384_16384: dw -16384, 16384 + +pw_m1321_2482: dw -1321, 2482 +pw_m3344_3344: dw -3344, 3344 +pw_2482_3344: dw 2482, 3344 +pw_m3803_3344: dw -3803, 3344 +pd_3344: dd 3344 +pw_m1321_m3344: dw -1321, -3344 +pw_2896_m2896: dw 2896, -2896 + +pw_1567_m3784: dw 1567, -3784 +pw_3784_m1567: dw 3784, -1567 +pw_4017_m799: dw 4017, -799 +pw_2276_m3406: dw 2276, -3406 +pw_m799_m4017: dw -799, -4017 +pw_m3406_m2276: dw -3406, -2276 + +%macro COEF_PAIR 2-3 0 +pw_%1_%2: dw %1, %2 +pw_m%2_%1: dw -%2, %1 +%if %3 +pw_m%1_m%2: dw -%1, -%2 +%endif +%endmacro + +COEF_PAIR 2896, 2896 +COEF_PAIR 1567, 3784, 1 +COEF_PAIR 3784, 1567 +COEF_PAIR 201, 4091 +COEF_PAIR 995, 3973 +COEF_PAIR 1751, 3703 +COEF_PAIR 3035, 2751 +COEF_PAIR 3513, 2106 +COEF_PAIR 4052, 601 +COEF_PAIR 3166, 2598, 1 +COEF_PAIR 3920, 1189, 1 +COEF_PAIR 2276, 3406 +COEF_PAIR 4017, 799 + +%macro COEF_X8 1-* +%rep %0 + dw %1*8, %1*8 + %rotate 1 +%endrep +%endmacro + +pw_m2276x8: COEF_X8 -2276 +pw_3406x8: COEF_X8 3406 +pw_4017x8: COEF_X8 4017 +pw_799x8: COEF_X8 799 +pw_3784x8: COEF_X8 3784 +pw_1567x8: COEF_X8 1567 + +pw_4076x8: COEF_X8 4076 +pw_401x8: COEF_X8 401 +pw_m2598x8: COEF_X8 -2598 +pw_3166x8: COEF_X8 3166 +pw_3612x8: COEF_X8 3612 +pw_1931x8: COEF_X8 1931 +pw_m1189x8: COEF_X8 -1189 +pw_3920x8: COEF_X8 3920 + +pw_4091x8: COEF_X8 4091 +pw_201x8: COEF_X8 201 +pw_m2751x8: COEF_X8 -2751 +pw_3035x8: COEF_X8 3035 +pw_3703x8: COEF_X8 3703 +pw_1751x8: COEF_X8 1751 +pw_m1380x8: COEF_X8 -1380 +pw_3857x8: COEF_X8 3857 +pw_3973x8: COEF_X8 3973 +pw_995x8: COEF_X8 995 +pw_m2106x8: COEF_X8 -2106 +pw_3513x8: COEF_X8 3513 +pw_3290x8: COEF_X8 3290 +pw_2440x8: COEF_X8 2440 +pw_m601x8: COEF_X8 -601 +pw_4052x8: COEF_X8 4052 + +pw_401_4076x8: dw 401*8, 4076*8 +pw_m2598_3166x8: dw -2598*8, 3166*8 +pw_1931_3612x8: dw 1931*8, 3612*8 +pw_m1189_3920x8: dw -1189*8, 3920*8 +pw_799_4017x8: dw 799*8, 4017*8 +pw_m2276_3406x8: dw -2276*8, 3406*8 + +pw_201_4091x8: dw 201*8, 4091*8 +pw_m601_4052x8: dw -601*8, 4052*8 +pw_995_3973x8: dw 995*8, 3973*8 +pw_m1380_3857x8: dw -1380*8, 3857*8 +pw_1751_3703x8: dw 1751*8, 3703*8 +pw_m2106_3513x8: dw -2106*8, 3513*8 +pw_2440_3290x8: dw 2440*8, 3290*8 +pw_m2751_3035x8: dw -2751*8, 3035*8 + +pw_101_4095x8: dw 101*8, 4095*8 +pw_m2824_2967x8: dw -2824*8, 2967*8 +pw_1660_3745x8: dw 1660*8, 3745*8 +pw_m1474_3822x8: dw -1474*8, 3822*8 +pw_897_3996x8: dw 897*8, 3996*8 +pw_m2191_3461x8: dw -2191*8, 3461*8 +pw_2359_3349x8: dw 2359*8, 3349*8 +pw_m700_4036x8: dw -700*8, 4036*8 +pw_501_4065x8: dw 501*8, 4065*8 +pw_m2520_3229x8: dw -2520*8, 3229*8 +pw_2019_3564x8: dw 2019*8, 3564*8 +pw_m1092_3948x8: dw -1092*8, 3948*8 +pw_1285_3889x8: dw 1285*8, 3889*8 +pw_m1842_3659x8: dw -1842*8, 3659*8 +pw_2675_3102x8: dw 2675*8, 3102*8 +pw_m301_4085x8: dw -301*8, 4085*8 + +idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474 +COEF_PAIR 401, 4076, 1 +COEF_PAIR 799, 4017 + COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996 +dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017 + COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092 +COEF_PAIR 1931, 3612, 1 +COEF_PAIR 3406, 2276 + COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889 +dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276 + +SECTION .text + +%define o_base int8_permA+64*18 +%define o(x) (r5 - (o_base) + (x)) +%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) + +; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack, +; 16 = special_mul1, 32 = special_mul2 +%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags + mova m%2, m%4 +%if %7 & 16 + vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd} + mova m%3, m%4 +%if %7 & 32 + vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} +%else + vpdpwssd m%3, m%1, m%6 +%endif +%elif %7 & 32 + vpdpwssd m%2, m%1, m%5 + mova m%3, m%4 + vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd} +%elif %6 < 32 + vpdpwssd m%2, m%1, m%5 + mova m%3, m%4 + vpdpwssd m%3, m%1, m%6 +%elif %7 & 1 + vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd} + mova m%3, m%4 + vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd} +%else + vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd} + mova m%3, m%4 + vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd} +%endif +%if %7 & 2 + psrld m%2, 12 + pslld m%3, 4 + vpshrdd m%1, m%3, m%2, 16 +%elif %7 & 4 + ; compared to using shifts (as above) this has better throughput, + ; but worse latency and requires setting up the opmask/index + ; registers, so only use this method for the larger transforms + pslld m%1, m%2, 4 + vpmultishiftqb m%1{k7}, m13, m%3 +%else + psrad m%2, 12 + psrad m%3, 12 +%if %7 & 8 == 0 + packssdw m%1, m%3, m%2 +%endif +%endif +%endmacro + +; flags: same as ITX_MUL2X_PACK +%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags +%if %11 & 1 + vpbroadcastd m%4, [o(pw_%9_%10)] + vpbroadcastd m%4{k1}, [o(pw_%7_%8)] + vpbroadcastd m%5, [o(pw_m%10_%9)] + vpbroadcastd m%5{k1}, [o(pw_m%8_%7)] +%else + vpbroadcastd m%4, [o(pw_m%10_%9)] + vpbroadcastd m%4{k1}, [o(pw_m%8_%7)] + vpbroadcastd m%5, [o(pw_%9_%10)] + vpbroadcastd m%5{k1}, [o(pw_%7_%8)] +%endif + ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11 +%endmacro + +; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12 +; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12 +%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2 + punpcklwd m%3, m%2, m%1 + punpckhwd m%2, m%1 +%if %7 < 32 + mova m%1, m%5 + vpdpwssd m%1, m%3, m%7 + mova m%4, m%5 + vpdpwssd m%4, m%2, m%7 +%else + mova m%1, m%5 + vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd} + mova m%4, m%5 + vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd} +%endif + psrad m%1, 12 + psrad m%4, 12 + packssdw m%1, m%4 + mova m%4, m%5 +%if %7 < 32 + vpdpwssd m%4, m%2, m%6 + mova m%2, m%5 + vpdpwssd m%2, m%3, m%6 +%else + vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd} + mova m%2, m%5 + vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd} +%endif + psrad m%4, 12 + psrad m%2, 12 +%if %0 == 8 + packssdw m%8, m%2, m%4 +%else + packssdw m%2, m%4 +%endif +%endmacro + +%macro WRAP_XMM 1+ + %xdefine %%reset RESET_MM_PERMUTATION + INIT_XMM cpuname + DEFINE_MMREGS xmm + AVX512_MM_PERMUTATION + %1 + %%reset +%endmacro + +%macro WRAP_YMM 1+ + INIT_YMM cpuname + %1 + INIT_ZMM cpuname +%endmacro + +%macro ITX4_END 4-5 2048 ; row[1-4], rnd +%if %5 + vpbroadcastd m2, [o(pw_%5)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +%endif + lea r2, [dstq+strideq*2] +%assign %%i 1 +%rep 4 + %if %1 & 2 + CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1) + %else + CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1) + %endif + %assign %%i %%i + 1 + %rotate 1 +%endrep + movd m2, [%%row_adr1] + pinsrd m2, [%%row_adr2], 1 + movd m3, [%%row_adr3] + pinsrd m3, [%%row_adr4], 1 + pmovzxbw m2, m2 + pmovzxbw m3, m3 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + movd [%%row_adr1], m0 + pextrd [%%row_adr2], m0, 1 + pextrd [%%row_adr3], m0, 2 + pextrd [%%row_adr4], m0, 3 + ret +%endmacro + +%macro INV_TXFM_FN 3 ; type1, type2, size +cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base + %define %%p1 m(i%1_%3_internal_8bpc) + lea baseq, [o_base] + ; Jump to the 1st txfm function if we're not taking the fast path, which + ; in turn performs an indirect jump to the 2nd txfm function. + lea tx2q, [m(i%2_%3_internal_8bpc).pass2] +%ifidn %1_%2, dct_dct + test eobd, eobd + jnz %%p1 +%else + ; jump to the 1st txfm function unless it's located directly after this + times ((%%end - %%p1) >> 31) & 1 jmp %%p1 +ALIGN function_align +%%end: +%endif +%endmacro + +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4 +%ifidn %1_%2, dct_dct + vpbroadcastw m0, [cq] + vpbroadcastd m1, [o(pw_2896x8)] + pmulhrsw m0, m1 + mov [cq], eobd + pmulhrsw m0, m1 + mova m1, m0 + jmp m(iadst_4x4_internal_8bpc).end2 +%endif +%endmacro + +%macro IDCT4_1D_PACKED 0 + vpbroadcastd m4, [o(pd_2048)] + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784 + ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896 + paddsw m0, m1, m2 ; out0 out1 + psubsw m1, m2 ; out3 out2 +%endmacro + +%macro IADST4_1D_PACKED 0 + punpcklwd m4, m1, m0 ; in2 in0 + punpckhwd m5, m1, m0 ; in3 in1 +.main2: + vpbroadcastd m3, [o(pd_2048)] + mova m0, m3 + vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd} + mova m2, m3 + vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd} + mova m1, m3 + vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd} + vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd} + vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd} + vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd} + vpdpwssd m1, m5, [o(pd_3344)] {bcstd} + vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd} + REPX {psrad x, 12}, m0, m2, m1, m3 + packssdw m0, m2 ; out0 out1 + packssdw m1, m3 ; out2 out3 +%endmacro + +INIT_XMM avx512icl +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity + +cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + IDCT4_1D_PACKED + mova m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + pxor ymm16, ymm16 + mova [cq], ymm16 + ITX4_END 0, 1, 3, 2 + +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst +INV_TXFM_4X4_FN adst, identity + +cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call .main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + pxor ymm16, ymm16 + mova [cq], ymm16 +.end2: + ITX4_END 0, 1, 2, 3 +ALIGN function_align +.main: + IADST4_1D_PACKED + ret + +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst +INV_TXFM_4X4_FN flipadst, identity + +cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + call m(iadst_4x4_internal_8bpc).main + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call m(iadst_4x4_internal_8bpc).main +.end: + pxor ymm16, ymm16 + mova [cq], ymm16 +.end2: + ITX4_END 3, 2, 1, 0 + +INV_TXFM_4X4_FN identity, dct +INV_TXFM_4X4_FN identity, adst +INV_TXFM_4X4_FN identity, flipadst +INV_TXFM_4X4_FN identity, identity + +cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+16*0] + mova m1, [cq+16*1] + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x4_internal_8bpc).end + +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8 +%ifidn %1_%2, dct_dct + movd xmm1, [o(pw_2896x8)] + pmulhrsw xmm0, xmm1, [cq] + movd xmm2, [o(pw_2048)] + pmulhrsw xmm0, xmm1 + pmulhrsw xmm0, xmm1 + pmulhrsw xmm0, xmm2 + vpbroadcastw ym0, xmm0 + mova ym1, ym0 + jmp m(iadst_4x8_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT8_1D_PACKED 0 + punpckhwd m5, m3, m0 ; in7 in1 + punpckhwd m4, m1, m2 ; in3 in5 + punpcklwd m3, m1 ; in6 in2 + punpcklwd m2, m0 ; in4 in0 +.main2: + vpbroadcastd m6, [o(pd_2048)] + ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a + ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a + ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2 + psubsw m0, m5, m4 ; t5a t6a (interleaved) + paddsw m4, m5 ; t4 t7 (interleaved) + ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1 + ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5 +%if mmsize > 16 + vbroadcasti32x4 m1, [o(deint_shuf)] + pshufb m4, m1 +%else + pshufb m4, [o(deint_shuf)] +%endif + psubsw m1, m2, m3 ; tmp3 tmp2 + paddsw m3, m2 ; tmp0 tmp1 + punpckhqdq m2, m4, m0 ; t7 t6 + punpcklqdq m4, m0 ; t4 t5 + paddsw m0, m3, m2 ; out0 out1 + psubsw m3, m2 ; out7 out6 + psubsw m2, m1, m4 ; out4 out5 + paddsw m1, m4 ; out3 out2 +%endmacro + +%macro IADST8_1D_PACKED 1 ; pass + vpbroadcastd m6, [o(pd_2048)] +%if %1 == 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a + psubsw m4, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a +%if mmsize > 16 + vbroadcasti32x4 m2, [o(deint_shuf)] +%else + mova m2, [o(deint_shuf)] +%endif + vprord m1, 16 + psubsw m3, m0, m1 ; t3 t2 + paddsw m0, m1 ; -out7 out0 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + pshufb m0, m2 + pshufb m4, m2 + mova m2, m6 + vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd} + mova m5, m6 + vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd} + psrad m2, 12 + psrad m5, 12 + packssdw m2, m5 ; out4 -out5 + mova m5, m6 + vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd} + mova m3, m6 + vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd} + psrad m5, 12 + psrad m3, 12 + packssdw m1, m3, m5 ; out2 -out3 +%else + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a + psubsw m4, m0, m2 ; t4 t5 + paddsw m0, m2 ; t0 t1 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + shufps m2, m5, m4, q1032 + punpckhwd m4, m2 + punpcklwd m5, m2 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a + psubsw m2, m0, m1 ; t2 t3 + paddsw m0, m1 ; out0 -out7 + psubsw m1, m4, m5 ; t6 t7 + paddsw m4, m5 ; -out1 out6 + vpbroadcastd m5, [o(pw_2896x8)] + punpckhqdq m3, m2, m1 ; t3 t7 + punpcklqdq m2, m1 ; t2 t6 + paddsw m1, m2, m3 ; t2+t3 t6+t7 + psubsw m2, m3 ; t2-t3 t6-t7 + punpckhqdq m3, m4, m0 ; out6 -out7 + punpcklqdq m0, m4 ; out0 -out1 + pmulhrsw m2, m5 ; out4 -out5 + pshufd m1, m1, q1032 + pmulhrsw m1, m5 ; out2 -out3 +%endif +%endmacro + +INIT_YMM avx512icl +INV_TXFM_4X8_FN dct, dct +INV_TXFM_4X8_FN dct, identity +INV_TXFM_4X8_FN dct, adst +INV_TXFM_4X8_FN dct, flipadst + +cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + IDCT4_1D_PACKED + vbroadcasti32x4 m2, [o(deint_shuf)] + shufps m3, m0, m1, q1331 + shufps m0, m0, m1, q0220 + pshufb m0, m2 + pshufb m1, m3, m2 + jmp tx2q +.pass2: + vextracti32x4 xm2, m0, 1 + vextracti32x4 xm3, m1, 1 + call .main + vpbroadcastd m4, [o(pw_2048)] + vinserti32x4 m0, m0, xm2, 1 + vinserti32x4 m1, m1, xm3, 1 + pshufd m1, m1, q1032 + jmp m(iadst_4x8_internal_8bpc).end2 +ALIGN function_align +.main: + WRAP_XMM IDCT8_1D_PACKED + ret + +INV_TXFM_4X8_FN adst, dct +INV_TXFM_4X8_FN adst, adst +INV_TXFM_4X8_FN adst, flipadst +INV_TXFM_4X8_FN adst, identity + +cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpckhwd m3, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + vextracti32x4 xm2, m0, 1 + vextracti32x4 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call .main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vinserti32x4 m0, xm2, 1 + vinserti32x4 m1, xm3, 1 + pxor m5, m5 + psubw m5, m4 +.end: + punpcklqdq m4, m5 +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 +.end3: + vpbroadcastd m3, strided + pmulld m5, m3, [o(pd_0to15)] + kxnorb k1, k1, k1 + kmovb k2, k1 + vpgatherdd m3{k1}, [dstq+m5] + pxor m4, m4 + mova [cq], zmm20 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpscatterdd [dstq+m5]{k2}, m0 + RET +ALIGN function_align +.main_pass1: + punpckhwd xm0, xm4, xm3 ; 0 7 + punpckhwd xm1, xm5, xm2 ; 2 5 + punpcklwd xm2, xm5 ; 4 3 + punpcklwd xm3, xm4 ; 6 1 + WRAP_XMM IADST8_1D_PACKED 1 + punpcklqdq xm3, xm4, xm0 ; out6 -out7 + punpckhqdq xm0, xm4 ; out0 -out1 + ret +ALIGN function_align +.main_pass2: + WRAP_XMM IADST8_1D_PACKED 2 + ret + +INV_TXFM_4X8_FN flipadst, dct +INV_TXFM_4X8_FN flipadst, adst +INV_TXFM_4X8_FN flipadst, flipadst +INV_TXFM_4X8_FN flipadst, identity + +cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 + vpermq m1, [cq+32*1], q3120 + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 + call m(iadst_8x4_internal_8bpc).main + punpcklwd m3, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m3 + punpckhwd m1, m3 + jmp tx2q +.pass2: + vextracti32x4 xm2, m0, 1 + vextracti32x4 xm3, m1, 1 + pshufd xm4, xm0, q1032 + pshufd xm5, xm1, q1032 + call m(iadst_4x8_internal_8bpc).main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vinserti32x4 m3, xm1, 1 + vinserti32x4 m2, xm0, 1 + pxor m4, m4 + psubw m4, m5 + pshufd m0, m3, q1032 + pshufd m1, m2, q1032 + jmp m(iadst_4x8_internal_8bpc).end + +INIT_ZMM avx512icl +INV_TXFM_4X8_FN identity, dct +INV_TXFM_4X8_FN identity, adst +INV_TXFM_4X8_FN identity, flipadst +INV_TXFM_4X8_FN identity, identity + +cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd m0, [o(pw_2896x8)] + pmulhrsw m0, [cq] + mova m1, [o(int8_permB)] + vpbroadcastd m2, [o(pw_1697x8)] + vpermb m0, m1, m0 + pmulhrsw m2, m0 + paddsw m0, m2 + vextracti32x8 ym1, m0, 1 + jmp tx2q +.pass2: + vpbroadcastd ym4, [o(pw_4096)] + jmp m(iadst_4x8_internal_8bpc).end2 + +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + imul r6d, 181 + add r6d, 128+2048 + sar r6d, 8+4 + vpbroadcastw m0, r6d + mova m1, m0 + jmp m(iadst_4x16_internal_8bpc).end3 +%endif +%endmacro + +%macro IDCT16_1D_PACKED 0 + punpckhwd m8, m7, m0 ; dct16 in15 in1 + punpcklwd m9, m4, m0 ; dct4 in2 in0 + punpckhwd m0, m3, m4 ; dct16 in7 in9 + punpcklwd m7, m1 ; dct8 in7 in1 + punpckhwd m1, m6 ; dct16 in3 in13 + punpcklwd m3, m5 ; dct8 in3 in5 + punpckhwd m5, m2 ; dct16 in11 in5 + punpcklwd m6, m2 ; dct4 in3 in1 +cglobal_label .main2 + vpbroadcastd m10, [o(pd_2048)] +.main3: + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 ; 0x33... + ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a + ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a + ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a + ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a + ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a + ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a +.main4: + psubsw m2, m8, m0 ; t9 t14 + paddsw m8, m0 ; t8 t15 + psubsw m4, m1, m5 ; t10 t13 + paddsw m1, m5 ; t11 t12 + ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2 + psubsw m0, m8, m1 ; t11a t12a + paddsw m8, m1 ; t8a t15a + psubsw m1, m7, m3 ; t5a t6a + paddsw m7, m3 ; t4 t7 +.main5: + ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a + ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a +%if mmsize > 16 + vbroadcasti32x4 m5, [o(deint_shuf)] +%else + mova m5, [o(deint_shuf)] +%endif + vpbroadcastd m11, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + paddsw m3, m2, m4 ; t9 t14 + psubsw m2, m4 ; t10 t13 + pshufb m8, m5 + pshufb m7, m5 + pshufb m3, m5 + ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1 + ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6 + ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12 + ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a + punpckhqdq m2, m7, m1 ; t7 t6 + punpcklqdq m7, m1 ; t4 t5 + psubsw m1, m9, m6 ; dct4 out3 out2 + paddsw m9, m6 ; dct4 out0 out1 + packssdw m5, m11 ; t12 t13a + packssdw m4, m0 ; t11 t10a + punpckhqdq m0, m8, m3 ; t15a t14 + punpcklqdq m8, m3 ; t8a t9 + psubsw m3, m9, m2 ; dct8 out7 out6 + paddsw m9, m2 ; dct8 out0 out1 + psubsw m2, m1, m7 ; dct8 out4 out5 + paddsw m1, m7 ; dct8 out3 out2 + psubsw m7, m9, m0 ; out15 out14 + paddsw m0, m9 ; out0 out1 + psubsw m6, m1, m5 ; out12 out13 + paddsw m1, m5 ; out3 out2 + psubsw m5, m2, m4 ; out11 out10 + paddsw m2, m4 ; out4 out5 + psubsw m4, m3, m8 ; out8 out9 + paddsw m3, m8 ; out7 out6 +%endmacro + +INV_TXFM_4X16_FN dct, dct +INV_TXFM_4X16_FN dct, identity +INV_TXFM_4X16_FN dct, adst +INV_TXFM_4X16_FN dct, flipadst + +cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova ym1, [cq+32*2] + vinserti32x8 m1, [cq+32*0], 1 + mova m0, [o(int16_perm)] + mova ym2, [cq+32*3] + vinserti32x8 m2, [cq+32*1], 1 + vpbroadcastd m4, [o(pd_2048)] + vpermb m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3 + vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3 + ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2 + ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2 + vpbroadcastd m4, [o(pw_16384)] + psubsw m3, m1, m2 + paddsw m1, m2 ; out0 out1 + vprord m3, 16 ; out2 out3 + punpckldq m0, m1, m3 + punpckhdq m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + jmp tx2q +.pass2: + vextracti32x4 xm2, ym0, 1 + vextracti32x4 xm3, ym1, 1 + vextracti32x4 xm4, m0, 2 + vextracti32x4 xm5, m1, 2 + vextracti32x4 xm6, m0, 3 + vextracti32x4 xm7, m1, 3 + call .main + vinserti32x4 ym0, xm2, 1 + vinserti32x4 ym1, xm3, 1 + vinserti32x4 ym4, xm6, 1 + vinserti32x4 ym5, xm7, 1 + vinserti32x8 m0, ym4, 1 + vinserti32x8 m1, ym5, 1 + vpbroadcastd m5, [o(pw_2048)] + pshufd m1, m1, q1032 + jmp m(iadst_4x16_internal_8bpc).end2 +ALIGN function_align +.main: + WRAP_XMM IDCT16_1D_PACKED + ret + +INV_TXFM_4X16_FN adst, dct +INV_TXFM_4X16_FN adst, adst +INV_TXFM_4X16_FN adst, flipadst +INV_TXFM_4X16_FN adst, identity + +cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m1, [o(permB)] + vpermq m0, m1, [cq+64*0] + vpermq m1, m1, [cq+64*1] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m3, [o(pw_16384)] + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + pmulhrsw m2, m3 + pmulhrsw m0, m3 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + call .main + vpbroadcastd m5, [o(pw_2048)] + psrlq m10, 4 + psubw m6, m8, m5 +.end: + vpbroadcastd m7, [o(pw_2896x8)] + paddsw ym1, ym2, ym4 + psubsw ym2, ym4 + vinserti32x8 m1, ym2, 1 + pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10 + psrlq m0, m10, 4 + vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d + vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f + punpcklqdq m5, m6 +.end2: + pmulhrsw m0, m5 + pmulhrsw m1, m5 +.end3: + vpbroadcastd m3, strided + pmulld m5, m3, [o(pd_0to15)] + kxnorw k1, k1, k1 + kmovw k2, k1 + vpgatherdd m3{k1}, [dstq+m5] + pxor m4, m4 + mova [cq+64*0], m4 + mova [cq+64*1], m4 + punpcklbw m2, m3, m4 + punpckhbw m3, m4 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpscatterdd [dstq+m5]{k2}, m0 + RET +ALIGN function_align +.main: + movu m3, [o(permB+1)] + psrlq m10, m3, 4 +.main2: + vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10 + vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5 + vpbroadcastd m9, [o(pd_2048)] + vpbroadcastq ym13, [o(int_mshift)] + kxnorb k1, k1, k1 + punpckhwd m4, m3, m0 ; in12 in3 in14 in1 + punpcklwd m0, m3 ; in0 in15 in2 in13 + kshiftrb k1, k1, 4 + vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5 + vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9 +INIT_YMM avx512icl + vpcmpub k7, m13, m9, 6 ; 0x33... + pxor m8, m8 + ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5 + ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5 + ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5 + ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5 + psubsw m2, m0, m3 ; t9a t8a t11a t10a + paddsw m0, m3 ; t1a t0a t3a t2a + psubsw m3, m1, m4 ; t13a t12a t15a t14a + paddsw m4, m1 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5 + psubw m7, m8, m7 + ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4 + vpbroadcastd m6, [o(pw_3784_m1567)] + vpbroadcastd m6{k1}, [o(pw_m3784_1567)] + psubsw m1, m0, m4 ; t5 t4 t7 t6 + paddsw m0, m4 ; t1 t0 t3 t2 + psubsw m4, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a + ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14 + vbroadcasti32x4 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a + vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a + vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14 + vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13 + pshufd m2, m2, q1032 ; t6a t7a t14 t15 + psubsw m4, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m1, m2 ; out12 -out3 -out13 out2 + psubsw m1, m2 ; t7 t6 t15a t14a + punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a + punpcklqdq m4, m1 ; t3a t7 t11 t15a +INIT_ZMM avx512icl + vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1 + ret + +INV_TXFM_4X16_FN flipadst, dct +INV_TXFM_4X16_FN flipadst, adst +INV_TXFM_4X16_FN flipadst, flipadst +INV_TXFM_4X16_FN flipadst, identity + +cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m1, [o(permB)] + vpermq m0, m1, [cq+64*0] + vpermq m1, m1, [cq+64*1] + call m(iadst_16x4_internal_8bpc).main + vpbroadcastd m3, [o(pw_16384)] + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + pmulhrsw m2, m3 + pmulhrsw m1, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + jmp tx2q +.pass2: + call m(iadst_4x16_internal_8bpc).main + vpbroadcastd m6, [o(pw_2048)] + psrlq m10, 12 + psubw m5, m8, m6 + jmp m(iadst_4x16_internal_8bpc).end + +INV_TXFM_4X16_FN identity, dct +INV_TXFM_4X16_FN identity, adst +INV_TXFM_4X16_FN identity, flipadst +INV_TXFM_4X16_FN identity, identity + +cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m2, [o(int16_perm)] + vpermb m1, m2, [cq+64*0] + vpermb m2, m2, [cq+64*1] + vpbroadcastd m4, [o(pw_1697x8)] + vpbroadcastd m0, [o(pd_m1)] + pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is + vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal + pmulhrsw m4, m2 ; it still works, but if the input is -1 the + vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes + vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless + vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here. + punpckldq m0, m1, m2 + punpckhdq m1, m2 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x16)] + vpbroadcastd m5, [o(pw_2048)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m0 + paddsw m1, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_4x16_internal_8bpc).end2 + +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3] + movq xm%3, [dstq ] + movhps xm%3, [dstq+%5] + movq xm%4, [dstq+%6] + movhps xm%4, [dstq+%7] + pmovzxbw m%3, xm%3 + pmovzxbw m%4, xm%4 +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vextracti32x4 xm%4, m%3, 1 + movq [dstq ], xm%3 + movhps [dstq+%6], xm%3 + movq [dstq+%5], xm%4 + movhps [dstq+%7], xm%4 +%endmacro + +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4 +%ifidn %1_%2, dct_dct + movd xm1, [o(pw_2896x8)] + pmulhrsw xm0, xm1, [cq] + movd xm2, [o(pw_2048)] + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm1 + pmulhrsw xm0, xm2 + vpbroadcastw m0, xm0 + mova m1, m0 + jmp m(iadst_8x4_internal_8bpc).end3 +%endif +%endmacro + +INIT_YMM avx512icl +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity + +cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd xm3, [o(pw_2896x8)] + pmulhrsw xm0, xm3, [cq+16*0] + pmulhrsw xm1, xm3, [cq+16*1] + pmulhrsw xm2, xm3, [cq+16*2] + pmulhrsw xm3, [cq+16*3] + call m(idct_4x8_internal_8bpc).main + vbroadcasti32x4 m4, [o(deint_shuf)] + vinserti32x4 m3, m1, xm3, 1 + vinserti32x4 m1, m0, xm2, 1 + shufps m0, m1, m3, q0220 + shufps m1, m3, q1331 + pshufb m0, m4 + pshufb m1, m4 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN adst, dct +INV_TXFM_8X4_FN adst, adst +INV_TXFM_8X4_FN adst, flipadst +INV_TXFM_8X4_FN adst, identity + +cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti32x4 m0, xm2, 1 + vinserti32x4 m1, xm3, 1 + pxor m3, m3 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + psubsw m3, m2 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call .main +.end: + vpermq m0, m0, q3120 + vpermq m1, m1, q3120 +.end2: + vpbroadcastd m2, [o(pw_2048)] + pmulhrsw m0, m2 + pmulhrsw m1, m2 +.end3: + pxor m2, m2 + mova [cq], zmm18 + lea r6, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + RET +ALIGN function_align +.main: + IADST4_1D_PACKED + ret + +INV_TXFM_8X4_FN flipadst, dct +INV_TXFM_8X4_FN flipadst, adst +INV_TXFM_8X4_FN flipadst, flipadst +INV_TXFM_8X4_FN flipadst, identity + +cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd xm0, [o(pw_2896x8)] + pshufd xm4, [cq+16*0], q1032 + pmulhrsw xm3, xm0, [cq+16*3] + pshufd xm5, [cq+16*1], q1032 + pmulhrsw xm2, xm0, [cq+16*2] + pmulhrsw xm4, xm0 + pmulhrsw xm5, xm0 + call m(iadst_4x8_internal_8bpc).main_pass1 + vinserti32x4 m3, m3, xm1, 1 + vinserti32x4 m2, m2, xm0, 1 + punpckhwd m1, m3, m2 + punpcklwd m3, m2 + pxor m0, m0 + psubsw m0, m1 + punpckhwd m1, m0, m3 + punpcklwd m0, m3 + jmp tx2q +.pass2: + call m(iadst_8x4_internal_8bpc).main + mova m2, m1 + vpermq m1, m0, q2031 + vpermq m0, m2, q2031 + jmp m(iadst_8x4_internal_8bpc).end2 + +INV_TXFM_8X4_FN identity, dct +INV_TXFM_8X4_FN identity, adst +INV_TXFM_8X4_FN identity, flipadst +INV_TXFM_8X4_FN identity, identity + +cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova xm2, [cq+16*0] + mova xm0, [cq+16*1] + vinserti32x4 m2, [cq+16*2], 1 + vinserti32x4 m0, [cq+16*3], 1 + vpbroadcastd m3, [o(pw_2896x8)] + punpcklwd m1, m2, m0 + punpckhwd m2, m0 + pmulhrsw m1, m3 + pmulhrsw m2, m3 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + paddsw m0, m0 + paddsw m1, m1 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + jmp m(iadst_8x4_internal_8bpc).end + +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8 +%ifidn %1_%2, dct_dct +INIT_ZMM avx512icl + movsx r6d, word [cq] + mov [cq], eobd +.dconly: + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 +.dconly2: + vpbroadcastd ym2, strided + imul r6d, 181 + pmulld ym5, ym2, [o(pd_0to15)] + kxnorb k1, k1, k1 + add r6d, 128+2048 + sar r6d, 8+4 + pxor m3, m3 + vpbroadcastw m4, r6d +.dconly_loop: + kmovb k2, k1 + vpgatherdq m2{k1}, [dstq+ym5] + punpcklbw m0, m2, m3 + punpckhbw m1, m2, m3 + paddw m0, m4 + paddw m1, m4 + packuswb m0, m1 + kmovb k1, k2 + vpscatterdq [dstq+ym5]{k2}, m0 + lea dstq, [dstq+strideq*8] + sub r3d, 8 + jg .dconly_loop + RET +INIT_YMM avx512icl +%endif +%endmacro + +INV_TXFM_8X8_FN dct, dct +INV_TXFM_8X8_FN dct, identity +INV_TXFM_8X8_FN dct, adst +INV_TXFM_8X8_FN dct, flipadst + +cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m0, [cq+32*0], q3120 ; 0 1 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m2, [cq+32*2], q3120 ; 4 5 + vpermq m1, [cq+32*1], q3120 ; 2 3 + call .main + shufps m4, m0, m1, q0220 + shufps m5, m0, m1, q1331 + shufps m1, m2, m3, q0220 + shufps m3, m2, m3, q1331 + vbroadcasti32x4 m0, [o(deint_shuf)] + vpbroadcastd m2, [o(pw_16384)] + REPX {pshufb x, m0}, m4, m5, m1, m3 + REPX {pmulhrsw x, m2}, m4, m5, m1, m3 + vinserti32x4 m0, m4, xm1, 1 + vshufi32x4 m2, m4, m1, 0x03 + vinserti32x4 m1, m5, xm3, 1 + vshufi32x4 m3, m5, m3, 0x03 + jmp tx2q +.pass2: + call .main + vpbroadcastd m4, [o(pw_2048)] + vpermq m0, m0, q3120 + vpermq m1, m1, q2031 + vpermq m2, m2, q3120 + vpermq m3, m3, q2031 + jmp m(iadst_8x8_internal_8bpc).end2 +ALIGN function_align +cglobal_label .main + IDCT8_1D_PACKED + ret + +INV_TXFM_8X8_FN adst, dct +INV_TXFM_8X8_FN adst, adst +INV_TXFM_8X8_FN adst, flipadst +INV_TXFM_8X8_FN adst, identity + +cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call .main_pass1 + vpbroadcastd m5, [o(pw_16384_m16384)] + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m1, m2, m3 + punpckhwd m2, m3 + punpcklwd m3, m4, m0 + punpckhwd m4, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 + REPX {pmulhrsw x, m5}, m3, m4, m0, m1 + vshufi32x4 m2, m3, m0, 0x03 + vinserti32x4 m0, m3, xm0, 1 + vshufi32x4 m3, m4, m1, 0x03 + vinserti32x4 m1, m4, xm1, 1 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call .main_pass2 + vpbroadcastd m5, [o(pw_2048)] + vpbroadcastd xm4, [o(pw_4096)] + psubw m4, m5 ; lower half = 2048, upper half = -2048 +.end: + REPX {vpermq x, x, q3120}, m0, m1, m2, m3 +.end2: + pmulhrsw m0, m4 + pmulhrsw m1, m4 +.end3: + pmulhrsw m2, m4 + pmulhrsw m3, m4 +.end4: + pxor m4, m4 + mova [cq+32*0], m4 + mova [cq+32*1], m4 + mova [cq+32*2], m4 + mova [cq+32*3], m4 + lea r6, [strideq*3] + WRITE_8X4 0, 1, 4, 5 + lea dstq, [dstq+strideq*4] + WRITE_8X4 2, 3, 4, 5 + RET +ALIGN function_align +.main_pass1: + punpckhwd m0, m4, m3 ; 0 7 + punpckhwd m1, m5, m2 ; 2 5 + punpcklwd m2, m5 ; 4 3 + punpcklwd m3, m4 ; 6 1 + IADST8_1D_PACKED 1 + punpcklqdq m3, m4, m0 ; out6 -out7 + punpckhqdq m0, m4 ; out0 -out1 + ret +ALIGN function_align +cglobal_label .main_pass2 + IADST8_1D_PACKED 2 + ret + +INV_TXFM_8X8_FN flipadst, dct +INV_TXFM_8X8_FN flipadst, adst +INV_TXFM_8X8_FN flipadst, flipadst +INV_TXFM_8X8_FN flipadst, identity + +cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpermq m4, [cq+32*0], q1302 ; 1 0 + vpermq m3, [cq+32*3], q3120 ; 6 7 + vpermq m5, [cq+32*1], q1302 ; 3 2 + vpermq m2, [cq+32*2], q3120 ; 4 5 + call m(iadst_8x8_internal_8bpc).main_pass1 + vpbroadcastd m5, [o(pw_m16384_16384)] + punpckhwd m4, m3, m2 + punpcklwd m3, m2 + punpckhwd m2, m1, m0 + punpcklwd m1, m0 + punpckhwd m0, m4, m3 + punpcklwd m4, m3 + punpckhwd m3, m2, m1 + punpcklwd m2, m1 + REPX {pmulhrsw x, m5}, m0, m4, m3, m2 + vinserti32x4 m1, m0, xm3, 1 + vshufi32x4 m3, m0, m3, 0x03 + vinserti32x4 m0, m4, xm2, 1 + vshufi32x4 m2, m4, m2, 0x03 + jmp tx2q +.pass2: + pshufd m4, m0, q1032 + pshufd m5, m1, q1032 + call m(iadst_8x8_internal_8bpc).main_pass2 + vpbroadcastd m4, [o(pw_2048)] + vpbroadcastd xm5, [o(pw_4096)] + psubw m4, m5 ; lower half = -2048, upper half = 2048 + vpermq m5, m3, q2031 + vpermq m3, m0, q2031 + vpermq m0, m2, q2031 + vpermq m2, m1, q2031 + pmulhrsw m1, m0, m4 + pmulhrsw m0, m5, m4 + jmp m(iadst_8x8_internal_8bpc).end3 + +INV_TXFM_8X8_FN identity, dct +INV_TXFM_8X8_FN identity, adst +INV_TXFM_8X8_FN identity, flipadst +INV_TXFM_8X8_FN identity, identity + +cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova xm3, [cq+16*0] + mova xm2, [cq+16*1] + vinserti32x4 m3, [cq+16*4], 1 + vinserti32x4 m2, [cq+16*5], 1 + mova xm4, [cq+16*2] + mova xm0, [cq+16*3] + vinserti32x4 m4, [cq+16*6], 1 + vinserti32x4 m0, [cq+16*7], 1 + punpcklwd m1, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m4, m0 + punpckhwd m4, m0 + punpckldq m0, m1, m2 + punpckhdq m1, m2 + punpckldq m2, m3, m4 + punpckhdq m3, m4 + jmp tx2q +.pass2: + vpbroadcastd m4, [o(pw_4096)] + jmp m(iadst_8x8_internal_8bpc).end + +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 16 + add r6d, 128 + sar r6d, 8 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly +%endif +%endmacro + +%macro ITX_8X16_LOAD_COEFS 0 + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m0, m4, [cq+32*0] + add cq, 32*4 + pmulhrsw m7, m4, [cq+32*3] + pmulhrsw m1, m4, [cq-32*3] + pmulhrsw m6, m4, [cq+32*2] + pmulhrsw m2, m4, [cq-32*2] + pmulhrsw m5, m4, [cq+32*1] + pmulhrsw m3, m4, [cq-32*1] + pmulhrsw m4, [cq+32*0] +%endmacro + +INIT_ZMM avx512icl +INV_TXFM_8X16_FN dct, dct +INV_TXFM_8X16_FN dct, identity +INV_TXFM_8X16_FN dct, adst +INV_TXFM_8X16_FN dct, flipadst + +cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m3, [o(permB)] + vpermq m0, m3, [cq+64*0] + vpbroadcastd m4, [o(pw_2896x8)] + vpermq m1, m3, [cq+64*1] + vpermq m2, m3, [cq+64*2] + vpermq m3, m3, [cq+64*3] + REPX {pmulhrsw x, m4}, m0, m1, m2, m3 + call m(idct_16x8_internal_8bpc).main + vpbroadcastd m5, [o(pw_16384)] + punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3 + punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3 + punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3 + punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3 + REPX {pmulhrsw x, m5}, m4, m0, m2, m1 + punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3 + punpcklwd m0, m4 ; a0 b0 e0 f0 a1 b1 e1 f1 + punpckhwd m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3 + punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1 + punpckhdq m1, m0, m2 ; 1 5 9 13 + punpckldq m0, m2 ; 0 4 8 12 + punpckldq m2, m3, m4 ; 2 6 10 14 + punpckhdq m3, m4 ; 3 7 11 15 + jmp tx2q +.pass2: + vprord m5, [o(int16_perm)], 16 + vshufi32x4 m2, m2, q1320 ; 2 10 14 6 + vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11 + vshufi32x4 m1, m3, q0132 ; 9 13 7 3 + vpermb m9, m5, m0 + vpermb m7, m5, m2 + vpermb m8, m5, m4 + vpermb m0, m5, m1 + vextracti32x8 ym6, m9, 1 + vextracti32x8 ym3, m7, 1 + vextracti32x8 ym5, m8, 1 + vextracti32x8 ym1, m0, 1 + call .main2 + mova ym8, [o(gather8a)] + lea r3, [dstq+strideq*4] + pmovzxdq m9, ym8 + pshufd ym8, ym8, q1230 + vpermt2q m0, m9, m4 + vpermt2q m1, m9, m5 + vpermt2q m2, m9, m6 + vpermt2q m3, m9, m7 +.end: + vpbroadcastd m7, [o(pw_2048)] +.end2: + pmulhrsw m0, m7 + pmulhrsw m1, m7 +.end3: + pmulhrsw m2, m7 + pmulhrsw m3, m7 +.end4: + vpbroadcastd ym6, strided + kxnorb k1, k1, k1 + pxor m4, m4 + pmulld ym8, ym6 + kmovb k2, k1 + vpgatherdq m6{k1}, [dstq+ym8] + kmovb k1, k2 + vpgatherdq m7{k2}, [r3+ym8] + mova [cq+64*0], m4 + mova [cq+64*1], m4 + kmovb k2, k1 + mova [cq+64*2], m4 + mova [cq+64*3], m4 + punpcklbw m5, m6, m4 + punpckhbw m6, m4 + paddw m0, m5 + paddw m1, m6 + packuswb m0, m1 + vpscatterdq [dstq+ym8]{k1}, m0 + punpcklbw m6, m7, m4 + punpckhbw m7, m4 + paddw m2, m6 + paddw m3, m7 + packuswb m2, m3 + vpscatterdq [r3+ym8]{k2}, m2 + RET +ALIGN function_align +cglobal_label .main + WRAP_YMM IDCT16_1D_PACKED + ret + +INV_TXFM_8X16_FN adst, dct +INV_TXFM_8X16_FN adst, adst +INV_TXFM_8X16_FN adst, flipadst +INV_TXFM_8X16_FN adst, identity + +cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_16x8_internal_8bpc).main_pass1 + vbroadcasti32x4 m6, [o(int_shuf1)] + vpbroadcastd m7, [o(pw_16384_m16384)] + punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 + pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 + pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 +.pass1_end: + REPX {pmulhrsw x, m7}, m3, m5, m4, m2 + punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m5 + punpckhqdq m3, m5 + jmp tx2q +.pass2: + call .main_pass2 + vpbroadcastd m6, [o(pw_2048)] + psrlq m10, 4 + psubw m7, m8, m6 +.pass2_end: + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m5, m2 ; out8 -out11 -out9 out10 + mova ym8, [o(gather8c)] + lea r3, [dstq+strideq] + psrlq m2, m10, 4 + vpermi2q m2, m0, m3 ; 1 3 13 15 + vpermt2q m0, m10, m3 ; 0 2 12 14 + psrlq m3, m10, 8 + vpermi2q m3, m1, m5 ; 5 7 9 11 + psrlq m10, 12 + vpermt2q m1, m10, m5 ; 4 6 8 10 + pmulhrsw m0, m6 + pmulhrsw m1, m6 + jmp m(idct_8x16_internal_8bpc).end3 +ALIGN function_align +.main_pass1: + vpbroadcastd m2, [o(pw_2896x8)] + pmulhrsw m5, m2, [cq+64*0] + pmulhrsw m3, m2, [cq+64*3] + pmulhrsw m1, m2, [cq+64*1] + pmulhrsw m2, [cq+64*2] + movu m4, [o(permA+3)] + psrlq m10, m4, 4 + mova m6, m4 + vpermi2q m4, m5, m3 ; in0 in12 in2 in14 + vpermt2q m5, m10, m3 ; in15 in3 in13 in1 + vpermi2q m6, m1, m2 ; in4 in8 in6 in10 + vpermt2q m1, m10, m2 ; in11 in7 in9 in5 + jmp .main +ALIGN function_align +.main_pass2: + mova m4, [o(permC)] + psrlq m5, m4, 4 + vpermi2q m4, m0, m2 ; in0 in12 in2 in14 + psrlq m6, m5, 4 + vpermi2q m5, m1, m3 ; in15 in3 in13 in1 + psrlq m10, m6, 4 + vpermi2q m6, m0, m2 ; in4 in8 in6 in10 + vpermt2q m1, m10, m3 ; in11 in7 in9 in5 +.main: + punpcklwd m0, m4, m5 ; in0 in15 in2 in13 + punpckhwd m4, m5 ; in12 in3 in14 in1 + punpcklwd m5, m6, m1 ; in4 in11 in6 in9 + punpckhwd m6, m1 ; in8 in7 in10 in5 +cglobal_label .main2 + vpbroadcastd m9, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + kxnorb k1, k1, k1 + vpcmpub k7, m13, m9, 6 ; 0x33... + pxor m8, m8 + ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5 + ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5 + ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5 + ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5 + psubsw m2, m0, m6 ; t9a t8a t11a t10a + paddsw m0, m6 ; t1a t0a t3a t2a + psubsw m3, m5, m4 ; t13a t12a t15a t14a + paddsw m5, m4 ; t5a t4a t7a t6a + ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5 + psubw m7, m8, m7 + ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4 + vpbroadcastd m6, [o(pw_3784_m1567)] + vpbroadcastd m6{k1}, [o(pw_m3784_1567)] + psubsw m1, m0, m5 ; t5 t4 t7 t6 + paddsw m0, m5 ; t1 t0 t3 t2 + psubsw m4, m2, m3 ; t13a t12a t15a t14a + paddsw m2, m3 ; t9a t8a t11a t10a + ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a + ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15 + vbroadcasti32x4 m5, [o(deint_shuf)] + pshufb m0, m5 + pshufb m2, m5 + vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a + vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a + vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15 + vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12 + pshufd m2, m2, q1032 ; t7a t6a t15 t14 + psubsw m4, m0, m3 ; t3a t2a t11 t10 + paddsw m0, m3 ; -out15 out0 out14 -out1 + paddsw m3, m1, m2 ; out12 -out3 -out13 out2 + psubsw m1, m2 ; t7 t6 t15a t14a + punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a + punpcklqdq m4, m1 ; t3a t7 t11 t15a + ret + +INV_TXFM_8X16_FN flipadst, dct +INV_TXFM_8X16_FN flipadst, adst +INV_TXFM_8X16_FN flipadst, flipadst +INV_TXFM_8X16_FN flipadst, identity + +cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_16x8_internal_8bpc).main_pass1 + vbroadcasti32x4 m6, [o(int_shuf2)] + vpbroadcastd m7, [o(pw_m16384_16384)] + punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3 + pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3 + pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 + jmp m(iadst_8x16_internal_8bpc).pass1_end +.pass2: + call m(iadst_8x16_internal_8bpc).main_pass2 + vpbroadcastd m7, [o(pw_2048)] + psrlq m10, 36 + psubw m6, m8, m7 + jmp m(iadst_8x16_internal_8bpc).pass2_end + +INV_TXFM_8X16_FN identity, dct +INV_TXFM_8X16_FN identity, adst +INV_TXFM_8X16_FN identity, flipadst +INV_TXFM_8X16_FN identity, identity + +cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [o(int16_perm)] + vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 + vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 + vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 + vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 + vpbroadcastd m5, [o(pw_2896x8)] + punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m2, m4, m0 ; e0 f0 g0 h0 a1 f1 g1 h1 + punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3 + REPX {pmulhrsw x, m5}, m1, m2, m3, m4 + punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0 + punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1 + punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2 + punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3 + jmp tx2q +.pass2: + vpbroadcastd m7, [o(pw_1697x16)] + mova ym8, [o(gather8b)] + lea r3, [dstq+strideq*2] + pmulhrsw m4, m7, m0 + pmulhrsw m5, m7, m1 + pmulhrsw m6, m7, m2 + pmulhrsw m7, m3 + REPX {paddsw x, x}, m0, m1, m2, m3 + paddsw m0, m4 + paddsw m1, m5 + paddsw m2, m6 + paddsw m3, m7 + jmp m(idct_8x16_internal_8bpc).end + +%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2] + pmovzxbw m%3, [dstq+%5] +%ifnum %1 + paddw m%3, m%1 +%else + paddw m%3, %1 +%endif + pmovzxbw m%4, [dstq+%6] +%ifnum %2 + paddw m%4, m%2 +%else + paddw m%4, %2 +%endif + packuswb m%3, m%4 + vpermq m%3, m%3, q3120 + mova [dstq+%5], xm%3 + vextracti32x4 [dstq+%6], m%3, 1 +%endmacro + +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2 +%endif +%endmacro + +INIT_ZMM avx512icl +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity + +cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova xm0, [cq+16*0] + mova xm1, [cq+16*1] + mova xm2, [cq+16*2] + mova xm3, [cq+16*3] + mova xm4, [cq+16*4] + mova xm5, [cq+16*5] + mova xm6, [cq+16*6] + mova xm7, [cq+16*7] + call m(idct_4x16_internal_8bpc).main + vpbroadcastd m8, [o(pw_16384)] + vinserti32x4 ym1, xm3, 1 ; 3 2 7 6 + vinserti32x4 ym5, xm7, 1 ; b a f e + vinserti32x4 ym0, xm2, 1 ; 0 1 4 5 + vinserti32x4 ym4, xm6, 1 ; 8 9 c d + vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e + vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d + pmulhrsw m1, m8 + pmulhrsw m0, m8 + pshufd m1, m1, q1032 + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + jmp tx2q +.pass2: + IDCT4_1D_PACKED + mova m2, [o(permA)] + jmp m(iadst_16x4_internal_8bpc).end + +INV_TXFM_16X4_FN adst, dct +INV_TXFM_16X4_FN adst, adst +INV_TXFM_16X4_FN adst, flipadst +INV_TXFM_16X4_FN adst, identity + +cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+64*0] + mova m1, [cq+64*1] + movshdup m3, [o(permB)] + psrlq m10, m3, 4 + call m(iadst_4x16_internal_8bpc).main2 + vpbroadcastd m6, [o(pw_16384_m16384)] + psrlq m0, m10, 4 + psrlq m10, 8 +.pass1_end: + punpcklwd ym5, ym4, ym2 + punpckhwd ym4, ym2 + vinserti32x8 m5, ym4, 1 + mova m1, m9 + vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} + mova m4, m9 + vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16} + psrad m1, 12 + psrad m4, 12 + packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5 + vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d + vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f + punpckhwd m2, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m0, m2 + punpcklwd m0, m2 + pmulhrsw m0, m6 + pmulhrsw m1, m6 + jmp tx2q +.pass2: + call .main + movu m2, [o(permA+1)] +.end: + vpbroadcastd m3, [o(pw_2048)] + pmulhrsw m0, m3 + pmulhrsw m1, m3 +.end2: + psrlq m3, m2, 4 + vpermi2q m2, m0, m1 + vpermi2q m3, m0, m1 +.end3: + lea r3, [dstq+strideq*2] + mova xm1, [dstq+strideq*0] + vinserti32x4 ym1, [dstq+strideq*1], 1 + vinserti32x4 m1, [r3 +strideq*0], 2 + vinserti32x4 m1, [r3 +strideq*1], 3 + pxor m4, m4 + mova [cq+64*0], m4 + mova [cq+64*1], m4 + punpcklbw m0, m1, m4 + punpckhbw m1, m4 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [r3 +strideq*0], m0, 2 + vextracti32x4 [r3 +strideq*1], m0, 3 + RET +ALIGN function_align +.main: + IADST4_1D_PACKED + ret + +INV_TXFM_16X4_FN flipadst, dct +INV_TXFM_16X4_FN flipadst, adst +INV_TXFM_16X4_FN flipadst, flipadst +INV_TXFM_16X4_FN flipadst, identity + +cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m0, [cq+64*0] + mova m1, [cq+64*1] + movshdup m3, [o(permB)] + psrlq m10, m3, 4 + call m(iadst_4x16_internal_8bpc).main2 + vpbroadcastd m6, [o(pw_m16384_16384)] + psrlq m0, m10, 12 + psrlq m10, 16 + jmp m(iadst_16x4_internal_8bpc).pass1_end +.pass2: + call m(iadst_16x4_internal_8bpc).main + movu m2, [o(permA+2)] + jmp m(iadst_16x4_internal_8bpc).end + +INV_TXFM_16X4_FN identity, dct +INV_TXFM_16X4_FN identity, adst +INV_TXFM_16X4_FN identity, flipadst +INV_TXFM_16X4_FN identity, identity + +cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m1, [cq+64*0] + mova m2, [cq+64*1] + vpbroadcastd m3, [o(pw_1697x16)] + vpbroadcastd m4, [o(pw_16384)] + mova m5, [o(idtx_16x4p)] + shufps m0, m1, m2, q2020 + shufps m1, m2, q3131 + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + pmulhrsw m2, m4 + pmulhrsw m3, m4 + paddsw m0, m2 + paddsw m1, m3 + vpermb m0, m5, m0 + vpermb m1, m5, m1 + jmp tx2q +.pass2: + vpbroadcastd m3, [o(pw_1697x8)] + pmulhrsw m2, m3, m0 + pmulhrsw m3, m1 + paddsw m0, m2 + paddsw m1, m3 + movu m2, [o(permA+1)] + jmp m(iadst_16x4_internal_8bpc).end + +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 8 +.dconly: + imul r6d, 181 + add r6d, 128 + sar r6d, 8 +.dconly2: + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 +.dconly3: + imul r6d, 181 + lea r2, [strideq*3] + add r6d, 128+2048 + sar r6d, 8+4 + pxor m2, m2 + vpbroadcastw m3, r6d +.dconly_loop: + mova xm1, [dstq+strideq*0] + vinserti32x4 ym1, [dstq+strideq*1], 1 + vinserti32x4 m1, [dstq+strideq*2], 2 + vinserti32x4 m1, [dstq+r2 ], 3 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + paddw m0, m3 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [dstq+strideq*2], m0, 2 + vextracti32x4 [dstq+r2 ], m0, 3 + lea dstq, [dstq+strideq*4] + sub r3d, 4 + jg .dconly_loop + RET +%endif +%endmacro + +%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd + vpbroadcastd m8, [o(pw_2896x8)] + vpermq m0, [cq+32*0], q3120 + add cq, 32*4 + vpermq m7, [cq+32*3], q%1 + vpermq m1, [cq-32*3], q%1 + vpermq m6, [cq+32*2], q3120 + vpermq m2, [cq-32*2], q3120 + vpermq m5, [cq+32*1], q%1 + vpermq m3, [cq-32*1], q%1 + vpermq m4, [cq+32*0], q3120 + REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 +%endmacro + +INV_TXFM_16X8_FN dct, dct +INV_TXFM_16X8_FN dct, identity +INV_TXFM_16X8_FN dct, adst +INV_TXFM_16X8_FN dct, flipadst + +cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd m1, [o(pw_2896x8)] + vpermq m0, [cq+64*0], q3120 + vpermq m2, [cq+64*1], q3120 + vpermq m4, [cq+64*2], q3120 + vpermq m6, [cq+64*3], q3120 + REPX {pmulhrsw x, m1}, m0, m2, m4, m6 + vextracti32x8 ym1, m0, 1 + vextracti32x8 ym3, m2, 1 + vextracti32x8 ym5, m4, 1 + vextracti32x8 ym7, m6, 1 + call m(idct_8x16_internal_8bpc).main + vbroadcasti32x4 m8, [o(int_shuf1)] + vbroadcasti32x4 m9, [o(int_shuf2)] + vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3 + vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3 + vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3 + vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3 + vpbroadcastd m2, [o(pw_16384)] + pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3 + pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3 + pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3 + pshufb m7, m5, m9 ; m0 n0 m1 n1 m2 n2 m3 n3 + REPX {pmulhrsw x, m2}, m0, m1, m6, m7 + punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3 + jmp tx2q +.pass2: + vshufi32x4 m0, m2, m4, q2020 ; 0 1 + vshufi32x4 m2, m4, q3131 ; 4 5 + vshufi32x4 m1, m3, m5, q2020 ; 2 3 + vshufi32x4 m3, m5, q3131 ; 6 7 + call .main + movshdup m4, [o(permC)] + psrlq m6, m4, 4 + vpermq m5, m4, q1032 + vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3 + vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1 + psrlq m6, m5, 4 + vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3 + vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1 + vpbroadcastd m6, [o(pw_2048)] +.end: + REPX {pmulhrsw x, m6}, m0, m4, m1, m5 +.end2: + lea r3, [dstq+strideq*4] + lea r4, [strideq*3] + mova xm3, [dstq+strideq*0] + mova xm6, [dstq+strideq*2] + vinserti32x4 ym3, [dstq+strideq*1], 1 + vinserti32x4 ym6, [dstq+r4 ], 1 + vinserti32x4 m3, [r3 +strideq*0], 2 + vinserti32x4 m6, [r3 +strideq*2], 2 + vinserti32x4 m3, [r3 +strideq*1], 3 + vinserti32x4 m6, [r3 +r4 ], 3 + pxor m7, m7 + mova [cq+64*0], m7 + mova [cq+64*1], m7 + mova [cq+64*2], m7 + mova [cq+64*3], m7 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m4, m3 + packuswb m0, m4 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + vextracti32x4 [r3 +strideq*0], m0, 2 + vextracti32x4 [r3 +strideq*1], m0, 3 + punpcklbw m3, m6, m7 + punpckhbw m6, m7 + paddw m1, m3 + paddw m5, m6 + packuswb m1, m5 + mova [dstq+strideq*2], xm1 + vextracti32x4 [dstq+r4 ], ym1, 1 + vextracti32x4 [r3 +strideq*2], m1, 2 + vextracti32x4 [r3 +r4 ], m1, 3 + RET +ALIGN function_align +cglobal_label .main + IDCT8_1D_PACKED + ret + +INV_TXFM_16X8_FN adst, dct +INV_TXFM_16X8_FN adst, adst +INV_TXFM_16X8_FN adst, flipadst +INV_TXFM_16X8_FN adst, identity + +cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_8x16_internal_8bpc).main_pass1 + vpbroadcastd m7, [o(pw_16384_m16384)] + psrlq m10, 4 +.pass1_end: + punpcklwd m5, m4, m2 + punpckhwd m4, m2 + mova m1, m9 + vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16} + mova m6, m9 + vpdpwssd m6, m5, [o(pw_2896_2896)] {1to16} + mova m2, m9 + vpdpwssd m2, m4, [o(pw_m2896_2896)] {1to16} + vpdpwssd m9, m4, [o(pw_2896_2896)] {1to16} + psrad m1, 12 + psrad m6, 12 + packssdw m1, m6 ; out8 -out7 -out9 out6 + psrad m2, 12 + psrad m9, 12 + packssdw m2, m9 ; -out11 out4 out10 -out5 + psrlq m4, m10, 4 + vpermi2q m4, m0, m2 + vpermt2q m0, m10, m2 + psrlq m5, m10, 8 + vpermi2q m5, m1, m3 + psrlq m10, 12 + vpermt2q m1, m10, m3 + punpcklwd m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3 + punpckhwd m4, m5 ; b0 d0 b1 d1 b2 d2 b3 d3 + punpcklwd m5, m1, m0 ; i0 k0 i1 k1 2i k2 i3 k3 + punpckhwd m1, m0 ; j0 l0 j1 l1 j2 l2 j3 l3 + punpcklwd m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhwd m3, m4 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpcklwd m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhwd m5, m1 ; i2 j2 k2 l2 i3 j3 k3 l3 + REPX {pmulhrsw x, m7}, m2, m3, m4, m5 + jmp tx2q +.pass2: + vshufi32x4 m0, m2, m4, q2020 + vshufi32x4 m2, m4, q3131 ; 4 5 + vshufi32x4 m1, m3, m5, q2020 + vshufi32x4 m3, m5, q3131 ; 6 7 + pshufd m4, m0, q1032 ; 1 0 + pshufd m5, m1, q1032 ; 3 2 + call .main_pass2 + movshdup m4, [o(permC)] + pmulhrsw m0, m6 + pmulhrsw m1, m6 + psrlq m6, m4, 4 + mova m5, m4 + vpermi2q m4, m0, m2 + vpermt2q m0, m6, m2 + vpermi2q m5, m1, m3 + vpermt2q m1, m6, m3 + jmp m(idct_16x8_internal_8bpc).end2 +ALIGN function_align +.main_pass1: + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m3, m4, [cq+64*0] + pmulhrsw m1, m4, [cq+64*3] + pmulhrsw m2, m4, [cq+64*1] + pmulhrsw m4, [cq+64*2] + mova m5, [o(int16_perm)] + kxnorb k1, k1, k1 + vpblendmd m0{k1}, m1, m3 ; 0 7 + vmovdqa32 m3{k1}, m1 ; 6 1 + vpblendmd m1{k1}, m4, m2 ; 2 5 + vmovdqa32 m2{k1}, m4 ; 4 3 + REPX {vpermb x, m5, x}, m0, m1, m2, m3 + IADST8_1D_PACKED 1 + ret +ALIGN function_align +cglobal_label .main_pass2 + IADST8_1D_PACKED 2 + pxor m5, m5 + psubd m5, m6 + packssdw m6, m5 + pmulhrsw m2, m6 + pmulhrsw m3, m6 + ret + +INV_TXFM_16X8_FN flipadst, dct +INV_TXFM_16X8_FN flipadst, adst +INV_TXFM_16X8_FN flipadst, flipadst +INV_TXFM_16X8_FN flipadst, identity + +cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_8x16_internal_8bpc).main_pass1 + vpbroadcastd m7, [o(pw_m16384_16384)] + psrlq m10, 20 + jmp m(iadst_16x8_internal_8bpc).pass1_end +.pass2: + vshufi32x4 m0, m2, m4, q2020 + vshufi32x4 m2, m4, q3131 ; 4 5 + vshufi32x4 m1, m3, m5, q2020 + vshufi32x4 m3, m5, q3131 ; 6 7 + pshufd m4, m0, q1032 ; 1 0 + pshufd m5, m1, q1032 ; 3 2 + call m(iadst_16x8_internal_8bpc).main_pass2 + movshdup m4, [o(permC)] + pmulhrsw m5, m6, m0 + pmulhrsw m0, m6, m1 + psrlq m1, m4, 12 + psrlq m4, 8 + mova m7, m4 + vpermi2q m4, m0, m3 + vpermt2q m0, m1, m3 + vpermi2q m1, m5, m2 + vpermt2q m5, m7, m2 + jmp m(idct_16x8_internal_8bpc).end2 + +INV_TXFM_16X8_FN identity, dct +INV_TXFM_16X8_FN identity, adst +INV_TXFM_16X8_FN identity, flipadst +INV_TXFM_16X8_FN identity, identity + +cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + vpbroadcastd m0, [o(pw_2896x8)] + pmulhrsw m3, m0, [cq+64*0] + pmulhrsw m4, m0, [cq+64*1] + pmulhrsw m5, m0, [cq+64*2] + pmulhrsw m0, [cq+64*3] + vpbroadcastd m7, [o(pw_1697x16)] + vpbroadcastd m8, [o(pw_16384)] + shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5 + shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7 + shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5 + shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7 + mova m9, [o(int8_permA)] + pmulhrsw m0, m7, m2 + pmulhrsw m1, m7, m3 + pmulhrsw m6, m7, m4 + pmulhrsw m7, m5 + REPX {pmulhrsw x, m8}, m0, m1, m6, m7 + paddsw m2, m0 + paddsw m3, m1 + paddsw m4, m6 + paddsw m5, m7 + REPX {vpermb x, m9, x}, m2, m3, m4, m5 + jmp tx2q +.pass2: + mova m7, [o(permB)] + vpbroadcastd m6, [o(pw_4096)] + vpermq m0, m7, m2 + vpermq m4, m7, m4 + vpermq m1, m7, m3 + vpermq m5, m7, m5 + jmp m(idct_16x8_internal_8bpc).end + +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16 +%ifidn %1_%2, dct_dct + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 16 + add r6d, 128+512 + sar r6d, 8+2 + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 +%endif +%endmacro + +INV_TXFM_16X16_FN dct, dct +INV_TXFM_16X16_FN dct, identity +INV_TXFM_16X16_FN dct, adst +INV_TXFM_16X16_FN dct, flipadst + +cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m7, [o(permB)] + vpermq m0, m7, [cq+64*0] + vpermq m1, m7, [cq+64*1] + vpermq m2, m7, [cq+64*2] + vpermq m3, m7, [cq+64*3] + vpermq m4, m7, [cq+64*4] + vpermq m5, m7, [cq+64*5] + vpermq m6, m7, [cq+64*6] + vpermq m7, m7, [cq+64*7] + call .main + vbroadcasti32x4 m12, [o(int_shuf1)] + vbroadcasti32x4 m11, [o(int_shuf2)] + vpbroadcastd m13, [o(pw_8192)] + pshufb m0, m12 + pshufb m8, m1, m11 + pshufb m2, m12 + pshufb m9, m3, m11 + pshufb m4, m12 + pshufb m10, m5, m11 + pshufb m6, m12 + pshufb m11, m7, m11 + REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11 + punpckhdq m1, m0, m8 + punpckldq m0, m8 + punpckhdq m3, m2, m9 + punpckldq m2, m9 + punpckhdq m5, m4, m10 + punpckldq m4, m10 + punpckhdq m7, m6, m11 + punpckldq m6, m11 + jmp tx2q +.pass2: + vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc + vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 + vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec + vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 + vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me + vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 + vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee + vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 + vshufi32x4 m3, m1, m5, q3131 ; 6 7 + vshufi32x4 m1, m5, q2020 ; 2 3 + vshufi32x4 m5, m7, m9, q2020 ; 10 11 + vshufi32x4 m7, m9, q3131 ; 14 15 + call .main + mova m8, [o(permD)] + psrlq m12, m8, 4 + psrlq m9, m8, 8 + psrlq m13, m8, 12 + mova m10, m8 + vpermi2q m8, m0, m2 ; 0 1 4 5 + vpermt2q m0, m12, m2 + mova m11, m9 + vpermi2q m9, m1, m3 ; 2 3 6 7 + vpermt2q m1, m13, m3 + vpermi2q m10, m4, m6 ; 8 9 12 13 + vpermt2q m4, m12, m6 + vpermi2q m11, m5, m7 ; 10 11 14 15 + vpermt2q m5, m13, m7 +.end: + vpbroadcastd m12, [o(pw_2048)] +.end2: + REPX {pmulhrsw x, m12}, m0, m1, m4, m5 +.end3: + REPX {pmulhrsw x, m12}, m8, m9, m10, m11 + lea r3, [strideq*3] + lea r4, [dstq+strideq*4] + lea r5, [dstq+strideq*8] + lea r6, [r4 +strideq*8] + mova xm3, [dstq+strideq*0] + mova xm6, [dstq+strideq*2] + vinserti32x4 ym3, [dstq+strideq*1], 1 + vinserti32x4 ym6, [dstq+r3 ], 1 + vinserti32x4 m3, [r4+strideq*0], 2 + vinserti32x4 m6, [r4+strideq*2], 2 + vinserti32x4 m3, [r4+strideq*1], 3 + vinserti32x4 m6, [r4+r3 ], 3 + mova xm12, [r5+strideq*0] + mova xm13, [r5+strideq*2] + vinserti32x4 ym12, [r5+strideq*1], 1 + vinserti32x4 ym13, [r5+r3 ], 1 + vinserti32x4 m12, [r6+strideq*0], 2 + vinserti32x4 m13, [r6+strideq*2], 2 + vinserti32x4 m12, [r6+strideq*1], 3 + vinserti32x4 m13, [r6+r3 ], 3 + pxor m7, m7 + REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m2, m3, m7 + punpckhbw m3, m7 + paddw m0, m2 + paddw m8, m3 + packuswb m0, m8 + punpcklbw m2, m6, m7 + punpckhbw m6, m7 + paddw m1, m2 + paddw m9, m6 + packuswb m1, m9 + punpcklbw m2, m12, m7 + punpckhbw m12, m7 + paddw m2, m4 + paddw m10, m12 + packuswb m2, m10 + punpcklbw m3, m13, m7 + punpckhbw m13, m7 + paddw m3, m5 + paddw m11, m13 + packuswb m3, m11 + mova [dstq+strideq*0], xm0 + vextracti32x4 [dstq+strideq*1], ym0, 1 + mova [dstq+strideq*2], xm1 + vextracti32x4 [dstq+r3 ], ym1, 1 + vextracti32x4 [r4+strideq*0], m0, 2 + vextracti32x4 [r4+strideq*1], m0, 3 + vextracti32x4 [r4+strideq*2], m1, 2 + vextracti32x4 [r4+r3 ], m1, 3 + mova [r5+strideq*0], xm2 + vextracti32x4 [r5+strideq*1], ym2, 1 + mova [r5+strideq*2], xm3 + vextracti32x4 [r5+r3 ], ym3, 1 + vextracti32x4 [r6+strideq*0], m2, 2 + vextracti32x4 [r6+strideq*1], m2, 3 + vextracti32x4 [r6+strideq*2], m3, 2 + vextracti32x4 [r6+r3 ], m3, 3 + RET +ALIGN function_align +.main_fast2: ; bottom three-quarters are zero + vpbroadcastd m10, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 +.main_fast4: + vpbroadcastd m2, [o(pw_401_4076x8)] + vpbroadcastd m4, [o(pw_m1189_3920x8)] + vpbroadcastd m3, [o(pw_799_4017x8)] + pmulhrsw m2, m8 ; t8a t15a + pmulhrsw m4, m1 ; t11a t12a + pmulhrsw m7, m3 ; t4a t7a + pxor m6, m6 + psubsw m0, m2, m4 ; t11a t12a + paddsw m8, m2, m4 ; t8a t15a + mova m1, m7 + jmp .main5 +ALIGN function_align +.main_fast: ; bottom half is zero + vpbroadcastd m10, [o(pd_2048)] +.main_fast3: + vpbroadcastq m13, [o(int_mshift)] + vpcmpub k7, m13, m10, 6 +.main_fast5: + vpbroadcastd m2, [o(pw_401_4076x8)] + vpbroadcastd m4, [o(pw_m2598_3166x8)] + vpbroadcastd m11, [o(pw_1931_3612x8)] + vpbroadcastd m12, [o(pw_m1189_3920x8)] + pmulhrsw m8, m2 ; t8a t15a + vpbroadcastd m2, [o(pw_799_4017x8)] + pmulhrsw m0, m4 ; t9a t14a + vpbroadcastd m4, [o(pw_m2276_3406x8)] + pmulhrsw m5, m11 ; t10a t13a + pmulhrsw m1, m12 ; t11a t12a + pmulhrsw m7, m2 ; t4a t7a + pmulhrsw m3, m4 ; t5a t6a + jmp .main4 +ALIGN function_align +cglobal_label .main + IDCT16_1D_PACKED + ret + +INV_TXFM_16X16_FN adst, dct +INV_TXFM_16X16_FN adst, adst +INV_TXFM_16X16_FN adst, flipadst + +cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call .main_pass1 + vpbroadcastd m10, [o(pw_8192_m8192)] + punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3 + punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3 + punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3 + punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3 + punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3 + punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3 + punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3 + punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3 + punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 +.pass1_end: + REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 + jmp tx2q +.pass2: + call .main_pass2 + mova m10, [o(permD)] + psrlq m8, m10, 8 + psrlq m12, m10, 12 + psrlq m13, m10, 4 + mova m9, m8 + vpermi2q m8, m0, m2 ; 0 1 4 5 + vpermt2q m0, m12, m2 + vpermi2q m9, m1, m3 ; 2 3 6 7 + vpermt2q m1, m12, m3 + vpbroadcastd m12, [o(pw_2048)] + mov r3d, 0xff00ff00 + mova m11, m10 + vpermi2q m10, m4, m6 ; 8 9 12 13 + vpermt2q m4, m13, m6 + kmovd k1, r3d + vpermi2q m11, m5, m7 ; 10 11 14 15 + vpermt2q m5, m13, m7 + pxor m7, m7 + vpsubw m12{k1}, m7, m12 + jmp m(idct_16x16_internal_8bpc).end2 +ALIGN function_align +.main_pass1: + mova m4, [o(permB)] + psrlq m3, m4, 4 + vpermq m0, m4, [cq+64*0] + vpermq m7, m3, [cq+64*7] + vpermq m6, m4, [cq+64*6] + vpermq m1, m3, [cq+64*1] + vpermq m2, m4, [cq+64*2] + vpermq m5, m3, [cq+64*5] + vpermq m4, m4, [cq+64*4] + vpermq m3, m3, [cq+64*3] + call .main + vpbroadcastd m13, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + mova m2, m10 + vpdpwssd m2, m5, m13 ; -out5 + mova m8, m10 + vpdpwssd m8, m11, m13 ; out4 + mova m9, m10 + vpdpwssd m9, m5, m12 ; out10 + mova m5, m10 + vpdpwssd m5, m11, m12 ; -out11 + mova m11, m10 + vpdpwssd m11, m3, m13 ; -out7 + mova m14, m10 + vpdpwssd m14, m4, m13 ; out6 + mova m13, m10 + vpdpwssd m13, m3, m12 ; out8 + vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9 + REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10 + packssdw m2, m8 ; -out5 out4 + packssdw m5, m9, m5 ; out10 -out11 + packssdw m3, m11, m14 ; -out7 out6 + packssdw m4, m13, m10 ; out8 -out9 + ret +ALIGN function_align +.main_pass2: + vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc + vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 + vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec + vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4 + vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me + vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6 + vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee + vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6 + vshufi32x4 m2, m0, m4, q3131 ; 4 5 + vshufi32x4 m0, m4, q2020 ; 0 1 + vshufi32x4 m4, m6, m8, q2020 ; 8 9 + vshufi32x4 m6, m8, q3131 ; 12 13 + vshufi32x4 m3, m1, m5, q3131 ; 6 7 + vshufi32x4 m1, m5, q2020 ; 2 3 + vshufi32x4 m5, m7, m9, q2020 ; 10 11 + vshufi32x4 m7, m9, q3131 ; 14 15 +cglobal_label .main_pass2b + REPX {pshufd x, x, q1032}, m1, m3, m5, m7 + call .main + vpbroadcastd m8, [o(pw_2896x8)] + pshufb m2, m11, m12 + pshufb m5, m12 + pshufb m3, m12 + pshufb m4, m12 + punpcklqdq m9, m5, m2 ; t15a t7 + punpckhqdq m5, m2 ; t14a t6 + shufps m2, m3, m4, q1032 ; t2a t10 + shufps m3, m4, q3210 ; t3a t11 + psubsw m4, m2, m3 ; out8 -out9 + paddsw m3, m2 ; -out7 out6 + paddsw m2, m5, m9 ; -out5 out4 + psubsw m5, m9 ; out10 -out11 + REPX {pmulhrsw x, m8}, m2, m3, m4, m5 + ret +ALIGN function_align +.main: + vpbroadcastd m10, [o(pd_2048)] + vpbroadcastq m13, [o(int_mshift)] + punpckhwd m8, m7, m0 ; in14 in1 + punpcklwd m0, m7 ; in0 in15 + punpcklwd m7, m6, m1 ; in12 in3 + punpckhwd m1, m6 ; in2 in13 + punpckhwd m6, m5, m2 ; in10 in5 + punpcklwd m2, m5 ; in4 in11 + punpcklwd m5, m4, m3 ; in8 in7 + punpckhwd m3, m4 ; in6 in9 + vpcmpub k7, m13, m10, 6 ; 0x33... + ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1 + ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3 + ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5 + ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7 + ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9 + ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11 + ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13 + ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15 + psubsw m4, m0, m5 ; t9a t8a + paddsw m0, m5 ; t1a t0a + psubsw m5, m1, m6 ; t11a t10a + paddsw m1, m6 ; t3a t2a + psubsw m6, m2, m7 ; t13a t12a + paddsw m2, m7 ; t5a t4a + psubsw m7, m3, m8 ; t15a t14a + paddsw m3, m8 ; t7a t6a + ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9 + ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13 + ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11 + ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15 + psubsw m8, m1, m3 ; t7 t6 + paddsw m1, m3 ; t3 t2 + psubsw m3, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m2, m5, m7 ; t14a t15a + paddsw m7, m5 ; t10a t11a + psubsw m5, m4, m6 ; t12a t13a + paddsw m4, m6 ; t8a t9a + ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a + ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a + ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14 + ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12 + vbroadcasti32x4 m12, [o(deint_shuf)] + paddsw m6, m4, m7 ; -out1 out14 + psubsw m4, m7 ; t10 t11 + psubsw m11, m3, m8 ; t7 t6 + paddsw m8, m3 ; out12 -out3 + psubsw m3, m0, m1 ; t3a t2a + paddsw m0, m1 ; -out15 out0 + paddsw m1, m2, m5 ; -out13 out2 + psubsw m5, m2 ; t15a t14a + pshufb m0, m12 + pshufb m6, m12 + pshufb m8, m12 + pshufb m1, m12 + shufps m7, m6, m0, q1032 ; out14 -out15 + shufps m0, m6, m0, q3210 ; -out1 out0 + punpcklqdq m6, m8, m1 ; out12 -out13 + punpckhqdq m1, m8, m1 ; -out3 out2 + ret + +INV_TXFM_16X16_FN flipadst, dct +INV_TXFM_16X16_FN flipadst, adst +INV_TXFM_16X16_FN flipadst, flipadst + +cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + call m(iadst_16x16_internal_8bpc).main_pass1 + vpbroadcastd m10, [o(pw_m8192_8192)] + punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3 + punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3 + punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3 + punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3 + punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1 + punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3 + punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3 + punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3 + punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3 + punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3 + jmp m(iadst_16x16_internal_8bpc).pass1_end +.pass2: + call m(iadst_16x16_internal_8bpc).main_pass2 + mova m10, [o(permD)] + psrlq m8, m10, 8 + psrlq m12, m10, 12 + psrlq m13, m10, 4 + mova m9, m8 + vpermi2q m8, m7, m5 ; 0 1 4 5 + vpermt2q m7, m12, m5 + vpermi2q m9, m6, m4 ; 2 3 6 7 + vpermt2q m6, m12, m4 + vpbroadcastd m12, [o(pw_2048)] + mov r3d, 0x00ff00ff + mova m11, m10 + vpermi2q m10, m3, m1 ; 8 9 12 13 + vpermt2q m3, m13, m1 + kmovd k1, r3d + vpermi2q m11, m2, m0 ; 10 11 14 15 + vpermt2q m2, m13, m0 + pxor m0, m0 + vpsubw m12{k1}, m0, m12 + pmulhrsw m0, m7, m12 + pmulhrsw m1, m6, m12 + pmulhrsw m4, m3, m12 + pmulhrsw m5, m2, m12 + jmp m(idct_16x16_internal_8bpc).end3 + +INV_TXFM_16X16_FN identity, dct +INV_TXFM_16X16_FN identity, identity + +cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2 + mova m8, [o(int16_perm)] + vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3 + vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3 + vpbroadcastd m0, [o(pw_1697x16)] + vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3 + vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3 + vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3 + vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3 + vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3 + vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3 + pmulhrsw m9, m0, m1 + pmulhrsw m10, m0, m2 + pmulhrsw m11, m0, m3 + pmulhrsw m12, m0, m4 + pmulhrsw m13, m0, m5 + pmulhrsw m14, m0, m6 + pmulhrsw m15, m0, m7 + pmulhrsw m0, m8 + REPX {psraw x, 1}, m9, m10, m11, m12 + pavgw m1, m9 + pavgw m2, m10 + pavgw m3, m11 + pavgw m4, m12 + REPX {psraw x, 1}, m13, m14, m15, m0 + pavgw m5, m13 + pavgw m6, m14 + pavgw m7, m15 + pavgw m8, m0 + punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1 + punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3 + jmp tx2q +ALIGN function_align +.pass2: + vpbroadcastd m11, [o(pw_1697x16)] + pmulhrsw m12, m11, m0 + pmulhrsw m13, m11, m1 + pmulhrsw m14, m11, m2 + pmulhrsw m15, m11, m3 + pmulhrsw m8, m11, m4 + pmulhrsw m9, m11, m5 + pmulhrsw m10, m11, m6 + pmulhrsw m11, m7 + REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7 + paddsw m0, m12 + paddsw m1, m13 + paddsw m2, m14 + paddsw m3, m15 + paddsw m8, m4 + movu m4, [o(permD+2)] + paddsw m9, m5 + paddsw m6, m10 + paddsw m7, m11 + psrlq m12, m4, 4 + mova m5, m4 + mova m10, m4 + mova m11, m4 + vpermi2q m4, m0, m2 ; 8 9 12 13 + vpermt2q m0, m12, m2 ; 0 1 4 5 + vpermi2q m5, m1, m3 ; 10 11 14 15 + vpermt2q m1, m12, m3 ; 2 3 6 7 + vpermi2q m10, m8, m6 + vpermt2q m8, m12, m6 + vpermi2q m11, m9, m7 + vpermt2q m9, m12, m7 + jmp m(idct_16x16_internal_8bpc).end + +%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4] + vpbroadcastd m%3, [o(pw_%4_%5x8)] + punpcklwd m%1, m%2, m%2 + pmulhrsw m%1, m%3 + vpbroadcastd m%3, [o(pw_%6_%7x8)] + punpckhwd m%2, m%2 + pmulhrsw m%2, m%3 +%endmacro + +cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + cmp eobd, 107 + jb .fast + mova m5, [cq+64*5] + mova m3, [cq+64*3] + mova m1, [cq+64*1] + mova m7, [cq+64*7] + mova m2, [cq+64*2] + mova m6, [cq+64*6] + mova m0, [cq+64*0] + mova m4, [cq+64*4] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + mova m8, [o(idct_8x32p)] + vpbroadcastd m9, [o(pw_8192)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7 + punpckldq m8, m0, m1 ; ab + punpckhdq m0, m1 + punpckldq m1, m2, m3 ; cd + punpckhdq m2, m3 + punpckldq m3, m4, m5 ; ef + punpckhdq m4, m5 + punpckldq m5, m6, m7 ; gh + punpckhdq m6, m7 + REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6 + punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9 + punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21 + punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13 + punpckhqdq m15, m0, m2 ; 18 4 24 8 7 25 15 17 + punpcklqdq m20, m3, m5 + punpckhqdq m16, m3, m5 + punpcklqdq m19, m4, m6 + punpckhqdq m17, m4, m6 + vinserti32x4 ym8, ym18, xm20, 1 + vshufi32x4 ym1, ym18, ym20, 0x03 + vinserti32x4 ym9, ym14, xm16, 1 + vshufi32x4 ym3, ym14, ym16, 0x03 + vinserti32x4 ym0, ym21, xm19, 1 + vshufi32x4 ym5, ym21, ym19, 0x03 + vinserti32x4 ym7, ym15, xm17, 1 + vshufi32x4 ym6, ym15, ym17, 0x03 + call m(idct_8x16_internal_8bpc).main2 + psrlq m12, [o(permB)], 60 + vpermt2q m14, m12, m16 + vpermt2q m21, m12, m19 + vpermt2q m15, m12, m17 + vpermi2q m12, m18, m20 + vextracti32x8 ym16, m14, 1 + vextracti32x8 ym19, m21, 1 + vextracti32x8 ym17, m15, 1 + vextracti32x8 ym20, m12, 1 + call .main2 + jmp .end +.fast: ; right half is zero + mova m0, [o(int16_perm)] + mova ym2, [cq+64*4] + vinserti32x8 m2, [cq+64*0], 1 + mova ym3, [cq+64*6] + vinserti32x8 m3, [cq+64*2], 1 + mova ym4, [cq+64*3] + vinserti32x8 m4, [cq+64*5], 1 + mova ym5, [cq+64*7] + vinserti32x8 m5, [cq+64*1], 1 + REPX {vpermb x, m0, x}, m2, m3, m4, m5 + call m(idct_16x8_internal_8bpc).main2 + vbroadcasti32x4 m4, [o(int_shuf3)] + vbroadcasti32x4 m5, [o(int_shuf4)] + pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3 + pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3 + pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3 + pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3 + vpbroadcastd m4, [o(pw_8192)] + psrlq m5, [o(permB)], 60 + punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2 + punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3 + punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2 + punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3 + REPX {pmulhrsw x, m4}, m6, m17, m2, m16 + vinserti32x4 ym0, ym2, xm6, 1 ; 0 2 + vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6 + vinserti32x4 ym14, ym16, xm17, 1 ; 1 3 + vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7 + pxor ym4, ym4 + vpermt2q m2, m5, m6 ; 8 10 + vpermt2q m16, m5, m17 ; 9 11 + mova ym5, ym4 + mova ym6, ym4 + mova ym7, ym4 + vextracti32x8 ym3, m2, 1 ; 12 14 + vextracti32x8 ym17, m16, 1 ; 13 15 + call m(idct_8x16_internal_8bpc).main + call .main_fast +.end: + vpbroadcastd ym12, strided + vpbroadcastd m13, [o(pw_2048)] + pmulld ym7, ym12, [o(gather8d)] + REPX {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11 + lea r3, [dstq+strideq*4] + shl strideq, 4 + lea r4, [dstq+strideq] + add r1, r3 + kxnorb k1, k1, k1 + pxor m6, m6 + kmovb k2, k1 + vpgatherdq m12{k1}, [r0+ym7] + kmovb k1, k2 + vpgatherdq m13{k2}, [r3+ym7] + kmovb k2, k1 + vpgatherdq m14{k1}, [r4+ym7] + kmovb k1, k2 + vpgatherdq m15{k2}, [r1+ym7] + REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m4, m12, m6 + punpckhbw m12, m6 + paddw m0, m4 + paddw m1, m12 + packuswb m0, m1 + kmovb k2, k1 + vpscatterdq [r0+ym7]{k1}, m0 + punpcklbw m4, m13, m6 + punpckhbw m13, m6 + paddw m2, m4 + paddw m3, m13 + packuswb m2, m3 + kmovb k1, k2 + vpscatterdq [r3+ym7]{k2}, m2 + punpcklbw m4, m14, m6 + punpckhbw m14, m6 + paddw m8, m4 + paddw m9, m14 + packuswb m8, m9 + kmovb k2, k1 + vpscatterdq [r4+ym7]{k1}, m8 + punpcklbw m4, m15, m6 + punpckhbw m15, m6 + paddw m10, m4 + paddw m11, m15 + packuswb m10, m11 + vpscatterdq [r1+ym7]{k2}, m10 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 32 + imul r6d, 181 + add r6d, 128+512 + sar r6d, 8+2 + jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2 +INIT_YMM avx512icl +ALIGN function_align +.main_fast: ; bottom half is zero + ITX_UNPACK_MULHRSW 12, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a + ITX_UNPACK_MULHRSW 21, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a + ITX_UNPACK_MULHRSW 20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a + ITX_UNPACK_MULHRSW 19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a + jmp .main3 +ALIGN function_align +.main: + punpcklwd m12, m21, m14 ; in31 in1 + punpckhwd m14, m21 ; in3 in29 + punpcklwd m21, m20, m15 ; in27 in5 + punpckhwd m15, m20 ; in7 in25 + punpcklwd m20, m19, m16 ; in23 in9 + punpckhwd m16, m19 ; in11 in21 + punpcklwd m19, m18, m17 ; in19 in13 + punpckhwd m17, m18 ; in15 in17 +.main2: + ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a + ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a + ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a + ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a + ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a + ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a + ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a + ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a +.main3: + psubsw m11, m12, m17 ; t17 t30 + paddsw m12, m17 ; t16 t31 + psubsw m17, m15, m20 ; t18 t29 + paddsw m20, m15 ; t19 t28 + psubsw m15, m21, m16 ; t21 t26 + paddsw m21, m16 ; t20 t27 + psubsw m16, m14, m19 ; t22 t25 + paddsw m14, m19 ; t23 t24 + ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a + ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a + ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a + ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a + vpbroadcastd m8, [o(pw_m3784_1567)] + psubsw m19, m12, m20 ; t19a t28a + paddsw m20, m12 ; t16a t31a + psubsw m12, m14, m21 ; t20a t27a + paddsw m14, m21 ; t23a t24a + psubsw m21, m11, m17 ; t18 t29 + paddsw m11, m17 ; t17 t30 + psubsw m17, m16, m15 ; t21 t26 + paddsw m16, m15 ; t22 t25 + ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a + ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28 + ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27 + ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a + vbroadcasti32x4 m18, [o(deint_shuf)] + vpbroadcastd m8, [o(pw_m2896_2896)] + vpbroadcastd m9, [o(pw_2896_2896)] + psubsw m15, m20, m14 ; t23 t24 + paddsw m20, m14 ; t16 t31 + psubsw m14, m11, m16 ; t22a t25a + paddsw m11, m16 ; t17a t30a + psubsw m16, m21, m17 ; t21 t26 + paddsw m21, m17 ; t18 t29 + psubsw m17, m19, m12 ; t20a t27a + paddsw m19, m12 ; t19a t28a + REPX {pshufb x, m18}, m20, m11, m21, m19 + ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a + ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25 + packssdw m18, m13 ; t23a t22 + packssdw m12, m15 ; t24a t25 + ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a + ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27 + packssdw m16, m13 ; t20 t21a + packssdw m14, m15 ; t27 t26a + punpcklqdq m13, m19, m21 ; t19a t18 + punpckhqdq m19, m21 ; t28a t29 + punpcklqdq m21, m20, m11 ; t16 t17a + punpckhqdq m20, m11 ; t31 t30a + psubsw m15, m1, m19 ; out28 out29 + paddsw m1, m19 ; out3 out2 + psubsw m9, m6, m13 ; out19 out18 + paddsw m6, m13 ; out12 out13 + psubsw m10, m5, m16 ; out20 out21 + paddsw m5, m16 ; out11 out10 + psubsw m19, m3, m12 ; out24 out25 + paddsw m3, m12 ; out7 out6 + psubsw m8, m7, m21 ; out16 out17 + paddsw m7, m21 ; out15 out14 + psubsw m21, m0, m20 ; out31 out30 + paddsw m0, m20 ; out0 out1 + psubsw m11, m4, m18 ; out23 out22 + paddsw m4, m18 ; out8 out9 + psubsw m18, m2, m14 ; out27 out26 + paddsw m2, m14 ; out4 out5 +INIT_ZMM avx512icl + movu m16, [o(permD+3)] + vpermt2q m0, m16, m4 ; 0 1 8 9 + vpermt2q m8, m16, m19 ; 16 17 24 25 + vpermt2q m1, m16, m5 ; 3 2 11 10 + vpermt2q m9, m16, m18 ; 19 18 27 26 + vpermt2q m2, m16, m6 ; 4 5 12 13 + vpermt2q m10, m16, m15 ; 20 21 28 29 + vpermt2q m3, m16, m7 ; 7 6 15 14 + vpermt2q m11, m16, m21 ; 23 22 31 30 + vzeroupper + ret + +%macro LOAD_PACKED_16X2 3 ; dst, row[1-2] + vbroadcasti32x4 ym%1, [cq+16*%2] + vbroadcasti32x4 ym8, [cq+16*%3] + shufpd ym%1, ym8, 0x0c +%endmacro + +cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob +%undef cmp + test eobd, eobd + jz .dconly + lea r5, [o_base] + LOAD_PACKED_16X2 0, 0, 2 ; in0 in2 + LOAD_PACKED_16X2 1, 4, 6 ; in4 in6 + LOAD_PACKED_16X2 2, 8, 10 ; in8 in10 + LOAD_PACKED_16X2 3, 12, 14 ; in12 in14 + LOAD_PACKED_16X2 14, 1, 3 ; in1 in3 + LOAD_PACKED_16X2 15, 5, 7 ; in5 in7 + LOAD_PACKED_16X2 16, 9, 11 ; in9 in11 + LOAD_PACKED_16X2 17, 13, 15 ; in13 in15 + pxor m4, m4 + REPX {mova [cq+64*x], m4}, 0, 1, 2, 3 + cmp eobd, 107 + jb .fast + LOAD_PACKED_16X2 4, 16, 18 ; in16 in18 + LOAD_PACKED_16X2 5, 20, 22 ; in20 in22 + LOAD_PACKED_16X2 6, 24, 26 ; in24 in26 + LOAD_PACKED_16X2 7, 28, 30 ; in28 in30 + call m(idct_8x16_internal_8bpc).main + LOAD_PACKED_16X2 18, 19, 17 ; in19 in17 + LOAD_PACKED_16X2 19, 23, 21 ; in23 in21 + LOAD_PACKED_16X2 20, 27, 25 ; in27 in25 + LOAD_PACKED_16X2 21, 31, 29 ; in31 in29 + pxor m8, m8 + REPX {mova [cq+64*x], m8}, 4, 5, 6, 7 + call m(inv_txfm_add_dct_dct_8x32_8bpc).main + jmp .pass2 +.fast: ; bottom half is zero + mova ym5, ym4 + mova ym6, ym4 + mova ym7, ym4 + call m(idct_8x16_internal_8bpc).main + call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast +.pass2: + vpbroadcastd m12, [o(pw_8192)] + vshufi32x4 m7, m3, m11, q2020 ; 7 15 23 31 + vshufi32x4 m6, m3, m11, q3131 ; 6 14 22 30 + vshufi32x4 m5, m2, m10, q3131 ; 5 13 21 29 + vshufi32x4 m4, m2, m10, q2020 ; 4 12 20 28 + vshufi32x4 m3, m1, m9, q2020 ; 3 11 19 27 + vshufi32x4 m2, m1, m9, q3131 ; 2 10 18 26 + vshufi32x4 m1, m0, m8, q3131 ; 1 9 17 15 + vshufi32x4 m0, m8, q2020 ; 0 8 16 24 + REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 + call .main + vpbroadcastd m8, [o(pw_2048)] + REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 + lea r2, [strideq*3] + lea r3, [dstq+strideq*4] + movshdup m12, [o(permD)] + pmovzxbw m8, [dstq+strideq*0] + pmovzxbw m9, [dstq+strideq*1] + pmovzxbw m10, [dstq+strideq*2] + pmovzxbw m11, [dstq+r2 ] + paddw m0, m8 + paddw m1, m9 + paddw m2, m10 + paddw m3, m11 + pmovzxbw m8, [r3+strideq*0] + pmovzxbw m9, [r3+strideq*1] + pmovzxbw m10, [r3+strideq*2] + pmovzxbw m11, [r3+r2 ] + paddw m4, m8 + paddw m5, m9 + paddw m6, m10 + paddw m7, m11 + packuswb m0, m1 + packuswb m2, m3 + vpermq m0, m12, m0 + vpermq m2, m12, m2 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym2 + vextracti32x8 [dstq+r2 ], m2, 1 + packuswb m4, m5 + packuswb m6, m7 + vpermq m4, m12, m4 + vpermq m6, m12, m6 + mova [r3+strideq*0], ym4 + vextracti32x8 [r3+strideq*1], m4, 1 + mova [r3+strideq*2], ym6 + vextracti32x8 [r3+r2 ], m6, 1 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 8 +.dconly2: + imul r6d, 181 + add r6d, 128+512 + sar r6d, 8+2 +.dconly3: + imul r6d, 181 + add r6d, 128+2048 + sar r6d, 8+4 + pxor m2, m2 + vpbroadcastw m3, r6d +.dconly_loop: + mova ym1, [dstq+strideq*0] + vinserti32x8 m1, [dstq+strideq*1], 1 + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + paddw m0, m3 + paddw m1, m3 + packuswb m0, m1 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + lea dstq, [dstq+strideq*2] + sub r3d, 2 + jg .dconly_loop + RET +ALIGN function_align +.main: + vpbroadcastd m10, [o(pd_2048)] +.main2: + ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a + ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a + ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3 + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0 +.main3: + paddsw m8, m1, m5 ; t4 + psubsw m1, m5 ; t5a + paddsw m9, m7, m3 ; t7 + psubsw m7, m3 ; t6a + ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6 + psubsw m5, m0, m2 ; dct4 out2 + paddsw m2, m0 ; dct4 out1 + paddsw m0, m4, m6 ; dct4 out0 + psubsw m4, m6 ; dct4 out3 + psubsw m6, m2, m1 ; out6 + paddsw m1, m2 ; out1 + paddsw m2, m5, m7 ; out2 + psubsw m5, m7 ; out5 + psubsw m7, m0, m9 ; out7 + paddsw m0, m9 ; out0 + paddsw m3, m4, m8 ; out3 + psubsw m4, m8 ; out4 + ret + +cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c + vpbroadcastd m7, [pw_5] + paddsw m0, m7, [cq+64*0] + paddsw m1, m7, [cq+64*1] + vpbroadcastd ym9, strided + paddsw m2, m7, [cq+64*2] + paddsw m3, m7, [cq+64*3] + paddsw m4, m7, [cq+64*4] + paddsw m5, m7, [cq+64*5] + paddsw m6, m7, [cq+64*6] + paddsw m7, [cq+64*7] + pmulld ym14, ym9, [pd_0to15] + lea r3, [dstq+strideq*1] + lea r4, [dstq+strideq*2] + kxnorb k1, k1, k1 + pxor m13, m13 + add r1, r4 ; dstq+strideq*3 + kmovb k2, k1 + vpgatherdq m9{k1}, [r0+ym14*4] + kmovb k1, k2 + vpgatherdq m10{k2}, [r3+ym14*4] + kmovb k2, k1 + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8 + REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7 + vpgatherdq m11{k1}, [r4+ym14*4] + kmovb k1, k2 + vpgatherdq m12{k2}, [r1+ym14*4] + REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklbw m8, m9, m13 ; 0 8 16 24 + punpckhbw m9, m13 ; 4 12 20 28 + paddw m0, m8 + paddw m4, m9 + packuswb m0, m4 + kmovb k2, k1 + vpscatterdq [r0+ym14*4]{k1}, m0 + punpcklbw m8, m10, m13 ; 1 9 17 25 + punpckhbw m10, m13 ; 5 13 21 29 + paddw m1, m8 + paddw m5, m10 + packuswb m1, m5 + kmovb k1, k2 + vpscatterdq [r3+ym14*4]{k2}, m1 + punpcklbw m8, m11, m13 ; 2 10 18 26 + punpckhbw m11, m13 ; 6 14 22 30 + paddw m2, m8 + paddw m6, m11 + packuswb m2, m6 + kmovb k2, k1 + vpscatterdq [r4+ym14*4]{k1}, m2 + punpcklbw m8, m12, m13 ; 3 11 19 27 + punpckhbw m12, m13 ; 7 15 23 31 + paddw m3, m8 + paddw m7, m12 + packuswb m3, m7 + vpscatterdq [r1+ym14*4]{k2}, m3 + RET + +cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c + vpbroadcastd m0, [pw_4096] + pmulhrsw m3, m0, [cq+64*0] + pmulhrsw m4, m0, [cq+64*4] + pmulhrsw m6, m0, [cq+64*1] + pmulhrsw m5, m0, [cq+64*5] + pmulhrsw m7, m0, [cq+64*2] + pmulhrsw m2, m0, [cq+64*6] + pmulhrsw m8, m0, [cq+64*3] + pmulhrsw m0, [cq+64*7] + mova m13, [int8_permA] + lea r3, [strideq*3] + lea r4, [dstq+strideq*4] + punpckldq m1, m3, m4 + punpckhdq m3, m4 + punpckldq m4, m6, m5 + punpckhdq m6, m5 + punpckldq m5, m7, m2 + punpckhdq m7, m2 + punpckldq m2, m8, m0 + punpckhdq m8, m0 + mova ym9, [dstq+strideq*0] + vinserti32x8 m9, [dstq+strideq*2], 1 + mova ym10, [dstq+strideq*1] + vinserti32x8 m10, [dstq+r3 ], 1 + mova ym11, [r4+strideq*0] + vinserti32x8 m11, [r4+strideq*2], 1 + mova ym12, [r4+strideq*1] + vinserti32x8 m12, [r4+r3 ], 1 + REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8 + pxor m13, m13 + REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7 + punpcklqdq m0, m1, m4 ; a0 a2 c0 c2 + punpckhqdq m1, m4 ; b0 b2 d0 d2 + punpcklqdq m4, m5, m2 ; a1 a3 c1 c3 + punpckhqdq m5, m2 ; b1 b3 d1 d3 + punpcklqdq m2, m3, m6 ; e0 e2 g0 g2 + punpckhqdq m3, m6 ; f0 f2 h0 h2 + punpcklqdq m6, m7, m8 ; e1 e3 g1 g3 + punpckhqdq m7, m8 ; f1 f3 h1 h3 + punpcklbw m8, m9, m13 + punpckhbw m9, m13 + paddw m0, m8 + paddw m4, m9 + packuswb m0, m4 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*2], m0, 1 + punpcklbw m8, m10, m13 + punpckhbw m10, m13 + paddw m1, m8 + paddw m5, m10 + packuswb m1, m5 + mova [dstq+strideq*1], ym1 + vextracti32x8 [dstq+r3 ], m1, 1 + punpcklbw m8, m11, m13 + punpckhbw m11, m13 + paddw m2, m8 + paddw m6, m11 + packuswb m2, m6 + mova [r4+strideq*0], ym2 + vextracti32x8 [r4+strideq*2], m2, 1 + punpcklbw m8, m12, m13 + punpckhbw m12, m13 + paddw m3, m8 + paddw m7, m12 + packuswb m3, m7 + mova [r4+strideq*1], ym3 + vextracti32x8 [r4+r3 ], m3, 1 + RET + +%macro IDCT_16x32_END 3 ; src[1-2], row + mova xm8, [dstq+strideq*0] + vinserti32x4 ym8, [dstq+strideq*1], 1 + mova xm9, [dstq+r3 ] + vinserti32x4 ym9, [dstq+strideq*2], 1 + pmulhrsw m%1, m10 + pmulhrsw m%2, m10 + vpermb m8, m11, m8 + vpermb m9, m11, m9 + mova [cq+64*(%3*2+0)], m13 + mova [cq+64*(%3*2+1)], m13 + paddw m8, m%1 + paddw m9, m%2 + packuswb m8, m9 + vpermd m8, m12, m8 + mova [dstq+strideq*0], xm8 + vextracti32x4 [dstq+strideq*1], ym8, 1 + vextracti32x4 [dstq+strideq*2], m8, 2 + vextracti32x4 [dstq+r3 ], m8, 3 +%if %1 != 20 + lea dstq, [dstq+strideq*4] +%endif +%endmacro + +cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + vpbroadcastd m15, [o(pw_2896x8)] + cmp eobd, 151 + jb .fast + pmulhrsw m5, m15, [cq+64*10] + pmulhrsw m3, m15, [cq+64* 6] + pmulhrsw m1, m15, [cq+64* 2] + pmulhrsw m7, m15, [cq+64*14] + pmulhrsw m2, m15, [cq+64* 4] + pmulhrsw m6, m15, [cq+64*12] + pmulhrsw m0, m15, [cq+64* 0] + pmulhrsw m4, m15, [cq+64* 8] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + pmulhrsw m14, m15, [cq+64* 1] + pmulhrsw m21, m15, [cq+64*15] + pmulhrsw m18, m15, [cq+64* 9] + pmulhrsw m17, m15, [cq+64* 7] + pmulhrsw m16, m15, [cq+64* 5] + pmulhrsw m19, m15, [cq+64*11] + pmulhrsw m20, m15, [cq+64*13] + pmulhrsw m15, [cq+64* 3] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova m8, [o(idct_16x32p)] + vpbroadcastd m9, [o(pw_16384)] + REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \ + m14, m15, m16, m17, m18, m19, m20, m21 + punpckldq m8, m0, m1 + punpckhdq m0, m1 + punpckldq m1, m2, m3 + punpckhdq m2, m3 + REPX {pmulhrsw x, m9}, m8, m0, m1, m2 + punpckldq m3, m4, m5 + punpckhdq m4, m5 + punpckldq m5, m6, m7 + punpckhdq m6, m7 + REPX {pmulhrsw x, m9}, m3, m4, m5, m6 + punpckldq m7, m14, m15 + punpckhdq m14, m15 + punpckldq m15, m16, m17 + punpckhdq m16, m17 + REPX {pmulhrsw x, m9}, m7, m14, m15, m16 + punpckldq m17, m18, m19 + punpckhdq m18, m19 + punpckldq m19, m20, m21 + punpckhdq m20, m21 + REPX {pmulhrsw x, m9}, m17, m18, m19, m20 + punpcklqdq m21, m8, m1 + punpckhqdq m8, m1 + punpcklqdq m1, m0, m2 + punpckhqdq m0, m2 + punpcklqdq m2, m3, m5 + punpckhqdq m3, m5 + punpcklqdq m5, m4, m6 + punpckhqdq m4, m6 + punpcklqdq m6, m7, m15 + punpckhqdq m7, m15 + punpcklqdq m15, m14, m16 + punpckhqdq m14, m16 + punpcklqdq m16, m17, m19 + punpckhqdq m17, m19 + punpcklqdq m19, m18, m20 + punpckhqdq m18, m20 + vinserti32x8 m20, m21, ym2, 1 + vshufi32x4 m21, m2, q3232 + vinserti32x8 m2, m8, ym3, 1 + vshufi32x4 m8, m3, q3232 + vinserti32x8 m3, m1, ym5, 1 + vshufi32x4 m1, m5, q3232 + vinserti32x8 m5, m0, ym4, 1 + vshufi32x4 m0, m4, q3232 + vinserti32x8 m4, m6, ym16, 1 + vshufi32x4 m6, m16, q3232 + vinserti32x8 m16, m7, ym17, 1 + vshufi32x4 m7, m17, q3232 + vinserti32x8 m17, m15, ym19, 1 + vshufi32x4 m15, m19, q3232 + vinserti32x8 m19, m14, ym18, 1 + vshufi32x4 m14, m18, q3232 + vshufi32x4 m18, m21, m6, q3131 ; 27 5 + vshufi32x4 m21, m6, q2020 ; 31 1 + vshufi32x4 m6, m8, m7, q2020 ; 24 8 + vshufi32x4 m8, m7, q3131 ; 30 2 + vshufi32x4 m7, m1, m15, q2020 ; 28 4 + vshufi32x4 m1, m15, q3131 ; 6 26 + vshufi32x4 m15, m0, m14, q2020 ; 7 25 + vshufi32x4 m0, m14, q3131 ; 14 18 + vshufi32x4 m14, m20, m4, q2020 ; 3 29 + vshufi32x4 m20, m4, q3131 ; 23 9 + vshufi32x4 m9, m3, m17, q2020 ; 16 0 + vshufi32x4 m3, m17, q3131 ; 12 20 + vshufi32x4 m17, m5, m19, q2020 ; 15 17 + vshufi32x4 m5, m19, q3131 ; 22 10 + vshufi32x4 m19, m2, m16, q2020 ; 19 13 + vshufi32x4 m16, m2, m16, q3131 ; 11 21 + call m(idct_16x16_internal_8bpc).main3 + call .main_oddhalf + jmp .pass2 +.fast: ; right half is zero + mova ym8, [cq+64*15] + vinserti32x8 m8, [cq+64* 1], 1 + mova m2, [o(int16_perm)] + mova ym9, [cq+64* 8] + vinserti32x8 m9, [cq+64* 0], 1 + mova ym0, [cq+64* 7] + vinserti32x8 m0, [cq+64* 9], 1 + mova ym7, [cq+64*14] + vinserti32x8 m7, [cq+64* 2], 1 + mova ym1, [cq+64* 3] + vinserti32x8 m1, [cq+64*13], 1 + mova ym3, [cq+64* 6] + vinserti32x8 m3, [cq+64*10], 1 + mova ym5, [cq+64*11] + vinserti32x8 m5, [cq+64* 5], 1 + mova ym6, [cq+64*12] + vinserti32x8 m6, [cq+64* 4], 1 + REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6 + REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 + call m(idct_16x16_internal_8bpc).main2 + vbroadcasti32x4 m8, [o(int_shuf3)] + vbroadcasti32x4 m9, [o(int_shuf4)] + vpbroadcastd m11, [o(pw_16384)] + pshufb m0, m8 + pshufb m1, m9 + pshufb m2, m8 + pshufb m3, m9 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + pshufb m4, m8 + pshufb m5, m9 + pshufb m6, m8 + pshufb m7, m9 + REPX {pmulhrsw x, m11}, m4, m5, m6, m7 + punpckhdq m17, m0, m1 + punpckldq m0, m1 + punpckhdq m16, m2, m3 + punpckldq m2, m3 + punpckhdq m18, m4, m5 + punpckldq m4, m5 + punpckhdq m5, m6, m7 + punpckldq m6, m7 + vinserti32x8 m1, m0, ym2, 1 + vshufi32x4 m3, m0, m2, q3232 + vinserti32x8 m2, m4, ym6, 1 + vshufi32x4 m4, m6, q3232 + vinserti32x8 m15, m17, ym16, 1 + vshufi32x4 m17, m16, q3232 + vinserti32x8 m16, m18, ym5, 1 + vshufi32x4 m18, m5, q3232 + vshufi32x4 m0, m1, m2, q2020 ; 0 2 + vshufi32x4 m1, m2, q3131 ; 4 6 + vshufi32x4 m2, m3, m4, q2020 ; 8 10 + vshufi32x4 m3, m4, q3131 ; 12 14 + vshufi32x4 m14, m15, m16, q2020 ; 1 3 + vshufi32x4 m15, m16, q3131 ; 5 7 + vshufi32x4 m16, m17, m18, q2020 ; 9 11 + vshufi32x4 m17, m18, q3131 ; 13 15 + pxor m6, m6 + punpckhwd m8, m0, m0 + punpcklwd m9, m6, m0 + punpckhwd m0, m3, m3 + punpckhwd m5, m2, m2 + punpcklwd m7, m1, m1 + punpckhwd m1, m1 + punpcklwd m3, m3 + punpcklwd m6, m2 + call m(idct_16x16_internal_8bpc).main_fast5 + punpcklwd m21, m14, m14 + punpckhwd m14, m14 + punpcklwd m18, m15, m15 + punpckhwd m15, m15 + punpcklwd m20, m16, m16 + punpckhwd m16, m16 + punpcklwd m19, m17, m17 + punpckhwd m17, m17 + call .main_oddhalf_fast +.pass2: + vpbroadcastd m10, [o(pw_2048)] + mova m11, [o(end_16x32p)] + lea r3, [strideq*3] + pxor m13, m13 + psrld m12, m11, 8 + IDCT_16x32_END 0, 1, 0 + IDCT_16x32_END 2, 3, 1 + IDCT_16x32_END 4, 5, 2 + IDCT_16x32_END 6, 7, 3 + IDCT_16x32_END 14, 15, 4 + IDCT_16x32_END 16, 17, 5 + IDCT_16x32_END 18, 19, 6 + IDCT_16x32_END 20, 21, 7 + RET +ALIGN function_align +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 32 + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly +ALIGN function_align +.main_oddhalf_fast2: ; bottom three-quarters are zero + vpbroadcastd m8, [o(pw_201_4091x8)] + vpbroadcastd m20, [o(pw_m1380_3857x8)] + vpbroadcastd m9, [o(pw_995_3973x8)] + vpbroadcastd m16, [o(pw_m601_4052x8)] + pmulhrsw m21, m8 ; t16a, t31a + pmulhrsw m20, m15 ; t19a, t28a + pmulhrsw m18, m9 ; t20a, t27a + pmulhrsw m14, m16 ; t23a, t24a + mova m8, m21 + mova m17, m20 + mova m15, m18 + mova m16, m14 + jmp .main3 +ALIGN function_align +.main_oddhalf_fast: ; bottom half is zero + vpbroadcastd m8, [o(pw_201_4091x8)] + vpbroadcastd m9, [o(pw_m2751_3035x8)] + vpbroadcastd m11, [o(pw_1751_3703x8)] + vpbroadcastd m12, [o(pw_m1380_3857x8)] + pmulhrsw m21, m8 ; t16a, t31a + vpbroadcastd m8, [o(pw_995_3973x8)] + pmulhrsw m17, m9 ; t17a, t30a + vpbroadcastd m9, [o(pw_m2106_3513x8)] + pmulhrsw m20, m11 ; t18a, t29a + vpbroadcastd m11, [o(pw_2440_3290x8)] + pmulhrsw m15, m12 ; t19a, t28a + vpbroadcastd m12, [o(pw_m601_4052x8)] + pmulhrsw m18, m8 ; t20a, t27a + pmulhrsw m16, m9 ; t21a, t26a + pmulhrsw m19, m11 ; t22a, t25a + pmulhrsw m14, m12 ; t23a, t24a + jmp .main2 +ALIGN function_align +.main_oddhalf: + ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a + ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a + ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a + ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a + ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a + ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a + ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a + ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a +.main2: + psubsw m8, m21, m17 ; t17 t30 + paddsw m21, m17 ; t16 t31 + psubsw m17, m15, m20 ; t18 t29 + paddsw m20, m15 ; t19 t28 + psubsw m15, m18, m16 ; t21 t26 + paddsw m18, m16 ; t20 t27 + psubsw m16, m14, m19 ; t22 t25 + paddsw m14, m19 ; t23 t24 +.main3: + ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a + ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a + ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a + ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a + vpbroadcastd m11, [o(pw_m3784_1567)] + psubsw m19, m21, m20 ; t19a t28a + paddsw m21, m20 ; t16a t31a + psubsw m20, m14, m18 ; t20a t27a + paddsw m14, m18 ; t23a t24a + psubsw m18, m8, m17 ; t18 t29 + paddsw m8, m17 ; t17 t30 + psubsw m17, m16, m15 ; t21 t26 + paddsw m15, m16 ; t22 t25 + ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a + ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28 + ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27 + ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a + vbroadcasti32x4 m9, [o(deint_shuf)] + psubsw m16, m21, m14 ; t23 t24 + paddsw m14, m21 ; t16 t31 + psubsw m21, m8, m15 ; t22a t25a + paddsw m15, m8 ; t17a t30a + psubsw m8, m18, m17 ; t21 t26 + paddsw m18, m17 ; t18 t29 + paddsw m17, m19, m20 ; t19a t28a + psubsw m19, m20 ; t20a t27a + vpbroadcastd m11, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + REPX {pshufb x, m9}, m14, m15, m18, m17 + mova m9, m10 + vpdpwssd m9, m16, m11 + mova m20, m10 + vpdpwssd m20, m21, m11 + psrad m9, 12 + psrad m20, 12 + packssdw m9, m20 ; t23a t22 + mova m20, m10 + vpdpwssd m20, m16, m12 + mova m16, m10 + vpdpwssd m16, m21, m12 + psrad m20, 12 + psrad m16, 12 + packssdw m16, m20, m16 ; t24a t25 + ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a + ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27 + packssdw m11, m20 ; t27 t26a + packssdw m8, m21 ; t20 t21a + punpcklqdq m20, m14, m15 ; t16 t17a + punpckhqdq m14, m15 ; t31 t30a + punpckhqdq m15, m17, m18 ; t28a t29 + punpcklqdq m17, m18 ; t19a t18 + psubsw m21, m0, m14 ; out31 out30 + paddsw m0, m14 ; out0 out1 + psubsw m14, m7, m20 ; out16 out17 + paddsw m7, m20 ; out15 out14 + psubsw m20, m1, m15 ; out28 out29 + paddsw m1, m15 ; out3 out2 + psubsw m15, m6, m17 ; out19 out18 + paddsw m6, m17 ; out12 out13 + psubsw m17, m4, m9 ; out23 out22 + paddsw m4, m9 ; out8 out9 + psubsw m18, m3, m16 ; out24 out25 + paddsw m3, m16 ; out7 out6 + psubsw m16, m5, m8 ; out20 out21 + paddsw m5, m8 ; out11 out10 + psubsw m19, m2, m11 ; out27 out26 + paddsw m2, m11 ; out4 out5 + ret + +cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + mova m21, [o(permB)] + vpermq m1, m21, [cq+64* 0] ; 0 1 + vpermq m14, m21, [cq+64* 1] ; 2 3 + vpermq m20, m21, [cq+64* 2] ; 4 5 + vpermq m15, m21, [cq+64* 3] ; 6 7 + vpbroadcastd m8, [o(pw_2896x8)] + vpermq m2, m21, [cq+64* 4] ; 8 9 + vpermq m16, m21, [cq+64* 5] ; 10 11 + vpermq m3, m21, [cq+64* 6] ; 12 13 + vpermq m17, m21, [cq+64* 7] ; 14 15 + REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17 + pxor m12, m12 + REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7 + cmp eobd, 151 + jb .fast + vpermq m9, m21, [cq+64* 8] ; 16 17 + vpermq m19, m21, [cq+64* 9] ; 18 19 + vpermq m4, m21, [cq+64*10] ; 20 21 + vpermq m5, m21, [cq+64*11] ; 22 23 + vpermq m6, m21, [cq+64*12] ; 24 25 + vpermq m18, m21, [cq+64*13] ; 26 27 + vpermq m7, m21, [cq+64*14] ; 28 29 + vpermq m21, m21, [cq+64*15] ; 30 31 + REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21 + REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15 + punpcklwd m8, m21, m14 ; 30 2 + punpckhwd m21, m1 ; 31 1 + punpcklwd m0, m17, m19 ; 14 18 + punpckhwd m17, m9 ; 15 17 + punpcklwd m9, m1 ; 16 0 + punpckhwd m14, m7 ; 3 29 + punpcklwd m1, m15, m18 ; 6 26 + punpckhwd m15, m6 ; 7 25 + punpcklwd m6, m2 ; 24 8 + punpckhwd m19, m3 ; 19 13 + punpcklwd m3, m4 ; 12 20 + punpckhwd m18, m20 ; 27 5 + punpcklwd m7, m20 ; 28 4 + punpckhwd m20, m5, m2 ; 23 9 + punpcklwd m5, m16 ; 22 10 + punpckhwd m16, m4 ; 11 21 + call m(idct_16x16_internal_8bpc).main2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf + jmp .pass2 +.fast: ; bottom half zero + punpcklwd m8, m14, m14 ; 2 + punpcklwd m0, m17, m17 ; 14 + punpcklwd m5, m16, m16 ; 10 + punpcklwd m9, m12, m1 ; __ 0 + punpckhwd m21, m1, m1 ; 1 + punpcklwd m1, m15, m15 ; 6 + punpcklwd m7, m20, m20 ; 4 + punpckhwd m19, m3, m3 ; 13 + punpcklwd m3, m3 ; 12 + punpcklwd m6, m12, m2 ; __ 8 + punpckhwd m18, m20, m20 ; 5 + punpckhwd m20, m2, m2 ; 9 + call m(idct_16x16_internal_8bpc).main_fast + punpckhwd m15, m15 ; 7 + punpckhwd m14, m14 ; 3 + punpckhwd m16, m16 ; 11 + punpckhwd m17, m17 ; 15 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast +.pass2: + vpbroadcastd m9, [o(pw_16384)] + call .transpose_round + vshufi32x4 m16, m14, m2, q3131 ; 5 + vshufi32x4 m14, m2, q2020 ; 1 + vshufi32x4 m2, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m1, m18, q3131 ; 6 + vshufi32x4 m1, m18, q2020 ; 2 + vshufi32x4 m18, m20, m6, q2020 ; 9 + vshufi32x4 m20, m6, q3131 ; 13 + vshufi32x4 m6, m21, m4, q3131 ; 12 + vshufi32x4 m4, m21, m4, q2020 ; 8 + vshufi32x4 m21, m19, m7, q3131 ; 15 + vshufi32x4 m19, m7, q2020 ; 11 + vshufi32x4 m7, m5, m15, q3131 ; 14 + vshufi32x4 m5, m15, q2020 ; 10 + vshufi32x4 m15, m17, m9, q2020 ; 3 + vshufi32x4 m17, m9, q3131 ; 7 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 + call .main_oddhalf + vpbroadcastd m12, [o(pw_2048)] + movshdup m13, [o(permD)] + lea r2, [strideq*3] + pmovzxbw m8, [dstq+strideq*0] + pmovzxbw m9, [dstq+strideq*1] + pmovzxbw m10, [dstq+strideq*2] + pmovzxbw m11, [dstq+r2 ] + REPX {pmulhrsw x, m12}, m0, m1, m2, m3 + lea r3, [dstq+strideq*4] + paddw m0, m8 + paddw m1, m9 + paddw m2, m10 + paddw m3, m11 + pmovzxbw m8, [r3+strideq*0] + pmovzxbw m9, [r3+strideq*1] + pmovzxbw m10, [r3+strideq*2] + pmovzxbw m11, [r3+r2 ] + REPX {pmulhrsw x, m12}, m4, m5, m6, m7 + lea r4, [dstq+strideq*8] + packuswb m0, m1 + paddw m4, m8 + paddw m5, m9 + packuswb m2, m3 + paddw m6, m10 + paddw m7, m11 + pmovzxbw m8, [r4+strideq*0] + pmovzxbw m9, [r4+strideq*1] + pmovzxbw m10, [r4+strideq*2] + pmovzxbw m11, [r4+r2 ] + REPX {pmulhrsw x, m12}, m14, m15, m16, m17 + lea r5, [r3+strideq*8] + packuswb m4, m5 + paddw m14, m8 + paddw m15, m9 + packuswb m6, m7 + paddw m16, m10 + paddw m17, m11 + pmovzxbw m8, [r5+strideq*0] + pmovzxbw m9, [r5+strideq*1] + pmovzxbw m10, [r5+strideq*2] + pmovzxbw m11, [r5+r2 ] + REPX {pmulhrsw x, m12}, m18, m19, m20, m21 + packuswb m14, m15 + paddw m18, m8 + paddw m19, m9 + packuswb m16, m17 + paddw m20, m10 + paddw m21, m11 + packuswb m18, m19 + packuswb m20, m21 + REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20 + mova [dstq+strideq*0], ym0 + vextracti32x8 [dstq+strideq*1], m0, 1 + mova [dstq+strideq*2], ym2 + vextracti32x8 [dstq+r2 ], m2, 1 + mova [r3+strideq*0], ym4 + vextracti32x8 [r3+strideq*1], m4, 1 + mova [r3+strideq*2], ym6 + vextracti32x8 [r3+r2 ], m6, 1 + mova [r4+strideq*0], ym14 + vextracti32x8 [r4+strideq*1], m14, 1 + mova [r4+strideq*2], ym16 + vextracti32x8 [r4+r2 ], m16, 1 + mova [r5+strideq*0], ym18 + vextracti32x8 [r5+strideq*1], m18, 1 + mova [r5+strideq*2], ym20 + vextracti32x8 [r5+r2 ], m20, 1 + RET +ALIGN function_align +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 16 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 +ALIGN function_align +.main_oddhalf_fast2: ; bottom three-quarters are zero + vpbroadcastd m9, [o(pw_2896x8)] + vpbroadcastd m2, [o(pw_4017x8)] + vpbroadcastd m3, [o(pw_799x8)] + vpbroadcastd m18, [o(pw_4076x8)] + vpbroadcastd m19, [o(pw_401x8)] + vpbroadcastd m20, [o(pw_m1189x8)] + vpbroadcastd m16, [o(pw_3920x8)] + pmulhrsw m9, m0 ; t0 + pmulhrsw m2, m1 ; t7a + pmulhrsw m1, m3 ; t4a + pmulhrsw m18, m14 ; t15a + pmulhrsw m14, m19 ; t8a + pmulhrsw m20, m15 ; t11a + pmulhrsw m15, m16 ; t12a + psubsw m7, m9, m2 ; idct8 out7 + paddsw m0, m9, m2 ; idct8 out0 + psubsw m4, m9, m1 ; idct8 out4 + paddsw m3, m9, m1 ; idct8 out3 + ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6 + mova m21, m18 + mova m19, m14 + mova m16, m15 + mova m8, m20 + psubsw m6, m9, m1 ; idct8 out6 + paddsw m1, m9 ; idct8 out1 + psubsw m5, m9, m2 ; idct8 out5 + paddsw m2, m9 ; idct8 out2 + jmp .main3 +ALIGN function_align +.main_oddhalf_fast: ; bottom half is zero + vpbroadcastd m5, [o(pw_m2276x8)] + vpbroadcastd m11, [o(pw_3406x8)] + vpbroadcastd m7, [o(pw_4017x8)] + vpbroadcastd m12, [o(pw_799x8)] + vpbroadcastd m6, [o(pw_3784x8)] + vpbroadcastd m10, [o(pw_1567x8)] + vpbroadcastd m4, [o(pw_2896x8)] + pmulhrsw m5, m3 ; t5a + pmulhrsw m3, m11 ; t6a + pmulhrsw m7, m1 ; t7a + pmulhrsw m1, m12 ; t4a + pmulhrsw m6, m2 ; t3 + pmulhrsw m2, m10 ; t2 + pmulhrsw m4, m0 ; t0 + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + vpbroadcastd m10, [o(pd_2048)] + mova m0, m4 ; t1 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main3 + vpbroadcastd m21, [o(pw_4076x8)] + vpbroadcastd m8, [o(pw_401x8)] + vpbroadcastd m18, [o(pw_m2598x8)] + vpbroadcastd m9, [o(pw_3166x8)] + vpbroadcastd m19, [o(pw_3612x8)] + vpbroadcastd m11, [o(pw_1931x8)] + vpbroadcastd m20, [o(pw_m1189x8)] + vpbroadcastd m12, [o(pw_3920x8)] + pmulhrsw m21, m14 ; t15a + pmulhrsw m14, m8 ; t8a + pmulhrsw m18, m17 ; t9a + pmulhrsw m17, m9 ; t14a + pmulhrsw m19, m16 ; t13a + pmulhrsw m16, m11 ; t10a + pmulhrsw m20, m15 ; t11a + pmulhrsw m15, m12 ; t12a + jmp .main2 +ALIGN function_align +.main_oddhalf: + ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a + ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a + ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a + ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a +.main2: + paddsw m8, m20, m16 ; t11 + psubsw m20, m16 ; t10 + paddsw m16, m15, m19 ; t12 + psubsw m15, m19 ; t13 + psubsw m19, m14, m18 ; t9 + paddsw m14, m18 ; t8 + psubsw m18, m21, m17 ; t14 + paddsw m21, m17 ; t15 +.main3: + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a + vpbroadcastd m11, [o(pw_m1567_m3784)] + ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + psubsw m17, m14, m8 ; t11a + paddsw m8, m14 ; t8a + paddsw m14, m18, m15 ; t9 + psubsw m18, m15 ; t10 + psubsw m15, m19, m20 ; t13 + paddsw m19, m20 ; t14 + paddsw m20, m21, m16 ; t15a + psubsw m16, m21, m16 ; t12a + ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a + ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12 + psubsw m21, m0, m20 ; out15 + paddsw m0, m20 ; out0 + psubsw m20, m1, m19 ; out14 + paddsw m1, m19 ; out1 + psubsw m19, m2, m18 ; out13 + paddsw m2, m18 ; out2 + psubsw m18, m3, m17 ; out12 + paddsw m3, m17 ; out3 + psubsw m17, m4, m16 ; out11 + paddsw m4, m16 ; out4 + psubsw m16, m5, m15 ; out10 + paddsw m5, m15 ; out5 + psubsw m15, m6, m14 ; out9 + paddsw m6, m14 ; out6 + psubsw m14, m7, m8 ; out8 + paddsw m7, m8 ; out7 + ret +.transpose_round: + punpcklwd m8, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m1, m3 + punpckhwd m1, m3 + punpcklwd m3, m4, m6 + punpckhwd m4, m6 + punpcklwd m6, m5, m7 + punpckhwd m5, m7 + punpcklwd m7, m14, m16 + punpckhwd m14, m16 + punpcklwd m16, m15, m17 + punpckhwd m15, m17 + punpcklwd m17, m19, m21 + punpckhwd m19, m21 + punpckhwd m21, m18, m20 + punpcklwd m18, m20 + punpcklwd m20, m8, m1 + punpckhwd m8, m1 + punpcklwd m1, m0, m2 + punpckhwd m0, m2 + punpcklwd m2, m3, m5 + punpckhwd m3, m5 + punpcklwd m5, m4, m6 + punpckhwd m4, m6 + REPX {pmulhrsw x, m9}, m20, m8, m1, m0 + punpcklwd m6, m7, m15 + punpckhwd m7, m15 + punpcklwd m15, m14, m16 + punpckhwd m14, m16 + REPX {pmulhrsw x, m9}, m2, m3, m5, m4 + punpckhwd m16, m18, m19 + punpcklwd m18, m19 + punpcklwd m19, m21, m17 + punpckhwd m21, m17 + REPX {pmulhrsw x, m9}, m6, m7, m15, m14 + punpcklwd m17, m8, m0 ; a2 a6 aa ae + punpckhwd m8, m0 ; a3 a7 ab af + punpcklwd m0, m20, m1 ; a0 a4 a8 ac + punpckhwd m20, m1 ; a1 a5 a9 ad + REPX {pmulhrsw x, m9}, m16, m18, m19, m21 + punpcklwd m1, m2, m5 ; b0 b4 b8 bc + punpckhwd m2, m5 ; b1 b5 b9 bd + punpcklwd m5, m3, m4 ; b2 b6 ba be + punpckhwd m3, m4 ; b3 b7 bb bf + punpcklwd m4, m6, m15 ; c0 c4 c8 cc + punpckhwd m6, m15 ; c1 c5 c9 cd + punpcklwd m15, m7, m14 ; c2 c6 ca ce + punpckhwd m7, m14 ; c3 c7 cb cf + punpcklwd m14, m18, m19 ; d0 d4 d8 dc + punpckhwd m18, m19 ; d1 d5 d9 dd + punpcklwd m9, m16, m21 ; d2 d6 da de + punpckhwd m16, m21 ; d3 d7 db df + vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc + vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4 + vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6 + vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be + vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7 + vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf + vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4 + vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc + vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5 + vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd + vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5 + vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd + vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6 + vshufi32x4 m15, m9, q3232 ; ca ce da de + vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7 + vshufi32x4 m7, m16, q3232 ; cb cf db df + ret + +%macro IDTX_16x32 4 ; src/dst[1-4] + pmulhrsw m%1, m15, [cq+64*%1] + pmulhrsw m%2, m15, [cq+64*%2] + pmulhrsw m%3, m15, [cq+64*%3] + pmulhrsw m%4, m15, [cq+64*%4] + pmulhrsw m18, m16, m%1 + pmulhrsw m19, m16, m%2 + pmulhrsw m20, m16, m%3 + pmulhrsw m21, m16, m%4 + REPX {pmulhrsw x, m17}, m18, m19, m20, m21 + paddsw m%1, m18 + paddsw m%2, m19 + paddsw m%3, m20 + paddsw m%4, m21 +%endmacro + +%macro IDTX_16x32_STORE 2 ; src[1-2] + mova xm17, [dstq+r3*0] + vinserti128 ym17, [dstq+r3*4], 1 + vinserti32x4 m17, [dstq+r3*8], 2 + vinserti32x4 m17, [dstq+r4*8], 3 + mova [cq+64*(%1*2+0)], m18 + mova [cq+64*(%1*2+1)], m18 + punpcklbw m16, m17, m18 + punpckhbw m17, m18 + paddw m16, m%1 + paddw m17, m%2 + packuswb m16, m17 + mova [dstq+r3*0], xm16 + vextracti128 [dstq+r3*4], ym16, 1 + vextracti32x4 [dstq+r3*8], m16, 2 + vextracti32x4 [dstq+r4*8], m16, 3 +%if %1 != 7 + add dstq, strideq +%endif +%endmacro + +cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c + vpbroadcastd m15, [pw_2896x8] + vpbroadcastd m16, [pw_1697x16] + vpbroadcastd m17, [pw_16384] + IDTX_16x32 0, 1, 2, 3 + IDTX_16x32 4, 5, 6, 7 + IDTX_16x32 8, 9, 10, 11 + IDTX_16x32 12, 13, 14, 15 + vpbroadcastd m16, [pw_8192] + call .transpose_2x8x8_round + lea r3, [strideq*2] + lea r4, [strideq*3] + pxor m18, m18 + IDTX_16x32_STORE 0, 8 + IDTX_16x32_STORE 1, 9 + IDTX_16x32_STORE 2, 10 + IDTX_16x32_STORE 3, 11 + IDTX_16x32_STORE 4, 12 + IDTX_16x32_STORE 5, 13 + IDTX_16x32_STORE 6, 14 + IDTX_16x32_STORE 7, 15 + RET +ALIGN function_align +.transpose_2x8x8_round: + punpckhwd m17, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m17, m1 + punpckhdq m17, m1 + REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m17 + punpcklqdq m6, m17 + punpckhwd m17, m12, m13 + punpcklwd m12, m13 + punpckhwd m13, m8, m9 + punpcklwd m8, m9 + punpckhwd m9, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m10, m11 + punpcklwd m10, m11 + punpckhdq m11, m8, m10 + punpckldq m8, m10 + punpckldq m10, m12, m14 + punpckhdq m12, m14 + punpckhdq m14, m13, m15 + punpckldq m13, m15 + punpckldq m15, m17, m9 + punpckhdq m17, m9 + REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17 + punpckhqdq m9, m8, m10 + punpcklqdq m8, m10 + punpcklqdq m10, m11, m12 + punpckhqdq m11, m12 + punpcklqdq m12, m13, m15 + punpckhqdq m13, m15 + punpckhqdq m15, m14, m17 + punpcklqdq m14, m17 + ret + +%macro IDTX_32x16 4 ; dst[1-4] + pmulhrsw m%2, m12, [cq+32*(%1+ 0)] + pmulhrsw m18, m12, [cq+32*(%1+16)] + pmulhrsw m%4, m12, [cq+32*(%3+ 0)] + pmulhrsw m19, m12, [cq+32*(%3+16)] + REPX {paddsw x, x}, m%2, m18, m%4, m19 + mova m%1, m14 + vpermi2q m%1, m%2, m18 + vpermt2q m%2, m16, m18 +%if %3 != 14 + mova m%3, m14 +%endif + vpermi2q m%3, m%4, m19 + vpermt2q m%4, m16, m19 + pmulhrsw m18, m17, m%1 + pmulhrsw m19, m17, m%2 + pmulhrsw m20, m17, m%3 + pmulhrsw m21, m17, m%4 + REPX {paddsw x, x}, m%1, m%2, m%3, m%4 + paddsw m%1, m18 + paddsw m%2, m19 + paddsw m%3, m20 + paddsw m%4, m21 +%endmacro + +%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32 + mova ym19, [dstq+strideq*0] + vinserti32x8 m19, [dstq+strideq*8], 1 +%if %3 == 0 + mova [cq+64*(%1*2+0)], m20 + mova [cq+64*(%1*2+1)], m20 +%endif + punpcklbw m18, m19, m20 + punpckhbw m19, m20 + paddw m18, m%1 + paddw m19, m%2 + packuswb m18, m19 + mova [dstq+strideq*0], ym18 + vextracti32x8 [dstq+strideq*8], m18, 1 +%if %3 || %1 != 7 + add dstq, strideq +%endif +%endmacro + +cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c + vpbroadcastd m12, [pw_2896x8] + movu m14, [permB+7] + vpbroadcastd m17, [pw_1697x16] + psrlq m16, m14, 4 + IDTX_32x16 0, 1, 2, 3 + IDTX_32x16 4, 5, 6, 7 + IDTX_32x16 8, 9, 10, 11 + IDTX_32x16 12, 13, 14, 15 + vpbroadcastd m16, [pw_2048] + call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round + pxor m20, m20 + IDTX_32x16_STORE 0, 8 + IDTX_32x16_STORE 1, 9 + IDTX_32x16_STORE 2, 10 + IDTX_32x16_STORE 3, 11 + IDTX_32x16_STORE 4, 12 + IDTX_32x16_STORE 5, 13 + IDTX_32x16_STORE 6, 14 + IDTX_32x16_STORE 7, 15 + RET + +%macro IDCT_32x32_END 4 ; src, mem, stride[1-2] + pmovzxbw m10, [dstq+%3] + pmovzxbw m11, [r3 +%4] +%if %2 < 8 + paddsw m8, m%2, m%1 + psubsw m9, m%2, m%1 +%else + mova m9, [cq+64*(%2*2-16)] + paddsw m8, m9, m%1 + psubsw m9, m%1 +%endif + pmulhrsw m8, m12 + pmulhrsw m9, m12 +%if %2 >= 8 +%if %2 == 8 + pxor m0, m0 +%endif + mova [cq+64*(%2*2-16)], m0 + mova [cq+64*(%2*2-15)], m0 +%endif + paddw m8, m10 + paddw m9, m11 + packuswb m8, m9 + vpermq m8, m13, m8 + mova [dstq+%3], ym8 + vextracti32x8 [r3 +%4], m8, 1 +%if %2 == 3 || %2 == 7 || %2 == 11 + add dstq, r5 + sub r3, r5 +%endif +%endmacro + +cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + WIN64_SPILL_XMM 30 + cmp eobd, 136 + jb .fast + mova m5, [cq+64*20] + mova m3, [cq+64*12] + mova m1, [cq+64* 4] + mova m7, [cq+64*28] + mova m2, [cq+64* 8] + mova m6, [cq+64*24] + mova m0, [cq+64* 0] + mova m4, [cq+64*16] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + mova m14, [cq+64* 2] + mova m21, [cq+64*30] + mova m18, [cq+64*18] + mova m17, [cq+64*14] + mova m16, [cq+64*10] + mova m19, [cq+64*22] + mova m20, [cq+64*26] + mova m15, [cq+64* 6] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + mova m22, [cq+64* 1] + mova m21, [cq+64*31] + mova m14, [cq+64*17] + mova m29, [cq+64*15] + mova m26, [cq+64* 9] + mova m17, [cq+64*23] + mova m18, [cq+64*25] + mova m25, [cq+64* 7] + mova m24, [cq+64* 5] + mova m19, [cq+64*27] + mova m16, [cq+64*21] + mova m27, [cq+64*11] + mova m28, [cq+64*13] + mova m15, [cq+64*19] + mova m20, [cq+64*29] + mova m23, [cq+64* 3] + call .main_oddhalf + vpbroadcastd m10, [o(pw_8192)] + psubsw m13, m0, m29 ; 31 + paddsw m0, m29 ; 0 + psubsw m29, m1, m28 ; 30 + paddsw m1, m28 ; 1 + psubsw m28, m2, m27 ; 29 + paddsw m2, m27 ; 2 + psubsw m27, m3, m26 ; 28 + paddsw m3, m26 ; 3 + psubsw m26, m4, m25 ; 27 + paddsw m4, m25 ; 4 + psubsw m25, m5, m24 ; 26 + paddsw m5, m24 ; 5 + psubsw m24, m6, m23 ; 25 + paddsw m6, m23 ; 6 + psubsw m23, m7, m22 ; 24 + paddsw m7, m22 ; 7 + pxor m9, m9 + punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 + REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 + punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 + punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 + punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 + punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 + REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 + punpckhwd m3, m23, m24 + punpcklwd m23, m24 + punpckhwd m24, m25, m26 + punpcklwd m25, m26 + REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 + punpckhwd m26, m27, m28 + punpcklwd m27, m28 + punpckhwd m28, m29, m13 + punpcklwd m29, m13 + REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 + punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 + punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 + punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 + REPX {pmulhrsw x, m10}, m0, m4, m8, m22 + punpckhdq m13, m23, m25 + punpckldq m23, m25 + punpckhdq m25, m27, m29 + punpckldq m27, m29 + REPX {pmulhrsw x, m10}, m13, m23, m25, m27 + punpckhdq m9, m3, m24 + punpckldq m3, m24 + punpckhdq m24, m26, m28 + punpckldq m26, m28 + punpcklqdq m5, m23, m27 ; d00 d08 d16 d24 + punpckhqdq m23, m27 ; d01 d09 d17 d25 + punpckhqdq m27, m13, m25 ; d03 d11 d19 d27 + punpcklqdq m13, m25 ; d02 d10 d18 d26 + punpckhqdq m25, m3, m26 ; d05 d13 d21 d29 + punpcklqdq m3, m26 ; d04 d12 d20 d28 + punpckhqdq m26, m9, m24 ; d07 d15 d23 d31 + punpcklqdq m9, m24 ; d06 d14 d22 d30 + REPX {pmulhrsw x, m10}, m25, m3, m26 + mova [cq+64* 9], m23 + mova [cq+64*11], m27 + mova [cq+64*13], m25 + mova [cq+64*15], m26 + punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 + punpcklqdq m8, m22 ; a04 a12 a20 a28 + punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 + punpcklqdq m0, m4 ; a00 a08 a16 a24 + punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 + punpcklqdq m7, m2 ; a02 a10 a18 a26 + punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 + punpcklqdq m6, m1 ; a06 a14 a22 a30 + mova m2, [cq+64* 0] + mova m11, [cq+64* 2] + mova m12, [cq+64* 4] + mova m29, [cq+64* 6] + mova m27, [cq+64* 8] + mova m26, [cq+64*10] + mova m4, [cq+64*12] + mova m28, [cq+64*14] + psubsw m1, m2, m21 ; 23 + paddsw m2, m21 ; 8 + psubsw m21, m11, m20 ; 22 + paddsw m11, m20 ; 9 + psubsw m20, m12, m19 ; 21 + paddsw m12, m19 ; 10 + psubsw m19, m29, m18 ; 20 + paddsw m29, m18 ; 11 + psubsw m18, m27, m17 ; 19 + paddsw m27, m17 ; 12 + psubsw m17, m26, m16 ; 18 + paddsw m26, m16 ; 13 + paddsw m16, m4, m15 ; 14 + psubsw m4, m15 ; 17 + pmulhrsw m15, m6, m10 + psubsw m6, m28, m14 ; 16 + paddsw m28, m14 ; 15 + pmulhrsw m14, m7, m10 + punpcklwd m7, m6, m4 + punpckhwd m6, m4 + punpckhwd m4, m17, m18 + punpcklwd m17, m18 + punpckhwd m18, m19, m20 + punpcklwd m19, m20 + punpckhwd m20, m21, m1 + punpcklwd m21, m1 + punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 + punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 + punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 + punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 + punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 + punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 + punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 + punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 + pmulhrsw m23, m10 + pmulhrsw m25, m10 + punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1 + REPX {pmulhrsw x, m10}, m28, m2, m12, m27 + punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 + punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 + punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 + punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 + REPX {pmulhrsw x, m10}, m16, m1, m11, m29 + punpckhdq m26, m19, m21 + punpckldq m19, m21 + punpckhdq m21, m6, m4 + punpckldq m6, m4 + REPX {pmulhrsw x, m10}, m26, m19, m21, m6 + punpckhdq m4, m18, m20 + punpckldq m18, m20 + punpckhdq m20, m7, m17 + punpckldq m7, m17 + REPX {pmulhrsw x, m10}, m4, m18, m20, m7 + punpcklqdq m17, m28, m12 ; b02 b10 b18 b26 + punpckhqdq m28, m12 ; b03 b11 b19 b27 + punpckhqdq m12, m2, m27 ; b01 b09 b17 b25 + punpcklqdq m2, m27 ; b00 b08 b16 b24 + punpckhqdq m27, m1, m29 ; b05 b13 b21 b29 + punpcklqdq m1, m29 ; b04 b12 b20 b28 + punpckhqdq m29, m16, m11 ; b07 b15 b23 b31 + punpcklqdq m16, m11 ; b06 b14 b22 b30 + mova [cq+64* 1], m12 + mova [cq+64* 3], m28 + mova [cq+64* 5], m27 + mova [cq+64* 7], m29 + punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 + punpcklqdq m20, m26 ; c02 c10 c18 c26 + punpckhqdq m26, m7, m19 ; c01 c09 c17 c25 + punpcklqdq m7, m19 ; c00 c08 c16 c24 + punpckhqdq m28, m6, m18 ; c05 c13 c21 c29 + punpcklqdq m6, m18 ; c04 c12 c20 c28 + punpckhqdq m29, m21, m4 ; c07 c15 c23 c31 + punpcklqdq m21, m4 ; c06 c14 c22 c30 + pmulhrsw m19, m9, m10 + vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24 + vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08 + vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24 + vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08 + vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28 + vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12 + vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28 + vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12 + vshufi32x4 m3, m1, m6, q3131 ; 12 + vshufi32x4 m1, m6, q2020 ; 4 + vshufi32x4 m6, m4, m2, q3131 ; 24 + vshufi32x4 m4, m2, q2020 ; 16 + vshufi32x4 m2, m0, m7, q3131 ; 8 + vshufi32x4 m0, m7, q2020 ; 0 + vshufi32x4 m7, m5, m8, q3131 ; 28 + vshufi32x4 m5, m8, q2020 ; 20 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26 + vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10 + vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26 + vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10 + vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30 + vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14 + vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30 + vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14 + vshufi32x4 m16, m14, m20, q3131 ; 10 + vshufi32x4 m14, m20, q2020 ; 2 + vshufi32x4 m20, m18, m17, q3131 ; 26 + vshufi32x4 m18, m17, q2020 ; 18 + vshufi32x4 m17, m15, m21, q3131 ; 14 + vshufi32x4 m15, m21, q2020 ; 6 + vshufi32x4 m21, m19, m13, q3131 ; 30 + vshufi32x4 m19, m13, q2020 ; 22 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + mova m15, [cq+64* 1] + mova m16, [cq+64* 3] + mova m17, [cq+64* 5] + mova m19, [cq+64* 7] + mova m20, [cq+64* 9] + mova m21, [cq+64*11] + mova m13, [cq+64*13] + mova m18, [cq+64*15] + vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25 + vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09 + vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27 + vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11 + vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29 + vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13 + vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31 + vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15 + vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09 + vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25 + vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11 + vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27 + vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13 + vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29 + vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15 + vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31 + vshufi32x4 m18, m14, m26, q3131 ; 25 + vshufi32x4 m14, m26, q2020 ; 17 + vshufi32x4 m19, m15, m27, q3131 ; 27 + vshufi32x4 m15, m27, q2020 ; 19 + vshufi32x4 m20, m16, m28, q3131 ; 29 + vshufi32x4 m16, m28, q2020 ; 21 + vshufi32x4 m21, m17, m29, q3131 ; 31 + vshufi32x4 m17, m29, q2020 ; 23 + vshufi32x4 m26, m22, m8, q3131 ; 9 + vshufi32x4 m22, m8, q2020 ; 1 + vshufi32x4 m27, m23, m9, q3131 ; 11 + vshufi32x4 m23, m9, q2020 ; 3 + vshufi32x4 m28, m24, m11, q3131 ; 13 + vshufi32x4 m24, m11, q2020 ; 5 + vshufi32x4 m29, m25, m12, q3131 ; 15 + vshufi32x4 m25, m12, q2020 ; 7 + call .main_oddhalf + jmp .end +.fast: ; bottom/right halves are zero + mova m14, [o(dup16_perm)] + pmovzxwd m9, [cq+64* 0] + pmovzxwd m6, [cq+64* 8] + vpermb m8, m14, [cq+64* 2] + vpermb ym0, ym14, [cq+64*14] + vpermb ym5, ym14, [cq+64*10] + vpermb m1, m14, [cq+64* 6] + vpermb m7, m14, [cq+64* 4] + vpermb ym3, ym14, [cq+64*12] + pslld m9, 16 + pslld m6, 16 + call m(idct_16x16_internal_8bpc).main_fast + vpermb m21, m14, [cq+64* 1] + vpermb ym17, ym14, [cq+64*15] + vpermb ym20, ym14, [cq+64* 9] + vpermb m15, m14, [cq+64* 7] + vpermb m18, m14, [cq+64* 5] + vpermb ym16, ym14, [cq+64*11] + vpermb ym19, ym14, [cq+64*13] + vpermb m14, m14, [cq+64* 3] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m9, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round + vshufi32x4 m22, m14, m2, q2020 ; 1 + vshufi32x4 m24, m14, m2, q3131 ; 5 + vshufi32x4 m23, m17, m9, q2020 ; 3 + vshufi32x4 m25, m17, m9, q3131 ; 7 + vshufi32x4 m16, m5, m15, q2020 ; 10 + vshufi32x4 m17, m5, m15, q3131 ; 14 + vshufi32x4 m14, m1, m18, q2020 ; 2 + vshufi32x4 m15, m1, m18, q3131 ; 6 + vshufi32x4 m1, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m21, m4, q3131 ; 12 + vshufi32x4 m2, m21, m4, q2020 ; 8 + vshufi32x4 m26, m20, m6, q2020 ; 9 + vshufi32x4 m28, m20, m6, q3131 ; 13 + vshufi32x4 m27, m19, m7, q2020 ; 11 + vshufi32x4 m29, m19, m7, q3131 ; 15 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + call .main_oddhalf_fast +.end: + lea r4, [strideq*3] + vpbroadcastd m12, [o(pw_2048)] + movshdup m13, [o(permD)] + lea r3, [dstq+r4*8] + lea r5, [strideq+r4] ; stride*4 + add r3, r5 ; dst+stride*28 + IDCT_32x32_END 29, 0, strideq*0, r4 + IDCT_32x32_END 28, 1, strideq*1, strideq*2 + IDCT_32x32_END 27, 2, strideq*2, strideq*1 + IDCT_32x32_END 26, 3, r4 , strideq*0 + IDCT_32x32_END 25, 4, strideq*0, r4 + IDCT_32x32_END 24, 5, strideq*1, strideq*2 + IDCT_32x32_END 23, 6, strideq*2, strideq*1 + IDCT_32x32_END 22, 7, r4 , strideq*0 + IDCT_32x32_END 21, 8, strideq*0, r4 + IDCT_32x32_END 20, 9, strideq*1, strideq*2 + IDCT_32x32_END 19, 10, strideq*2, strideq*1 + IDCT_32x32_END 18, 11, r4 , strideq*0 + IDCT_32x32_END 17, 12, strideq*0, r4 + IDCT_32x32_END 16, 13, strideq*1, strideq*2 + IDCT_32x32_END 15, 14, strideq*2, strideq*1 + IDCT_32x32_END 14, 15, r4 , strideq*0 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 32 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2 +ALIGN function_align +.main_oddhalf_fast2: ; bottom three-quarters are zero + vpbroadcastd m21, [o(pw_4091x8)] + vpbroadcastd m8, [o(pw_201x8)] + vpbroadcastd m18, [o(pw_m1380x8)] + vpbroadcastd m9, [o(pw_3857x8)] + vpbroadcastd m19, [o(pw_3973x8)] + vpbroadcastd m11, [o(pw_995x8)] + vpbroadcastd m28, [o(pw_m601x8)] + vpbroadcastd m12, [o(pw_4052x8)] + pmulhrsw m21, m22 ; t31a + pmulhrsw m22, m8 ; t16a + pmulhrsw m18, m25 ; t19a + pmulhrsw m25, m9 ; t28a + pmulhrsw m19, m24 ; t27a + pmulhrsw m24, m11 ; t20a + pmulhrsw m28, m23 ; t23a + pmulhrsw m23, m12 ; t24a + mova m15, m21 + mova m8, m22 + mova m14, m18 + mova m27, m25 + mova m29, m19 + mova m26, m24 + mova m16, m28 + mova m20, m23 + jmp .main3 +ALIGN function_align +.main_oddhalf_fast: ; bottom half is zero + vpbroadcastd m21, [o(pw_4091x8)] + vpbroadcastd m8, [o(pw_201x8)] + vpbroadcastd m14, [o(pw_m2751x8)] + vpbroadcastd m9, [o(pw_3035x8)] + vpbroadcastd m17, [o(pw_3703x8)] + vpbroadcastd m11, [o(pw_1751x8)] + vpbroadcastd m18, [o(pw_m1380x8)] + vpbroadcastd m12, [o(pw_3857x8)] + pmulhrsw m21, m22 ; t31a + vpbroadcastd m19, [o(pw_3973x8)] + pmulhrsw m22, m8 ; t16a + vpbroadcastd m8, [o(pw_995x8)] + pmulhrsw m14, m29 ; t30a + vpbroadcastd m16, [o(pw_m2106x8)] + pmulhrsw m29, m9 ; t17a + vpbroadcastd m9, [o(pw_3513x8)] + pmulhrsw m17, m26 ; t29a + vpbroadcastd m15, [o(pw_3290x8)] + pmulhrsw m26, m11 ; t18a + vpbroadcastd m11, [o(pw_2440x8)] + pmulhrsw m18, m25 ; t19a + vpbroadcastd m20, [o(pw_m601x8)] + pmulhrsw m25, m12 ; t28a + vpbroadcastd m12, [o(pw_4052x8)] + pmulhrsw m19, m24 ; t27a + pmulhrsw m24, m8 ; t20a + pmulhrsw m16, m27 ; t21a + pmulhrsw m27, m9 ; t26a + pmulhrsw m15, m28 ; t25a + pmulhrsw m28, m11 ; t22a + pmulhrsw m20, m23 ; t23a + pmulhrsw m23, m12 ; t24a + jmp .main2 +ALIGN function_align +.main_oddhalf: + ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a + ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a + ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a + ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a + ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a + ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a + ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a + ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a +.main2: + psubsw m8, m22, m14 ; t17 + paddsw m22, m14 ; t16 + paddsw m14, m18, m26 ; t19 + psubsw m18, m26 ; t18 + psubsw m26, m24, m16 ; t21 + paddsw m24, m16 ; t20 + psubsw m16, m20, m28 ; t22 + paddsw m28, m20 ; t23 + psubsw m20, m23, m15 ; t25 + paddsw m23, m15 ; t24 + psubsw m15, m21, m29 ; t30 + paddsw m21, m29 ; t31 + psubsw m29, m19, m27 ; t26 + paddsw m19, m27 ; t27 + paddsw m27, m25, m17 ; t28 + psubsw m25, m17 ; t29 +.main3: + ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a + ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a + ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a + ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a + vpbroadcastd m12, [o(pw_m3784_1567)] + vpbroadcastd m11, [o(pw_1567_3784)] + psubsw m17, m21, m27 ; t28a + paddsw m21, m27 ; t31a + psubsw m27, m15, m25 ; t18 + paddsw m15, m25 ; t17 + psubsw m25, m20, m29 ; t21 + paddsw m20, m29 ; t22 + psubsw m29, m8, m18 ; t29 + paddsw m8, m18 ; t30 + psubsw m18, m22, m14 ; t19a + paddsw m22, m14 ; t16a + psubsw m14, m28, m24 ; t20a + paddsw m24, m28 ; t23a + paddsw m28, m16, m26 ; t25 + psubsw m16, m26 ; t26 + psubsw m26, m23, m19 ; t27a + paddsw m23, m19 ; t24a + ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a + ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28 + vpbroadcastd m11, [o(pw_m1567_m3784)] + ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a + ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27 + vpbroadcastd m12, [o(pw_m2896_2896)] + vpbroadcastd m11, [o(pw_2896_2896)] + psubsw m19, m27, m25 ; t26 + paddsw m27, m25 ; t29 + psubsw m25, m17, m26 ; t20a + paddsw m17, m26 ; t19a + paddsw m26, m18, m14 ; t28a + psubsw m18, m14 ; t27a + paddsw m14, m22, m24 ; t16 + psubsw m22, m24 ; t23 + psubsw m24, m29, m16 ; t21 + paddsw m16, m29 ; t18 + paddsw m29, m21, m23 ; t31 + psubsw m21, m23 ; t24 + psubsw m23, m15, m20 ; t22a + paddsw m15, m20 ; t17a + psubsw m20, m8, m28 ; t25a + paddsw m28, m8 ; t30a + ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27 + ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a + ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a + ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25 + ret + +%macro IDTX_32x32 2 ; dst[1-2] + vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which + vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to + vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements + vmovdqa32 ym18, [cq+64*(%2+16)] + vpermt2q m%1, m21, m17 + vpermt2q m%2, m21, m18 +%endmacro + +cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c + movu m21, [permB+7] + vpbroadcastd m16, [pw_8192] + pxor m20, m20 +.loop: + IDTX_32x32 0, 1 + IDTX_32x32 2, 3 + IDTX_32x32 4, 5 + IDTX_32x32 6, 7 + IDTX_32x32 8, 9 + IDTX_32x32 10, 11 + IDTX_32x32 12, 13 + IDTX_32x32 14, 15 + call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round + IDTX_32x16_STORE 0, 8, 1 + IDTX_32x16_STORE 1, 9, 1 + IDTX_32x16_STORE 2, 10, 1 + IDTX_32x16_STORE 3, 11, 1 + IDTX_32x16_STORE 4, 12, 1 + IDTX_32x16_STORE 5, 13, 1 + IDTX_32x16_STORE 6, 14, 1 + IDTX_32x16_STORE 7, 15, 1 + lea dstq, [dstq+strideq*8] + btc cq, 5 + jnc .loop + mov r0d, 8 +.zero_loop: + mova [cq+64*0], m20 + mova [cq+64*1], m20 + mova [cq+64*2], m20 + mova [cq+64*3], m20 + add cq, 64*4 + dec r0d + jg .zero_loop + RET + +cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jz .dconly + WIN64_SPILL_XMM 30 + cmp eobd, 151 + jb .fast + mova m5, [cq+64*10] + mova m3, [cq+64* 6] + mova m1, [cq+64* 2] + mova m7, [cq+64*14] + mova m2, [cq+64* 4] + mova m6, [cq+64*12] + mova m0, [cq+64* 0] + mova m4, [cq+64* 8] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + mova m14, [cq+64* 1] + mova m21, [cq+64*15] + mova m18, [cq+64* 9] + mova m17, [cq+64* 7] + mova m16, [cq+64* 5] + mova m19, [cq+64*11] + mova m20, [cq+64*13] + mova m15, [cq+64* 3] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + vpbroadcastd m9, [o(pw_8192)] +%macro TRANSPOSE_8x4_ROUND 4 + punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3 + punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m%4, m8 ; a6 b6 c6 d6 a7 b7 c7 d7 + REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4 +%endmacro + TRANSPOSE_8x4_ROUND 0, 1, 2, 3 + TRANSPOSE_8x4_ROUND 4, 5, 6, 7 + TRANSPOSE_8x4_ROUND 14, 15, 16, 17 + TRANSPOSE_8x4_ROUND 18, 19, 20, 21 + vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4 + vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12 + vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5 + vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13 + vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6 + vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14 + vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7 + vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15 + vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4 + vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12 + vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5 + vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13 + vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6 + vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14 + vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7 + vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15 + vshufi32x4 m22, m26, m4, q2020 ; 0 1 + vshufi32x4 m26, m4, q3131 ; 8 9 + vshufi32x4 m23, m27, m5, q2020 ; 2 3 + vshufi32x4 m27, m5, q3131 ; 10 11 + vshufi32x4 m24, m28, m6, q2020 ; 4 5 + vshufi32x4 m28, m6, q3131 ; 12 13 + vshufi32x4 m25, m29, m7, q2020 ; 6 7 + vshufi32x4 m29, m7, q3131 ; 14 15 + vshufi32x4 m4, m0, m14, q2020 ; 16 17 + vshufi32x4 m3, m0, m14, q3131 ; 24 25 + vshufi32x4 m20, m1, m15, q2020 ; 18 19 + vshufi32x4 m19, m1, m15, q3131 ; 26 27 + vshufi32x4 m5, m2, m16, q2020 ; 20 21 + vshufi32x4 m0, m2, m16, q3131 ; 28 29 + vshufi32x4 m16, m8, m17, q2020 ; 22 23 + vshufi32x4 m17, m8, m17, q3131 ; 30 31 + pxor m6, m6 + mova [cq+64* 0], m4 + mova [cq+64* 2], m5 + mova [cq+64* 4], m3 + mova [cq+64* 6], m0 + punpcklwd m8, m24, m24 ; 4 + punpcklwd m0, m0 ; 28 + punpcklwd m5, m5 ; 20 + punpcklwd m1, m28, m28 ; 12 + punpcklwd m7, m26, m26 ; 8 + punpcklwd m3, m3 ; 24 + punpcklwd m9, m6, m22 ; __ 0 + punpcklwd m6, m4 ; __ 16 + call m(idct_16x16_internal_8bpc).main_fast3 + mova [cq+64* 1], m20 + mova [cq+64* 3], m16 + mova [cq+64* 5], m19 + mova [cq+64* 7], m17 + punpcklwd m21, m23, m23 ; 2 + punpcklwd m17, m17 ; 30 + punpcklwd m20, m20 ; 18 + punpcklwd m15, m29, m29 ; 14 + punpcklwd m18, m27, m27 ; 10 + punpcklwd m16, m16 ; 22 + punpcklwd m19, m19 ; 26 + punpcklwd m14, m25, m25 ; 6 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova [cq+64* 8], m14 + mova [cq+64* 9], m15 + mova [cq+64*10], m16 + mova [cq+64*11], m17 + mova [cq+64*12], m18 + mova [cq+64*13], m19 + mova [cq+64*14], m20 + mova [cq+64*15], m21 + mova m21, [cq+64* 7] + mova m14, [cq+64* 0] + mova m17, [cq+64* 3] + mova m18, [cq+64* 4] + mova m19, [cq+64* 5] + mova m16, [cq+64* 2] + mova m15, [cq+64* 1] + mova m20, [cq+64* 6] + REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \ + m24, m19, m16, m27, m28, m15, m20, m23 + call .main_oddhalf + jmp .end +.fast: ; right half is zero + mova ym8, [cq+64*15] + vinserti32x8 m8, [cq+64* 1], 1 + mova m2, [o(int16_perm)] + mova ym9, [cq+64* 8] + vinserti32x8 m9, [cq+64* 0], 1 + mova ym0, [cq+64* 7] + vinserti32x8 m0, [cq+64* 9], 1 + mova ym7, [cq+64*14] + vinserti32x8 m7, [cq+64* 2], 1 + mova ym1, [cq+64* 3] + vinserti32x8 m1, [cq+64*13], 1 + mova ym3, [cq+64* 6] + vinserti32x8 m3, [cq+64*10], 1 + mova ym5, [cq+64*11] + vinserti32x8 m5, [cq+64* 5], 1 + mova ym6, [cq+64*12] + vinserti32x8 m6, [cq+64* 4], 1 + REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6 + call m(idct_16x16_internal_8bpc).main2 + vbroadcasti32x4 m8, [o(int_shuf3)] + vbroadcasti32x4 m9, [o(int_shuf4)] + vpbroadcastd m11, [o(pw_8192)] + pshufb m0, m8 + pshufb m1, m9 + pshufb m2, m8 + pshufb m3, m9 + REPX {pmulhrsw x, m11}, m0, m1, m2, m3 + pshufb m4, m8 + pshufb m5, m9 + pshufb m6, m8 + pshufb m7, m9 + REPX {pmulhrsw x, m11}, m4, m5, m6, m7 + punpckhdq m28, m0, m1 + punpckldq m0, m1 + punpckhdq m27, m2, m3 + punpckldq m2, m3 + punpckhdq m22, m4, m5 + punpckldq m4, m5 + punpckhdq m23, m6, m7 + punpckldq m6, m7 + vinserti32x8 m14, m0, ym2, 1 + vshufi32x4 m15, m0, m2, q3232 + vinserti32x8 m2, m4, ym6, 1 + vshufi32x4 m4, m6, q3232 + vshufi32x4 m21, m14, m2, q2020 ; 0 2 + vshufi32x4 m14, m2, q3131 ; 4 6 + vshufi32x4 m18, m15, m4, q2020 ; 8 10 + vshufi32x4 m15, m4, q3131 ; 12 14 + pxor m9, m9 + punpcklwd m8, m14, m14 ; 4 + punpcklwd m1, m15, m15 ; 12 + punpcklwd m7, m18, m18 ; 8 + punpcklwd m9, m21 ; __ 0 + call m(idct_16x16_internal_8bpc).main_fast4 + punpckhwd m21, m21 ; 2 + punpckhwd m15, m15 ; 14 + punpckhwd m18, m18 ; 10 + punpckhwd m14, m14 ; 6 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + vinserti32x8 m24, m28, ym27, 1 + vshufi32x4 m28, m27, q3232 + vinserti32x8 m27, m22, ym23, 1 + vshufi32x4 m22, m23, q3232 + vshufi32x4 m23, m24, m27, q2020 ; 1 3 + vshufi32x4 m24, m27, q3131 ; 5 7 + vshufi32x4 m27, m28, m22, q2020 ; 9 11 + vshufi32x4 m28, m22, q3131 ; 13 15 + punpcklwd m22, m23, m23 ; 1 + punpckhwd m29, m28, m28 ; 15 + punpcklwd m26, m27, m27 ; 9 + punpckhwd m25, m24, m24 ; 7 + mova [cq+64* 8], m14 + mova [cq+64* 9], m15 + mova [cq+64*10], m16 + mova [cq+64*11], m17 + punpcklwd m24, m24 ; 5 + punpckhwd m27, m27 ; 11 + punpcklwd m28, m28 ; 13 + punpckhwd m23, m23 ; 3 + mova [cq+64*12], m18 + mova [cq+64*13], m19 + mova [cq+64*14], m20 + mova [cq+64*15], m21 + call .main_oddhalf_fast +.end: + imul r6, strideq, 60 + mova m10, [o(end_16x32p)] + vpbroadcastd m11, [o(pw_2048)] + lea r3, [strideq*3] + pxor m12, m12 + add r6, dstq ; dst+stride*60 + psrldq m13, m10, 1 + lea r4, [strideq+r3] ; stride*4 +%macro IDCT_16x64_END 3 ; idct32, idct64, tmp +%if %1 & 1 + %define %%s0 r3 + %define %%s1 strideq*2 + %define %%s2 strideq*1 + %define %%s3 strideq*0 +%else + %define %%s0 strideq*0 + %define %%s1 strideq*1 + %define %%s2 strideq*2 + %define %%s3 r3 +%if %1 + add dstq, r4 + sub r6, r4 +%endif +%endif +%if %1 < 8 + pmulhrsw m8, m11, m%1 + pmulhrsw m9, m11, m%2 +%else + mova m9, [cq+64*%1] + paddsw m8, m9, m%2 ; out 0+n, 1+n + psubsw m9, m%2 ; out 63-n, 62-n + pmulhrsw m8, m11 + pmulhrsw m9, m11 +%endif + mova xm29, [dstq+%%s0] + vinserti128 ym29, [dstq+%%s1], 1 + mova xm%3, [r6 +%%s3] + vinserti128 ym%3, [r6 +%%s2], 1 + vpermb m29, m10, m29 + vpermb m%3, m10, m%3 + mova [cq+64*%1], m12 + paddw m29, m8 + paddw m%3, m9 + packuswb m29, m%3 + vpermd m29, m13, m29 + mova [dstq+%%s0], xm29 + vextracti128 [dstq+%%s1], ym29, 1 + vextracti32x4 [r6 +%%s2], m29, 2 + vextracti32x4 [r6 +%%s3], m29, 3 +%endmacro + IDCT_16x64_END 0, 29, 0 + IDCT_16x64_END 1, 28, 28 + IDCT_16x64_END 2, 27, 28 + IDCT_16x64_END 3, 26, 28 + IDCT_16x64_END 4, 25, 28 + IDCT_16x64_END 5, 24, 28 + IDCT_16x64_END 6, 23, 28 + IDCT_16x64_END 7, 22, 28 + IDCT_16x64_END 8, 21, 28 + IDCT_16x64_END 9, 20, 28 + IDCT_16x64_END 10, 19, 28 + IDCT_16x64_END 11, 18, 28 + IDCT_16x64_END 12, 17, 28 + IDCT_16x64_END 13, 16, 28 + IDCT_16x64_END 14, 15, 28 + IDCT_16x64_END 15, 14, 28 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 64 + add r6d, 128+512 + sar r6d, 8+2 + jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3 +ALIGN function_align +.main_oddhalf_fast: ; bottom three-quarters are zero + vpbroadcastd m8, [o(pw_101_4095x8)] + vpbroadcastd m21, [o(pw_m1474_3822x8)] + vpbroadcastd m14, [o(pw_897_3996x8)] + vpbroadcastd m17, [o(pw_m700_4036x8)] + vpbroadcastd m18, [o(pw_501_4065x8)] + vpbroadcastd m19, [o(pw_m1092_3948x8)] + vpbroadcastd m16, [o(pw_1285_3889x8)] + vpbroadcastd m15, [o(pw_m301_4085x8)] + pmulhrsw m8, m22 ; t32a t63a + pmulhrsw m21, m29 ; t35a t60a + pmulhrsw m14, m26 ; t36a t59a + pmulhrsw m17, m25 ; t39a t56 + pmulhrsw m18, m24 ; t40a t55a + pmulhrsw m19, m27 ; t43a t52a + pmulhrsw m16, m28 ; t44a t51a + pmulhrsw m15, m23 ; t47a t48a + mova m22, m8 + mova m29, m21 + mova m26, m14 + mova m25, m17 + mova m24, m18 + mova m27, m19 + mova m28, m16 + mova m20, m15 + jmp .main_oddhalf2 +ALIGN function_align +.main_oddhalf: + vpbroadcastd m8, [o(pw_101_4095x8)] + vpbroadcastd m9, [o(pw_m2824_2967x8)] + vpbroadcastd m11, [o(pw_1660_3745x8)] + vpbroadcastd m12, [o(pw_m1474_3822x8)] + pmulhrsw m22, m8 ; t32a t63a + vpbroadcastd m8, [o(pw_897_3996x8)] + pmulhrsw m21, m9 ; t33a t62a + vpbroadcastd m9, [o(pw_m2191_3461x8)] + pmulhrsw m14, m11 ; t34a t61a + vpbroadcastd m11, [o(pw_2359_3349x8)] + pmulhrsw m29, m12 ; t35a t60a + vpbroadcastd m12, [o(pw_m700_4036x8)] + pmulhrsw m26, m8 ; t36a t59a + vpbroadcastd m8, [o(pw_501_4065x8)] + pmulhrsw m17, m9 ; t37a t58a + vpbroadcastd m9, [o(pw_m2520_3229x8)] + pmulhrsw m18, m11 ; t38a t57a + vpbroadcastd m11, [o(pw_2019_3564x8)] + pmulhrsw m25, m12 ; t39a t56a + vpbroadcastd m12, [o(pw_m1092_3948x8)] + pmulhrsw m24, m8 ; t40a t55a + vpbroadcastd m8, [o(pw_1285_3889x8)] + pmulhrsw m19, m9 ; t41a t54a + vpbroadcastd m9, [o(pw_m1842_3659x8)] + pmulhrsw m16, m11 ; t42a t53a + vpbroadcastd m11, [o(pw_2675_3102x8)] + pmulhrsw m27, m12 ; t43a t52a + vpbroadcastd m12, [o(pw_m301_4085x8)] + pmulhrsw m28, m8 ; t44a t51a + pmulhrsw m15, m9 ; t45a t50a + pmulhrsw m20, m11 ; t46a t49a + pmulhrsw m23, m12 ; t47a t48a + psubsw m8, m22, m21 ; t33 t62 + paddsw m22, m21 ; t32 t63 + psubsw m21, m29, m14 ; t34 t61 + paddsw m29, m14 ; t35 t60 + psubsw m14, m26, m17 ; t37 t58 + paddsw m26, m17 ; t36 t59 + psubsw m17, m25, m18 ; t38 t57 + paddsw m25, m18 ; t39 t56 + psubsw m18, m24, m19 ; t41 t54 + paddsw m24, m19 ; t40 t55 + psubsw m19, m27, m16 ; t42 t53 + paddsw m27, m16 ; t43 t52 + psubsw m16, m28, m15 ; t45 t50 + paddsw m28, m15 ; t44 t51 + psubsw m15, m23, m20 ; t46 t49 + paddsw m20, m23 ; t47 t48 +.main_oddhalf2: + ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a + ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a t61a + ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a + ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a + ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a + ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a + ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a + ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a + vpbroadcastd m11, [o(pw_m4017_799)] + psubsw m23, m25, m26 ; t36a t59a + paddsw m25, m26 ; t39a t56a + psubsw m26, m24, m27 ; t43a t52a + paddsw m27, m24 ; t40a t55a + psubsw m24, m20, m28 ; t44a t51a + paddsw m20, m28 ; t47a t48a + psubsw m28, m8, m21 ; t34 t61 + paddsw m8, m21 ; t33 t62 + psubsw m21, m17, m14 ; t37 t58 + paddsw m17, m14 ; t38 t57 + psubsw m14, m18, m19 ; t42 t53 + paddsw m18, m19 ; t41 t54 + psubsw m19, m15, m16 ; t45 t50 + paddsw m15, m16 ; t46 t49 + psubsw m16, m22, m29 ; t35a t60a + paddsw m22, m29 ; t32a t63a + ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35 t60 + ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a + ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59 + ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a + vpbroadcastd m11, [o(pw_m2276_3406)] + ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52 + ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a + ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51 + ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a + vpbroadcastd m11, [o(pw_1567_3784)] + vpbroadcastd m12, [o(pw_m3784_1567)] + psubsw m29, m22, m25 ; t39 t56 + paddsw m22, m25 ; t32 t63 + psubsw m25, m20, m27 ; t40 t55 + paddsw m20, m27 ; t47 t48 + psubsw m27, m8, m17 ; t38a t57a + paddsw m8, m17 ; t33a t62a + psubsw m17, m15, m18 ; t41a t54a + paddsw m15, m18 ; t46a t49a + paddsw m18, m16, m23 ; t35a t60a + psubsw m16, m23 ; t36a t59a + psubsw m23, m24, m26 ; t43a t52a + paddsw m24, m26 ; t44a t51a + paddsw m26, m28, m21 ; t34 t61 + psubsw m28, m21 ; t37 t58 + psubsw m21, m19, m14 ; t42 t53 + paddsw m19, m14 ; t45 t50 + ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a + ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57 + ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59 + ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a + vpbroadcastd m11, [o(pw_m1567_m3784)] + ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a + ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54 + ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52 + ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a + vbroadcasti32x4 m13, [o(deint_shuf)] + vpbroadcastd m11, [o(pw_2896_2896)] + vpbroadcastd m12, [o(pw_m2896_2896)] + paddsw m14, m22, m20 ; t32a t63a + psubsw m22, m20 ; t47a t48a + psubsw m20, m8, m15 ; t46 t49 + paddsw m8, m15 ; t33 t62 + paddsw m15, m18, m24 ; t35 t60 + psubsw m18, m24 ; t44 t51 + psubsw m24, m26, m19 ; t45a t50a + paddsw m26, m19 ; t34a t61a + REPX {pshufb x, m13}, m14, m8, m15, m26 + psubsw m19, m29, m25 ; t40 t55 + paddsw m25, m29 ; t39 t56 + psubsw m29, m27, m17 ; t41a t54a + paddsw m27, m17 ; t38a t57a + psubsw m17, m16, m23 ; t43a t52a + paddsw m16, m23 ; t36a t59a + psubsw m9, m28, m21 ; t42 t53 + paddsw m28, m21 ; t37 t58 + REPX {pshufb x, m13}, m25, m27, m16, m28 + ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48 + ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a + packssdw m21, m22 ; t47 t46a + packssdw m13, m23 ; t48 t49a + ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a + ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50 + packssdw m20, m18 ; t44a t45 + packssdw m22, m23 ; t51a t50 + ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a + ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54 + packssdw m18, m19 ; t40a t41 + packssdw m24, m23 ; t55a t54 + ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52 + ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a + packssdw m19, m17 ; t43 t42a + packssdw m23, m29 ; t52 t53a + punpcklqdq m17, m25, m27 ; t39 t38a + punpckhqdq m25, m27 ; t56 t57a + punpckhqdq m27, m15, m26 ; t60 t61a + punpcklqdq m15, m26 ; t35 t34a + punpckhqdq m26, m16, m28 ; t59a t58 + punpcklqdq m16, m28 ; t36a t37 + punpckhqdq m28, m14, m8 ; t63a t62 + punpcklqdq m14, m8 ; t32a t33 + psubsw m29, m0, m28 ; out63 out62 + paddsw m0, m28 ; out0 out1 + psubsw m28, m1, m27 ; out60 out61 + paddsw m1, m27 ; out3 out2 + psubsw m27, m2, m26 ; out59 out58 + paddsw m2, m26 ; out4 out5 + psubsw m26, m3, m25 ; out56 out57 + paddsw m3, m25 ; out7 out6 + psubsw m25, m4, m24 ; out55 out54 + paddsw m4, m24 ; out8 out9 + psubsw m24, m5, m23 ; out52 out53 + paddsw m5, m23 ; out11 out10 + psubsw m23, m6, m22 ; out51 out50 + paddsw m6, m22 ; out12 out13 + psubsw m22, m7, m13 ; out48 out49 + paddsw m7, m13 ; out15 out14 + ret + +cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob +%undef cmp + lea r5, [o_base] + test eobd, eobd + jnz .normal + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 16 +.dconly: + imul r6d, 181 + add r6d, 128+512 + sar r6d, 8+2 +.dconly2: + imul r6d, 181 + add r6d, 128+2048 + sar r6d, 8+4 + pxor m2, m2 + vpbroadcastw m3, r6d +.dconly_loop: + mova m1, [dstq] + punpcklbw m0, m1, m2 + punpckhbw m1, m2 + paddw m0, m3 + paddw m1, m3 + packuswb m0, m1 + mova [dstq], m0 + add dstq, strideq + dec r3d + jg .dconly_loop + RET +.normal: + WIN64_SPILL_XMM 31 + mova m19, [o(dup16_perm)] + mova m24, [cq+64* 2] + mova m28, [cq+64* 6] + mova m26, [cq+64* 4] + mova m22, [cq+64* 0] + mova m23, [cq+64* 1] + mova m29, [cq+64* 7] + mova m27, [cq+64* 5] + mova m25, [cq+64* 3] + vpermb m8, m19, m24 ; 4 + vpermb m1, m19, m28 ; 12 + vpermb m7, m19, m26 ; 8 + vpermb m9, m19, m22 ; __ 0 + vpermb m21, m19, m23 ; 2 + vpermb m15, m19, m29 ; 14 + vpermb m18, m19, m27 ; 10 + vpermb m14, m19, m25 ; 6 + pslld m9, 16 + vpord m30, m19, [o(pb_32)] {1to16} + REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23 + cmp eobd, 151 + jb .fast + vpermb m0, m19, [cq+64*14] ; 28 + vpermb m5, m19, [cq+64*10] ; 20 + vpermb m3, m19, [cq+64*12] ; 24 + vpermb m6, m19, [cq+64* 8] ; __ 16 + pslld m6, 16 + call m(idct_16x16_internal_8bpc).main_fast + vpermb m17, m19, [cq+64*15] ; 30 + vpermb m20, m19, [cq+64* 9] ; 18 + vpermb m16, m19, [cq+64*11] ; 22 + vpermb m19, m19, [cq+64*13] ; 26 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + vpermb m21, m30, [cq+64*15] + vpermb m14, m30, [cq+64* 8] + vpermb m17, m30, [cq+64*11] + vpermb m18, m30, [cq+64*12] + vpermb m19, m30, [cq+64*13] + vpermb m16, m30, [cq+64*10] + vpermb m15, m30, [cq+64* 9] + vpermb m20, m30, [cq+64*14] + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf + jmp .end +.fast: ; bottom half is zero + call m(idct_16x16_internal_8bpc).main_fast2 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast +.end: + mova [cq+64* 8], m4 + mova [cq+64* 9], m5 + mova [cq+64*10], m6 + mova [cq+64*11], m7 + mova [cq+64*12], m26 + mova [cq+64*13], m27 + mova [cq+64*14], m28 + mova [cq+64*15], m29 + vpbroadcastd m13, [o(pw_8192)] + call .pass1_end + call .pass2 + mova [cq+64* 0], m0 + mova [cq+64* 1], m1 + mova [cq+64* 2], m2 + mova [cq+64* 3], m3 + mova [cq+64* 4], m4 + mova [cq+64* 5], m5 + mova [cq+64* 6], m6 + mova [cq+64* 7], m7 + pmulhrsw m0, m13, [cq+64* 8] + pmulhrsw m1, m13, [cq+64* 9] + pmulhrsw m2, m13, [cq+64*10] + pmulhrsw m3, m13, [cq+64*11] + vpbroadcastd m30, [o(pw_2048)] + pmulhrsw m4, m13, m22 + pmulhrsw m5, m13, m23 + pmulhrsw m6, m13, m24 + pmulhrsw m7, m13, m25 + pmulhrsw m22, m30, m14 + pmulhrsw m14, m13, m26 + pmulhrsw m23, m30, m15 + pmulhrsw m15, m13, m27 + pmulhrsw m24, m30, m16 + pmulhrsw m16, m13, m28 + pmulhrsw m25, m30, m17 + pmulhrsw m17, m13, m29 + pmulhrsw m26, m30, m18 + pmulhrsw m18, m13, [cq+64*12] + pmulhrsw m27, m30, m19 + pmulhrsw m19, m13, [cq+64*13] + pmulhrsw m28, m30, m20 + pmulhrsw m20, m13, [cq+64*14] + pmulhrsw m29, m30, m21 + pmulhrsw m21, m13, [cq+64*15] + call .transpose_round + call .pass2 + pxor m10, m10 + lea r3, [strideq*3] +%macro IDCT_64x16_END 4 + mova m9, [dstq+%4] +%if %1 < 8 + pmulhrsw m%3, m30, [cq+64*%1] +%endif + pmulhrsw m%2, m30 + mova [cq+64*%1], m10 + punpcklbw m8, m9, m10 + punpckhbw m9, m10 + paddw m8, m%3 + paddw m9, m%2 + packuswb m8, m9 + mova [dstq+%4], m8 +%if %1 == 3 || %1 == 7 || %1 == 11 + lea dstq, [dstq+strideq*4] +%endif +%endmacro + IDCT_64x16_END 0, 0, 11, strideq*0 + IDCT_64x16_END 1, 1, 11, strideq*1 + IDCT_64x16_END 2, 2, 11, strideq*2 + IDCT_64x16_END 3, 3, 11, r3 + IDCT_64x16_END 4, 4, 11, strideq*0 + IDCT_64x16_END 5, 5, 11, strideq*1 + IDCT_64x16_END 6, 6, 11, strideq*2 + IDCT_64x16_END 7, 7, 11, r3 + IDCT_64x16_END 8, 14, 22, strideq*0 + IDCT_64x16_END 9, 15, 23, strideq*1 + IDCT_64x16_END 10, 16, 24, strideq*2 + IDCT_64x16_END 11, 17, 25, r3 + IDCT_64x16_END 12, 18, 26, strideq*0 + IDCT_64x16_END 13, 19, 27, strideq*1 + IDCT_64x16_END 14, 20, 28, strideq*2 + IDCT_64x16_END 15, 21, 29, r3 + RET +ALIGN function_align +.pass1_end: + mova m4, [cq+64* 0] + mova m5, [cq+64* 1] + mova m6, [cq+64* 2] + mova m7, [cq+64* 3] + mova m8, [cq+64* 4] + mova m9, [cq+64* 5] + mova m11, [cq+64* 6] + mova m12, [cq+64* 7] + psubsw m29, m4, m21 ; out47 out46 + paddsw m4, m21 ; out16 out17 + psubsw m28, m5, m20 ; out44 out45 + paddsw m5, m20 ; out19 out18 + REPX {pmulhrsw x, m13}, m0, m1, m2, m3 + psubsw m27, m6, m19 ; out43 out42 + paddsw m6, m19 ; out20 out21 + psubsw m26, m7, m18 ; out40 out41 + paddsw m7, m18 ; out23 out22 + pmulhrsw m18, m13, m22 + pmulhrsw m19, m13, m23 + pmulhrsw m20, m13, m24 + pmulhrsw m21, m13, m25 + paddsw m25, m12, m14 ; out31 out30 + psubsw m14, m12, m14 ; out32 out33 + paddsw m24, m11, m15 ; out28 out29 + psubsw m15, m11, m15 ; out35 out34 + REPX {pmulhrsw x, m13}, m4, m5, m6, m7 + paddsw m23, m9, m16 ; out27 out26 + psubsw m16, m9, m16 ; out36 out37 + paddsw m22, m8, m17 ; out24 out25 + psubsw m17, m8, m17 ; out39 out38 + REPX {pmulhrsw x, m13}, m14, m15, m16, m17 +.transpose_round: +%macro TRANSPOSE_8x4_PACKED 4 + punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3 + punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3 + punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3 + punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3 + punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3 + punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1 + punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3 + punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1 + punpcklwd m%3, m%4, m%2 ; 2 + punpckhwd m%4, m%2 ; 3 + punpckhwd m%2, m%1, m8 ; 1 + punpcklwd m%1, m8 ; 0 +%endmacro + TRANSPOSE_8x4_PACKED 0, 1, 2, 3 + TRANSPOSE_8x4_PACKED 18, 19, 20, 21 + TRANSPOSE_8x4_PACKED 4, 5, 6, 7 + TRANSPOSE_8x4_PACKED 14, 15, 16, 17 + vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03 + vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01 + vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13 + vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11 + vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23 + vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21 + vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33 + vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31 + vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03 + vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01 + vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13 + vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11 + vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23 + vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21 + vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33 + vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31 + ret +.pass2: + vshufi32x4 m7, m5, m19, q3131 ; 14 + vshufi32x4 m5, m19, q2020 ; 10 + vshufi32x4 m21, m6, m20, q3131 ; 15 + vshufi32x4 m19, m6, m20, q2020 ; 11 + vshufi32x4 m20, m4, m18, q3131 ; 13 + vshufi32x4 m18, m4, m18, q2020 ; 9 + vshufi32x4 m6, m8, m2, q3131 ; 12 + vshufi32x4 m4, m8, m2, q2020 ; 8 + vshufi32x4 m2, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m1, m16, q3131 ; 6 + vshufi32x4 m1, m16, q2020 ; 2 + vshufi32x4 m16, m9, m15, q3131 ; 5 + vshufi32x4 m14, m9, m15, q2020 ; 1 + vshufi32x4 m15, m11, m17, q2020 ; 3 + vshufi32x4 m17, m11, m17, q3131 ; 7 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main2 + jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + +cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob + vpbroadcastd m23, [o(pw_2896x8)] +%undef cmp + cmp eobd, 136 + jb .fast + pmulhrsw m5, m23, [cq+64*20] + pmulhrsw m3, m23, [cq+64*12] + pmulhrsw m1, m23, [cq+64* 4] + pmulhrsw m7, m23, [cq+64*28] + pmulhrsw m2, m23, [cq+64* 8] + pmulhrsw m6, m23, [cq+64*24] + pmulhrsw m0, m23, [cq+64* 0] + pmulhrsw m4, m23, [cq+64*16] + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + pmulhrsw m14, m23, [cq+64* 2] + pmulhrsw m21, m23, [cq+64*30] + pmulhrsw m18, m23, [cq+64*18] + pmulhrsw m17, m23, [cq+64*14] + pmulhrsw m16, m23, [cq+64*10] + pmulhrsw m19, m23, [cq+64*22] + pmulhrsw m20, m23, [cq+64*26] + pmulhrsw m15, m23, [cq+64* 6] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + pmulhrsw m22, m23, [cq+64* 1] + pmulhrsw m21, m23, [cq+64*31] + pmulhrsw m14, m23, [cq+64*17] + pmulhrsw m29, m23, [cq+64*15] + pmulhrsw m26, m23, [cq+64* 9] + pmulhrsw m17, m23, [cq+64*23] + pmulhrsw m18, m23, [cq+64*25] + pmulhrsw m25, m23, [cq+64* 7] + pmulhrsw m24, m23, [cq+64* 5] + pmulhrsw m19, m23, [cq+64*27] + pmulhrsw m16, m23, [cq+64*21] + pmulhrsw m27, m23, [cq+64*11] + pmulhrsw m28, m23, [cq+64*13] + pmulhrsw m15, m23, [cq+64*19] + pmulhrsw m20, m23, [cq+64*29] + pmulhrsw m23, [cq+64* 3] + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf + vpbroadcastd m12, [o(pw_16384)] + psubsw m13, m0, m29 ; 31 + paddsw m0, m29 ; 0 + psubsw m29, m1, m28 ; 30 + paddsw m1, m28 ; 1 + psubsw m28, m2, m27 ; 29 + paddsw m2, m27 ; 2 + psubsw m27, m3, m26 ; 28 + paddsw m3, m26 ; 3 + psubsw m26, m4, m25 ; 27 + paddsw m4, m25 ; 4 + psubsw m25, m5, m24 ; 26 + paddsw m5, m24 ; 5 + psubsw m24, m6, m23 ; 25 + paddsw m6, m23 ; 6 + psubsw m23, m7, m22 ; 24 + paddsw m7, m22 ; 7 + pxor m9, m9 + punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3 + REPX {mova [cq+64*x], m9}, 16, 17, 18, 19 + punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7 + punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3 + punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7 + punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3 + REPX {mova [cq+64*x], m9}, 20, 21, 22, 23 + punpckhwd m3, m23, m24 + punpcklwd m23, m24 + punpckhwd m24, m25, m26 + punpcklwd m25, m26 + REPX {mova [cq+64*x], m9}, 24, 25, 26, 27 + punpckhwd m26, m27, m28 + punpcklwd m27, m28 + punpckhwd m28, m29, m13 + punpcklwd m29, m13 + REPX {mova [cq+64*x], m9}, 28, 29, 30, 31 + punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1 + REPX {pmulhrsw x, m12}, m7, m0, m2, m4 + punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7 + punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7 + punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5 + REPX {pmulhrsw x, m12}, m6, m8, m1, m22 + punpckhdq m13, m23, m25 + punpckldq m23, m25 + punpckhdq m25, m27, m29 + punpckldq m27, m29 + REPX {pmulhrsw x, m12}, m13, m23, m25, m27 + punpckhdq m9, m3, m24 + punpckldq m3, m24 + punpckhdq m24, m26, m28 + punpckldq m26, m28 + REPX {pmulhrsw x, m12}, m9, m3, m24, m26 + punpckhqdq m5, m23, m27 ; d01 d09 d17 d25 + punpcklqdq m23, m27 ; d00 d08 d16 d24 + punpcklqdq m27, m13, m25 ; d02 d10 d18 d26 + punpckhqdq m13, m25 ; d03 d11 d19 d27 + punpcklqdq m25, m3, m26 ; d04 d12 d20 d28 + punpckhqdq m3, m26 ; d05 d13 d21 d29 + punpcklqdq m26, m9, m24 ; d06 d14 d22 d30 + punpckhqdq m9, m24 ; d07 d15 d23 d31 + mova [cq+64* 3], m23 + mova [cq+64*13], m27 + mova [cq+64* 7], m25 + mova [cq+64*15], m26 + punpckhqdq m24, m8, m22 ; a05 a13 a21 a29 + punpcklqdq m8, m22 ; a04 a12 a20 a28 + punpckhqdq m22, m0, m4 ; a01 a09 a17 a25 + punpcklqdq m0, m4 ; a00 a08 a16 a24 + punpckhqdq m23, m7, m2 ; a03 a11 a19 a27 + punpcklqdq m7, m2 ; a02 a10 a18 a26 + punpckhqdq m25, m6, m1 ; a07 a15 a23 a31 + punpcklqdq m6, m1 ; a06 a14 a22 a30 + mova [cq+64* 1], m0 + mova [cq+64* 9], m7 + mova [cq+64* 5], m8 + mova [cq+64*11], m6 + mova m2, [cq+64* 0] + mova m11, [cq+64* 2] + mova m8, [cq+64* 4] + mova m29, [cq+64* 6] + mova m27, [cq+64* 8] + mova m26, [cq+64*10] + mova m4, [cq+64*12] + mova m28, [cq+64*14] + psubsw m1, m2, m21 ; 23 + paddsw m2, m21 ; 8 + psubsw m21, m11, m20 ; 22 + paddsw m11, m20 ; 9 + psubsw m20, m8, m19 ; 21 + paddsw m8, m19 ; 10 + psubsw m19, m29, m18 ; 20 + paddsw m29, m18 ; 11 + psubsw m18, m27, m17 ; 19 + paddsw m27, m17 ; 12 + psubsw m17, m26, m16 ; 18 + paddsw m26, m16 ; 13 + psubsw m16, m4, m15 ; 17 + paddsw m4, m15 ; 14 + psubsw m15, m28, m14 ; 16 + paddsw m28, m14 ; 15 + punpcklwd m14, m15, m16 + punpckhwd m15, m16 + punpckhwd m16, m17, m18 + punpcklwd m17, m18 + punpckhwd m18, m19, m20 + punpcklwd m19, m20 + punpckhwd m20, m21, m1 + punpcklwd m21, m1 + punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7 + punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3 + punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7 + punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3 + punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7 + punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3 + punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7 + punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3 + punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3 + punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1 + punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3 + punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1 + REPX {pmulhrsw x, m12}, m28, m2, m8, m27 + punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7 + punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5 + punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7 + punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5 + REPX {pmulhrsw x, m12}, m4, m1, m11, m29 + punpckhdq m26, m19, m21 + punpckldq m19, m21 + punpckhdq m21, m15, m16 + punpckldq m15, m16 + REPX {pmulhrsw x, m12}, m26, m19, m21, m15 + punpckhdq m16, m18, m20 + punpckldq m18, m20 + punpckhdq m20, m14, m17 + punpckldq m14, m17 + REPX {pmulhrsw x, m12}, m16, m18, m20, m14 + punpckhqdq m17, m28, m8 ; b03 b11 b19 b27 + punpcklqdq m28, m8 ; b02 b10 b18 b26 + punpckhqdq m8, m2, m27 ; b01 b09 b17 b25 + punpcklqdq m2, m27 ; b00 b08 b16 b24 + punpcklqdq m27, m1, m29 ; b04 b12 b20 b28 + punpckhqdq m1, m29 ; b05 b13 b21 b29 + punpcklqdq m29, m4, m11 ; b06 b14 b22 b30 + punpckhqdq m4, m11 ; b07 b15 b23 b31 + mova [cq+64* 0], m2 + mova [cq+64* 8], m28 + mova [cq+64* 4], m27 + mova [cq+64*10], m29 + punpckhqdq m27, m20, m26 ; c03 c11 c19 c27 + punpcklqdq m20, m26 ; c02 c10 c18 c26 + punpckhqdq m26, m14, m19 ; c01 c09 c17 c25 + punpcklqdq m14, m19 ; c00 c08 c16 c24 + punpckhqdq m28, m15, m18 ; c05 c13 c21 c29 + punpcklqdq m15, m18 ; c04 c12 c20 c28 + punpckhqdq m29, m21, m16 ; c07 c15 c23 c31 + punpcklqdq m21, m16 ; c06 c14 c22 c30 + mova [cq+64* 2], m14 + mova [cq+64*12], m20 + mova [cq+64* 6], m15 + mova [cq+64*14], m21 + vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25 + vinserti32x8 m22, ym8, 1 ; a01 a09 b01 b09 + vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27 + vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11 + vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29 + vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13 + vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31 + vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15 + vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09 + vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25 + vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11 + vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27 + vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13 + vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29 + vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15 + vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31 + mov r4, rsp + vshufi32x4 m0, m22, m19, q2020 ; 1 + vshufi32x4 m1, m17, m29, q3131 ; 31 + vshufi32x4 m2, m14, m26, q2020 ; 17 + vshufi32x4 m3, m25, m18, q3131 ; 15 + call .main_part1 + vshufi32x4 m0, m25, m18, q2020 ; 7 + vshufi32x4 m1, m14, m26, q3131 ; 25 + vshufi32x4 m2, m17, m29, q2020 ; 23 + vshufi32x4 m3, m22, m19, q3131 ; 9 + call .main_part1 + vshufi32x4 m0, m24, m21, q2020 ; 5 + vshufi32x4 m1, m15, m27, q3131 ; 27 + vshufi32x4 m2, m16, m28, q2020 ; 21 + vshufi32x4 m3, m23, m20, q3131 ; 11 + call .main_part1 + vshufi32x4 m0, m23, m20, q2020 ; 3 + vshufi32x4 m1, m16, m28, q3131 ; 29 + vshufi32x4 m2, m15, m27, q2020 ; 19 + vshufi32x4 m3, m24, m21, q3131 ; 13 + call .main_part1 + call .main_part2 + mova m0, [cq+64* 1] ; a0 + mova m15, [cq+64* 0] ; b0 + mova m3, [cq+64* 2] ; c0 + mova m16, [cq+64* 3] ; d0 + mova m14, [cq+64* 5] ; a4 + mova m8, [cq+64* 4] ; b4 + mova m17, [cq+64* 6] ; c4 + mova m1, [cq+64* 7] ; d4 + vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24 + vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08 + vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24 + vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08 + vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28 + vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12 + vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28 + vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12 + vshufi32x4 m1, m0, m3, q3131 ; 8 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m2, m15, q3131 ; 24 + vshufi32x4 m2, m15, q2020 ; 16 + vshufi32x4 m15, m14, m17, q3131 ; 12 + vshufi32x4 m14, m17, q2020 ; 4 + vshufi32x4 m17, m16, m8, q3131 ; 28 + vshufi32x4 m16, m8, q2020 ; 20 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + mova m8, [cq+64* 8] + mova m9, [cq+64*12] + mova m11, [cq+64*10] + mova m12, [cq+64*14] + mova [cq+64* 0], m14 + mova [cq+64* 2], m15 + mova [cq+64* 4], m16 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64*12], m20 + mova [cq+64*14], m21 + mova m22, [cq+64* 9] + mova m27, [cq+64*13] + mova m23, [cq+64*11] + mova m24, [cq+64*15] + vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26 + vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10 + vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26 + vinserti32x8 m9, ym27, 1 ; c02 c10 d02 d10 + vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30 + vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14 + vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30 + vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14 + vshufi32x4 m28, m26, m8, q3131 ; 26 + vshufi32x4 m26, m8, q2020 ; 18 + vshufi32x4 m24, m22, m9, q3131 ; 10 + vshufi32x4 m22, m9, q2020 ; 2 + vshufi32x4 m29, m27, m11, q3131 ; 30 + vshufi32x4 m27, m11, q2020 ; 22 + vshufi32x4 m25, m23, m12, q3131 ; 14 + vshufi32x4 m23, m12, q2020 ; 6 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + jmp .end +.fast: ; bottom/right halves are zero + pmulhrsw ym9, ym23, [cq+64* 0] + pmulhrsw ym6, ym23, [cq+64* 8] + mova m14, [o(dup16_perm)] + pmulhrsw ym8, ym23, [cq+64* 2] + pmulhrsw xm0, xm23, [cq+64*14] + pmulhrsw xm5, xm23, [cq+64*10] + pmulhrsw ym1, ym23, [cq+64* 6] + pmulhrsw ym7, ym23, [cq+64* 4] + pmulhrsw xm3, xm23, [cq+64*12] + pmovzxwd m9, ym9 + pmovzxwd m6, ym6 + vpermb m8, m14, m8 + punpcklwd xm0, xm0 + vpermb ym5, ym14, ym5 + vpermb m1, m14, m1 + vpermb m7, m14, m7 + punpcklwd xm3, xm3 + pslld m9, 16 + pslld m6, 16 + call m(idct_16x16_internal_8bpc).main_fast + vpmulhrsw ym21, ym23, [cq+64* 1] + {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which + {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to + {evex}vpmulhrsw ym15, ym23, [cq+64* 7] ; compressed displacements + {evex}vpmulhrsw ym18, ym23, [cq+64* 5] + {evex}vpmulhrsw xm16, xm23, [cq+64*11] + {evex}vpmulhrsw xm19, xm23, [cq+64*13] + {evex}vpmulhrsw ym23, [cq+64* 3] + vpermb m21, m14, m21 + punpcklwd xm17, xm17 + vpermb ym20, ym14, ym20 + vpermb m15, m14, m15 + vpermb m18, m14, m18 + vpermb ym16, ym14, ym16 + punpcklwd xm19, xm19 + vpermb m14, m14, m23 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast + vpbroadcastd m9, [o(pw_16384)] + call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round + vshufi32x4 m16, m0, m3, q2020 ; 0 + vshufi32x4 m26, m0, m3, q3131 ; 4 + vshufi32x4 m0, m14, m2, q2020 ; 1 + vshufi32x4 m14, m2, q3131 ; 5 + vshufi32x4 m3, m19, m7, q3131 ; 15 + vshufi32x4 m19, m7, q2020 ; 11 + vshufi32x4 m27, m17, m9, q2020 ; 3 + vshufi32x4 m17, m9, q3131 ; 7 + vshufi32x4 m28, m20, m6, q2020 ; 9 + vshufi32x4 m20, m6, q3131 ; 13 + vshufi32x4 m22, m1, m18, q2020 ; 2 + vshufi32x4 m23, m1, m18, q3131 ; 6 + vshufi32x4 m24, m5, m15, q2020 ; 10 + vshufi32x4 m25, m5, m15, q3131 ; 14 + vshufi32x4 m15, m21, m4, q3131 ; 12 + vshufi32x4 m21, m21, m4, q2020 ; 8 + mov r4, rsp + call .main_part1_fast + mova m0, m17 + mova m3, m28 + call .main_part1_fast + mova m0, m14 + mova m3, m19 + call .main_part1_fast + mova m0, m27 + mova m3, m20 + call .main_part1_fast + call .main_part2 + mova m0, m16 + mova m1, m21 + mova m14, m26 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 + mova [cq+64*14], m21 + mova [cq+64* 0], m14 + mova [cq+64* 6], m17 + mova [cq+64* 8], m18 + mova [cq+64*10], m19 + mova [cq+64* 4], m16 + mova [cq+64* 2], m15 + mova [cq+64*12], m20 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 +.end: + lea r4, [strideq*3] + vpbroadcastd m12, [o(pw_2048)] + movshdup m13, [o(permD)] + lea r5, [r4+strideq] ; stride*4 + lea r3, [dstq+r4*8] + lea r6, [strideq+r5*8] ; stride*33 + lea r8, [r4+r5*8] ; stride*35 + add r3, r5 ; dst+stride*28 + lea r7, [r6+strideq] ; stride*34 +%macro IDCT_32x64_END 6 ; src, mem, stride[1-4] +%if %2 < 8 + paddsw m10, m%2, m%1 + psubsw m11, m%2, m%1 +%else + mova m11, [cq+64*(%2*2-16)] + paddsw m10, m11, m%1 + psubsw m11, m%1 +%endif + mova m9, [rsp+64*(31-%2)] + mova m%1, [rsp+64*%2] + paddsw m8, m10, m9 + psubsw m10, m9 + paddsw m9, m11, m%1 + pmovzxbw m0, [dstq+%3] + psubsw m11, m%1 + pmovzxbw m%1, [r3 +%4] + REPX {pmulhrsw x, m12}, m8, m10, m9, m11 + paddw m8, m0 + pmovzxbw m0, [r3 +%5] + paddw m10, m%1 + pmovzxbw m%1, [dstq+%6] + paddw m9, m0 + paddw m11, m%1 +%if %2 >= 8 +%if %2 == 8 + pxor m1, m1 +%endif + mova [cq+64*(%2*2-16)], m1 + mova [cq+64*(%2*2-15)], m1 +%endif + packuswb m8, m10 + packuswb m9, m11 + vpermq m8, m13, m8 + vpermq m9, m13, m9 + mova [dstq+%3], ym8 + vextracti32x8 [r3 +%4], m8, 1 + mova [r3 +%5], ym9 + vextracti32x8 [dstq+%6], m9, 1 +%if %2 == 3 || %2 == 7 || %2 == 11 + add dstq, r5 + sub r3, r5 +%endif +%endmacro + IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8 + IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6 + IDCT_32x64_END 27, 2, strideq*2, r6, strideq*1, r7 + IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8 + IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8 + IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6 + IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7 + IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8 + IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8 + IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6 + IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7 + IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8 + IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8 + IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6 + IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7 + IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8 + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 64 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3 +ALIGN function_align ; bottom three-quarters are zero +.main_part1_fast: + vpbroadcastd m1, [o(idct64_mul+4*0)] + vpbroadcastd m8, [o(idct64_mul+4*1)] + vpbroadcastd m2, [o(idct64_mul+4*6)] + vpbroadcastd m9, [o(idct64_mul+4*7)] + pmulhrsw m1, m0 ; t63a + pmulhrsw m0, m8 ; t32a + pmulhrsw m2, m3 ; t60a + pmulhrsw m3, m9 ; t35a + mova m8, m0 + mova m7, m1 + mova m6, m3 + mova m5, m2 + jmp .main_part1b +.main_part1: + ; idct64 steps 1-5: + ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + vpbroadcastd m7, [o(idct64_mul+4*0)] + vpbroadcastd m8, [o(idct64_mul+4*1)] + vpbroadcastd m6, [o(idct64_mul+4*2)] + vpbroadcastd m9, [o(idct64_mul+4*3)] + pmulhrsw m7, m0 ; t63a + vpbroadcastd m5, [o(idct64_mul+4*4)] + pmulhrsw m0, m8 ; t32a + vpbroadcastd m8, [o(idct64_mul+4*5)] + pmulhrsw m6, m1 ; t62a + vpbroadcastd m4, [o(idct64_mul+4*6)] + pmulhrsw m1, m9 ; t33a + vpbroadcastd m9, [o(idct64_mul+4*7)] + pmulhrsw m5, m2 ; t61a + pmulhrsw m2, m8 ; t34a + pmulhrsw m4, m3 ; t60a + pmulhrsw m3, m9 ; t35a + psubsw m8, m0, m1 ; t33 + paddsw m0, m1 ; t32 + psubsw m1, m7, m6 ; t62 + paddsw m7, m6 ; t63 + psubsw m6, m3, m2 ; t34 + paddsw m3, m2 ; t35 + psubsw m2, m4, m5 ; t61 + paddsw m5, m4 ; t60 +.main_part1b: + vpbroadcastd m11, [o(idct64_mul+4*8)] + vpbroadcastd m12, [o(idct64_mul+4*9)] + ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a + vpbroadcastd m11, [o(idct64_mul+4*10)] + ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a + vpbroadcastd m11, [o(idct64_mul+4*11)] + vpbroadcastd m12, [o(idct64_mul+4*12)] + psubsw m4, m0, m3 ; t35a + paddsw m0, m3 ; t32a + psubsw m3, m7, m5 ; t60a + paddsw m7, m5 ; t63a + psubsw m5, m1, m2 ; t34 + paddsw m1, m2 ; t33 + psubsw m2, m8, m6 ; t61 + paddsw m6, m8 ; t62 + add r5, 4*13 + ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60 + ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a + mova [r4+64*0], m0 + mova [r4+64*7], m7 + mova [r4+64*1], m1 + mova [r4+64*6], m6 + mova [r4+64*3], m3 + mova [r4+64*4], m4 + mova [r4+64*2], m2 + mova [r4+64*5], m5 + add r4, 64*8 + ret +.main_part2: + vpbroadcastd m11, [o(pw_1567_3784 -16*13)] + vpbroadcastd m12, [o(pw_m3784_1567 -16*13)] + lea r6, [r4+64*7] + vpbroadcastd m17, [o(pw_m1567_m3784-16*13)] + vpbroadcastd m18, [o(pw_2896_2896 -16*13)] + vpbroadcastd m19, [o(pw_m2896_2896 -16*13)] + sub r5, 16*13 +.main_part2_loop: + mova m0, [r4-64*32] ; t32a + mova m1, [r6-64*24] ; t39a + mova m2, [r6-64*32] ; t63a + mova m3, [r4-64*24] ; t56a + mova m4, [r4-64*16] ; t40a + mova m5, [r6-64* 8] ; t47a + mova m6, [r6-64*16] ; t55a + mova m7, [r4-64* 8] ; t48a + psubsw m8, m0, m1 ; t39 + paddsw m0, m1 ; t32 + psubsw m1, m2, m3 ; t56 + paddsw m2, m3 ; t63 + psubsw m3, m5, m4 ; t40 + paddsw m5, m4 ; t47 + psubsw m4, m7, m6 ; t55 + paddsw m7, m6 ; t48 + ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a + ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a + psubsw m6, m2, m7 ; t48a + paddsw m2, m7 ; t63a + psubsw m7, m0, m5 ; t47a + paddsw m0, m5 ; t32a + psubsw m5, m8, m3 ; t55 + paddsw m8, m3 ; t56 + psubsw m3, m1, m4 ; t40 + paddsw m1, m4 ; t39 + ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47, t48 + ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a + mova [r6-64* 8], m2 + mova [r4-64*32], m0 + mova [r4-64* 8], m8 + mova [r6-64*32], m1 + mova [r6-64*24], m6 + mova [r4-64*16], m7 + mova [r4-64*24], m5 + mova [r6-64*16], m3 + add r4, 64 + sub r6, 64 + cmp r4, r6 + jb .main_part2_loop + ret + +cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob + vpbroadcastd m23, [o(pw_2896x8)] +%undef cmp + cmp eobd, 136 + jb .fast + pmulhrsw m0, m23, [cq+64* 1] + pmulhrsw m1, m23, [cq+64*31] + pmulhrsw m2, m23, [cq+64*17] + pmulhrsw m3, m23, [cq+64*15] + vpbroadcastd m10, [o(pd_2048)] + mov r4, rsp + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + pmulhrsw m0, m23, [cq+64* 7] + pmulhrsw m1, m23, [cq+64*25] + pmulhrsw m2, m23, [cq+64*23] + pmulhrsw m3, m23, [cq+64* 9] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + pmulhrsw m0, m23, [cq+64* 5] + pmulhrsw m1, m23, [cq+64*27] + pmulhrsw m2, m23, [cq+64*21] + pmulhrsw m3, m23, [cq+64*11] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + pmulhrsw m0, m23, [cq+64* 3] + pmulhrsw m1, m23, [cq+64*29] + pmulhrsw m2, m23, [cq+64*19] + pmulhrsw m3, m23, [cq+64*13] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + pmulhrsw m3, m23, [cq+64*24] + pmulhrsw m1, m23, [cq+64* 8] + pmulhrsw m2, m23, [cq+64*16] + pmulhrsw m0, m23, [cq+64* 0] + pmulhrsw m14, m23, [cq+64* 4] + pmulhrsw m17, m23, [cq+64*28] + pmulhrsw m16, m23, [cq+64*20] + pmulhrsw m15, m23, [cq+64*12] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + pmulhrsw m22, m23, [cq+64* 2] + pmulhrsw m29, m23, [cq+64*30] + pmulhrsw m26, m23, [cq+64*18] + pmulhrsw m25, m23, [cq+64*14] + pmulhrsw m24, m23, [cq+64*10] + pmulhrsw m27, m23, [cq+64*22] + pmulhrsw m28, m23, [cq+64*26] + pmulhrsw m23, [cq+64* 6] + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + vpbroadcastd m13, [o(pw_16384)] + call .pass1_end_part1 + mova [cq+64*16], m1 + mova [cq+64*17], m3 + mova [cq+64*18], m5 + mova [cq+64*19], m7 + mova [cq+64*24], m23 + mova [cq+64*25], m25 + mova [cq+64*26], m27 + mova [cq+64*27], m29 + pmulhrsw m23, m13, m0 ; a0 + pmulhrsw m25, m13, m2 ; a2 + pmulhrsw m27, m13, m4 ; a4 + pmulhrsw m29, m13, m6 ; a6 + REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6 + call .pass1_end_part2 + mova [cq+64*20], m15 + mova [cq+64*21], m17 + mova [cq+64*22], m19 + mova [cq+64*23], m21 + mova [cq+64*28], m1 + mova [cq+64*29], m3 + mova [cq+64*30], m5 + mova [cq+64*31], m7 + REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6 + REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6 + vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01 + vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03 + vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01 + vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03 + vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41 + vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43 + vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 + vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43 + vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21 + vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23 + vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21 + vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23 + vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61 + vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 + vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61 + vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63 + vshufi32x4 m2, m3, m15, q3131 ; 8 + vshufi32x4 m0, m3, m15, q2020 ; 0 + vshufi32x4 m6, m23, m22, q3131 ; 24 + vshufi32x4 m4, m23, m22, q2020 ; 16 + vshufi32x4 m3, m1, m18, q3131 ; 12 + vshufi32x4 m1, m18, q2020 ; 4 + vshufi32x4 m7, m27, m26, q3131 ; 28 + vshufi32x4 m5, m27, m26, q2020 ; 20 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + vshufi32x4 m16, m14, m17, q3131 ; 10 + vshufi32x4 m14, m17, q2020 ; 2 + vshufi32x4 m17, m19, m20, q3131 ; 14 + vshufi32x4 m15, m19, m20, q2020 ; 6 + vshufi32x4 m20, m25, m24, q3131 ; 26 + vshufi32x4 m18, m25, m24, q2020 ; 18 + vshufi32x4 m21, m29, m28, q3131 ; 30 + vshufi32x4 m19, m29, m28, q2020 ; 22 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + pmulhrsw m22, m13, [cq+64*16] ; a1 + pmulhrsw m23, m13, [cq+64*20] ; c1 + pmulhrsw m24, m13, [cq+64*24] ; e1 + pmulhrsw m25, m13, [cq+64*28] ; g1 + pmulhrsw m26, m13, [cq+64*17] ; a3 + pmulhrsw m27, m13, [cq+64*21] ; c3 + pmulhrsw m28, m13, [cq+64*25] ; e3 + pmulhrsw m29, m13, [cq+64*29] ; g3 + mova [cq+64* 8], m14 + mova [cq+64* 9], m15 + mova [cq+64*10], m16 + mova [cq+64*11], m17 + mova [cq+64*12], m18 + mova [cq+64*13], m19 + mova [cq+64*14], m20 + mova [cq+64*15], m21 + pmulhrsw m14, m13, [cq+64*18] ; a5 + pmulhrsw m15, m13, [cq+64*22] ; c5 + pmulhrsw m16, m13, [cq+64*26] ; e5 + pmulhrsw m17, m13, [cq+64*30] ; g5 + pmulhrsw m18, m13, [cq+64*19] ; a7 + pmulhrsw m19, m13, [cq+64*23] ; c7 + pmulhrsw m20, m13, [cq+64*27] ; e7 + pmulhrsw m21, m13, [cq+64*31] ; g7 + vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11 + vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13 + vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11 + vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13 + vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31 + vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33 + vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31 + vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33 + mova [cq+64* 0], m0 + mova [cq+64* 1], m1 + mova [cq+64* 2], m2 + mova [cq+64* 3], m3 + mova [cq+64* 4], m4 + mova [cq+64* 5], m5 + mova [cq+64* 6], m6 + mova [cq+64* 7], m7 + vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51 + vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53 + vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51 + vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53 + vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71 + vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73 + vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71 + vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73 + vshufi32x4 m27, m23, m11, q3131 ; 11 m27 + vshufi32x4 m23, m11, q2020 ; 3 m23 + vshufi32x4 m19, m26, m28, q3131 ; 27 m19 + vshufi32x4 m15, m26, m28, q2020 ; 19 m15 + vshufi32x4 m29, m25, m17, q3131 ; 15 m29 + vshufi32x4 m25, m17, q2020 ; 7 m25 + vshufi32x4 m21, m18, m20, q3131 ; 31 m21 + vshufi32x4 m17, m18, m20, q2020 ; 23 m17 + vshufi32x4 m20, m14, m16, q3131 ; 29 m20 + vshufi32x4 m16, m14, m16, q2020 ; 21 m16 + vshufi32x4 m18, m22, m24, q3131 ; 25 m18 + vshufi32x4 m14, m22, m24, q2020 ; 17 m14 + vshufi32x4 m26, m8, m9, q3131 ; 9 m26 + vshufi32x4 m22, m8, m9, q2020 ; 1 m22 + vshufi32x4 m28, m12, m13, q3131 ; 13 m28 + vshufi32x4 m24, m12, m13, q2020 ; 5 m24 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf + vpbroadcastd m13, [o(pw_16384)] + pmulhrsw m0, m13, [r4-64*21] + pmulhrsw m1, m13, [r4-64*22] + pmulhrsw m2, m13, [r4-64*23] + pmulhrsw m3, m13, [r4-64*24] + pmulhrsw m4, m13, [r4-64*25] + pmulhrsw m5, m13, [r4-64*26] + pmulhrsw m6, m13, [r4-64*27] + pmulhrsw m7, m13, [r4-64*28] + mova [cq+64*16], m14 + mova [cq+64*17], m15 + mova [cq+64*18], m16 + mova [cq+64*19], m17 + mova [cq+64*20], m18 + mova [cq+64*21], m19 + mova [cq+64*22], m20 + mova [cq+64*23], m21 + pmulhrsw m14, m13, [r4-64*12] + pmulhrsw m15, m13, [r4-64*11] + pmulhrsw m16, m13, [r4-64*10] + pmulhrsw m17, m13, [r4-64* 9] + pmulhrsw m18, m13, [r4-64* 8] + pmulhrsw m19, m13, [r4-64* 7] + pmulhrsw m20, m13, [r4-64* 6] + pmulhrsw m21, m13, [r4-64* 5] + mova [cq+64*24], m22 + mova [cq+64*25], m23 + mova [cq+64*26], m24 + mova [cq+64*27], m25 + mova [cq+64*28], m26 + mova [cq+64*29], m27 + mova [cq+64*30], m28 + mova [cq+64*31], m29 + call .transpose_2x8x8_lo + mova [r4-64*12], m1 + mova [r4-64*11], m3 + mova [r4-64*10], m5 + mova [r4-64* 9], m7 + mova [r4-64* 8], m15 + mova [r4-64* 7], m17 + mova [r4-64* 6], m19 + mova [r4-64* 5], m21 + vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01 + vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03 + vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21 + vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23 + vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41 + vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43 + vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61 + vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63 + pmulhrsw m0, m13, [r4-64*20] + pmulhrsw m1, m13, [r4-64*19] + pmulhrsw m2, m13, [r4-64*18] + pmulhrsw m3, m13, [r4-64*17] + pmulhrsw m4, m13, [r4-64*16] + pmulhrsw m5, m13, [r4-64*15] + pmulhrsw m6, m13, [r4-64*14] + pmulhrsw m7, m13, [r4-64*13] + pmulhrsw m14, m13, [r4-64*29] + pmulhrsw m15, m13, [r4-64*30] + pmulhrsw m16, m13, [r4-64*31] + pmulhrsw m17, m13, [r4-64*32] + pmulhrsw m18, m13, [r4-64*33] + pmulhrsw m19, m13, [r4-64*34] + pmulhrsw m20, m13, [r4-64*35] + pmulhrsw m21, m13, [r4-64*36] + call .transpose_2x8x8_lo + mova [r4-64*20], m1 + mova [r4-64*19], m3 + mova [r4-64*18], m5 + mova [r4-64*17], m7 + mova [r4-64*16], m15 + mova [r4-64*15], m17 + mova [r4-64*14], m19 + mova [r4-64*13], m21 + vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41 + vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43 + vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03 + vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01 + vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21 + vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23 + vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61 + vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63 + vshufi32x4 m2, m0, m22, q3131 ; 8 + vshufi32x4 m0, m22, q2020 ; 0 + vshufi32x4 m3, m1, m26, q3131 ; 12 + vshufi32x4 m1, m26, q2020 ; 4 + vshufi32x4 m6, m4, m23, q3131 ; 24 + vshufi32x4 m4, m23, q2020 ; 16 + vshufi32x4 m7, m5, m27, q3131 ; 28 + vshufi32x4 m5, m27, q2020 ; 20 + call m(inv_txfm_add_dct_dct_32x8_8bpc).main + vshufi32x4 m16, m14, m24, q3131 ; 10 + vshufi32x4 m14, m24, q2020 ; 2 + vshufi32x4 m17, m15, m28, q3131 ; 14 + vshufi32x4 m15, m28, q2020 ; 6 + vshufi32x4 m20, m18, m25, q3131 ; 26 + vshufi32x4 m18, m25, q2020 ; 18 + vshufi32x4 m21, m19, m29, q3131 ; 30 + vshufi32x4 m19, m29, q2020 ; 22 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf + mova m22, [r4-64*20] + mova m26, [r4-64*16] + mova m23, [r4-64*19] + mova m27, [r4-64*15] + mova m24, [r4-64*18] + mova m28, [r4-64*14] + mova m25, [r4-64*17] + mova m29, [r4-64*13] + mova [r4-64*20], m14 + mova [r4-64*19], m15 + mova [r4-64*18], m16 + mova [r4-64*17], m17 + mova [r4-64*16], m18 + mova [r4-64*15], m19 + mova [r4-64*14], m20 + mova [r4-64*13], m21 + mova m19, [r4-64*12] + mova m11, [r4-64* 8] + mova m20, [r4-64*11] + mova m12, [r4-64* 7] + mova m21, [r4-64*10] + mova m8, [r4-64* 6] + mova m9, [r4-64* 9] + mova m18, [r4-64* 5] + vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13 + vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11 + vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33 + vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31 + vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53 + vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51 + vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73 + vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71 + vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11 + vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13 + vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31 + vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33 + vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51 + vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53 + vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71 + vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73 + vshufi32x4 m26, m22, m27, q3131 ; 9 + vshufi32x4 m22, m27, q2020 ; 1 + vshufi32x4 m27, m23, m28, q3131 ; 11 + vshufi32x4 m23, m28, q2020 ; 3 + vshufi32x4 m28, m24, m29, q3131 ; 13 + vshufi32x4 m24, m29, q2020 ; 5 + vshufi32x4 m29, m25, m8, q3131 ; 15 + vshufi32x4 m25, m8, q2020 ; 7 + vshufi32x4 m18, m14, m19, q3131 ; 25 + vshufi32x4 m14, m19, q2020 ; 17 + vshufi32x4 m19, m15, m20, q3131 ; 27 + vshufi32x4 m15, m20, q2020 ; 19 + vshufi32x4 m20, m16, m21, q3131 ; 29 + vshufi32x4 m16, m21, q2020 ; 21 + vshufi32x4 m21, m17, m9, q3131 ; 31 + vshufi32x4 m17, m9, q2020 ; 23 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf + jmp .end +.fast: ; bottom/right halves are zero + {evex}vpmulhrsw ym8, ym23, [cq+64* 4] + {evex}vpmulhrsw xm1, xm23, [cq+64*12] + mova m28, [o(dup16_perm)] + {evex}vpmulhrsw ym7, ym23, [cq+64* 8] + vpmulhrsw ym22, ym23, [cq+64* 0] + vpermb m8, m28, m8 + vpermb ym1, ym28, ym1 + vpermb m7, m28, m7 + pmovzxwd m9, ym22 + pslld m9, 16 + call m(idct_16x16_internal_8bpc).main_fast2 + {evex}vpmulhrsw ym21, ym23, [cq+64* 2] + {evex}vpmulhrsw xm15, xm23, [cq+64*14] + {evex}vpmulhrsw xm18, xm23, [cq+64*10] + {evex}vpmulhrsw ym14, ym23, [cq+64* 6] + vpermb m21, m28, m21 + punpcklwd xm15, xm15 + vpermb ym18, ym28, ym18 + vpermb m14, m28, m14 + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + vpmulhrsw ym22, ym23, [cq+64* 1] + {evex}vpmulhrsw xm29, xm23, [cq+64*15] + {evex}vpmulhrsw xm26, xm23, [cq+64* 9] + {evex}vpmulhrsw ym25, ym23, [cq+64* 7] + {evex}vpmulhrsw ym24, ym23, [cq+64* 5] + {evex}vpmulhrsw xm27, xm23, [cq+64*11] + {evex}vpmulhrsw xm8, xm23, [cq+64*13] + {evex}vpmulhrsw ym23, [cq+64* 3] + vpermb m22, m28, m22 + punpcklwd xm29, xm29 + vpermb ym26, ym28, ym26 + vpermb m25, m28, m25 + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + REPX {vpermb x, m28, x}, m24, m27, m23 + punpcklwd xm28, xm8, xm8 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + mov r4, rsp + vpbroadcastd m13, [o(pw_16384)] + mova [r4+64*16], m4 + mova [r4+64*17], m5 + mova [r4+64*18], m6 + mova [r4+64*19], m7 + mova [r4+64*28], m26 + mova [r4+64*29], m27 + mova [r4+64*30], m28 + mova [r4+64*31], m29 + call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end + mova [r4+64*20], m22 + mova [r4+64*21], m23 + mova [r4+64*22], m24 + mova [r4+64*23], m25 + mova [r4+64*24], m26 + mova [r4+64*25], m27 + mova [r4+64*26], m28 + mova [r4+64*27], m29 + call .pass2_fast + mova [cq+64* 8], m14 + mova [cq+64* 9], m15 + mova [cq+64*10], m16 + mova [cq+64*11], m17 + mova [cq+64*12], m18 + mova [cq+64*13], m19 + mova [cq+64*14], m20 + mova [cq+64*15], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + mova [cq+64* 0], m0 + mova [cq+64* 1], m1 + mova [cq+64* 2], m2 + mova [cq+64* 3], m3 + mova [cq+64* 4], m4 + mova [cq+64* 5], m5 + mova [cq+64* 6], m6 + mova [cq+64* 7], m7 + pmulhrsw m0, m13, [r4+64*16] + pmulhrsw m1, m13, [r4+64*17] + pmulhrsw m2, m13, [r4+64*18] + pmulhrsw m3, m13, [r4+64*19] + pmulhrsw m4, m13, [r4+64*20] + pmulhrsw m5, m13, [r4+64*21] + pmulhrsw m6, m13, [r4+64*22] + pmulhrsw m7, m13, [r4+64*23] + mova [cq+64*16], m14 + mova [cq+64*17], m15 + mova [cq+64*18], m16 + mova [cq+64*19], m17 + mova [cq+64*20], m18 + mova [cq+64*21], m19 + mova [cq+64*22], m20 + mova [cq+64*23], m21 + pmulhrsw m14, m13, [r4+64*24] + pmulhrsw m15, m13, [r4+64*25] + pmulhrsw m16, m13, [r4+64*26] + pmulhrsw m17, m13, [r4+64*27] + pmulhrsw m18, m13, [r4+64*28] + pmulhrsw m19, m13, [r4+64*29] + pmulhrsw m20, m13, [r4+64*30] + pmulhrsw m21, m13, [r4+64*31] + mova [cq+64*24], m22 + mova [cq+64*25], m23 + mova [cq+64*26], m24 + mova [cq+64*27], m25 + mova [cq+64*28], m26 + mova [cq+64*29], m27 + mova [cq+64*30], m28 + mova [cq+64*31], m29 + call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round + call .pass2_fast + mova [r4+64*16], m14 + mova [r4+64*17], m15 + mova [r4+64*18], m16 + mova [r4+64*19], m17 + mova [r4+64*20], m18 + mova [r4+64*21], m19 + mova [r4+64*22], m20 + mova [r4+64*23], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast +.end: + vpbroadcastd m13, [o(pw_2048)] + lea r5, [strideq*3] + pxor m12, m12 + lea r3, [dstq+r5*8] + lea r6, [strideq+r5] ; stride*4 + add r3, r6 ; dst+stride*28 +%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi + mova m11, [cq+64*( %3)] ; 0 + mova m9, [cq+64*(31-%3)] ; 31 +%if %3 >= 8 + mova m%1, [rsp+64*(%1+16)] +%endif + mova m10, [dstq+%4] + paddsw m8, m11, m9 + psubsw m11, m9 + paddsw m9, m%1, m%2 + psubsw m%1, m%2 + punpcklbw m%2, m10, m12 + punpckhbw m10, m12 + pmulhrsw m8, m13 + pmulhrsw m9, m13 + paddw m8, m%2 + paddw m9, m10 + mova m10, [r3+%5] + pmulhrsw m11, m13 + pmulhrsw m%1, m13 + mova [cq+64*( %3)], m12 + mova [cq+64*(31-%3)], m12 + punpcklbw m%2, m10, m12 + punpckhbw m10, m12 + packuswb m8, m9 + paddw m11, m%2 + paddw m%1, m10 + packuswb m11, m%1 + mova [dstq+%4], m8 + mova [r3 +%5], m11 +%if %3 == 3 || %3 == 7 || %3 == 11 + add dstq, r6 + sub r3, r6 +%endif +%endmacro + IDCT_64x32_END 0, 29, 0, strideq*0, r5 + IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2 + IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1 + IDCT_64x32_END 3, 26, 3, r5 , strideq*0 + IDCT_64x32_END 4, 25, 4, strideq*0, r5 + IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2 + IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1 + IDCT_64x32_END 7, 22, 7, r5 , strideq*0 + IDCT_64x32_END 0, 21, 8, strideq*0, r5 + IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2 + IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1 + IDCT_64x32_END 3, 18, 11, r5 , strideq*0 + IDCT_64x32_END 4, 17, 12, strideq*0, r5 + IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2 + IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1 + IDCT_64x32_END 7, 14, 15, r5 , strideq*0 + RET +ALIGN function_align +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + imul r6d, 181 + mov r3d, 32 + add r6d, 128 + sar r6d, 8 + imul r6d, 181 + add r6d, 128+256 + sar r6d, 8+1 + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2 +ALIGN function_align +.pass1_end_part1: +%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64 +%if %1 != %3 + mova m%1, [cq+64*%1] +%endif + mova m9, [r4+64*(%3-36)] ; idct64 32+n + mova m11, [r4+64*(-5-%3)] ; idct64 63-n + psubsw m8, m%1, m%2 ; idct32 31-n + paddsw m%1, m%2 ; idct32 0+n +%if %1 == %3 + psubsw m%2, m8, m9 ; out 32+n e + paddsw m8, m9 ; out 31-n d + psubsw m9, m%1, m11 ; out 63-n h + paddsw m%1, m11 ; out 0+n a +%else + paddsw m%2, m8, m9 ; out 23-n c + psubsw m8, m9 ; out 40+n f + paddsw m9, m%1, m11 ; out 8+n b + psubsw m%1, m11 ; out 55-n g +%endif + mova [r4+64*(%3-36)], m8 + mova [r4+64*(-5-%3)], m9 +%endmacro + IDCT_64x32_PASS1_END 0, 29, 0 + IDCT_64x32_PASS1_END 1, 28, 1 + IDCT_64x32_PASS1_END 2, 27, 2 + IDCT_64x32_PASS1_END 3, 26, 3 + IDCT_64x32_PASS1_END 4, 25, 4 + IDCT_64x32_PASS1_END 5, 24, 5 + IDCT_64x32_PASS1_END 6, 23, 6 + IDCT_64x32_PASS1_END 7, 22, 7 +.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted) + punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3 + punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7 + punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3 + punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7 + punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3 + punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7 + punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3 + punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7 + punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5 + punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7 + punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1 + punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3 + punpckhdq m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3 + punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1 + punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5 + punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7 + punpckhqdq m23, m22, m27 ; 1 23 + punpcklqdq m22, m27 ; 0 22 + punpckhqdq m27, m26, m28 ; 5 27 + punpcklqdq m26, m28 ; 4 26 + punpcklqdq m28, m29, m25 ; 6 28 + punpckhqdq m29, m25 ; 7 29 + punpckhqdq m25, m24, m8 ; 3 25 + punpcklqdq m24, m8 ; 2 24 +.transpose_8x8: + punpckhwd m8, m4, m5 + punpcklwd m4, m5 + punpckhwd m5, m0, m1 + punpcklwd m0, m1 + punpckhwd m1, m6, m7 + punpcklwd m6, m7 + punpckhwd m7, m2, m3 + punpcklwd m2, m3 + punpckhdq m3, m0, m2 + punpckldq m0, m2 + punpckldq m2, m4, m6 + punpckhdq m4, m6 + punpckhdq m6, m5, m7 + punpckldq m5, m7 + punpckldq m7, m8, m1 + punpckhdq m8, m1 + punpckhqdq m1, m0, m2 + punpcklqdq m0, m2 + punpcklqdq m2, m3, m4 + punpckhqdq m3, m4 + punpcklqdq m4, m5, m7 + punpckhqdq m5, m7 + punpckhqdq m7, m6, m8 + punpcklqdq m6, m8 + ret +.pass1_end_part2: + IDCT_64x32_PASS1_END 0, 21, 8 + IDCT_64x32_PASS1_END 1, 20, 9 + IDCT_64x32_PASS1_END 2, 19, 10 + IDCT_64x32_PASS1_END 3, 18, 11 + IDCT_64x32_PASS1_END 4, 17, 12 + IDCT_64x32_PASS1_END 5, 16, 13 + IDCT_64x32_PASS1_END 6, 15, 14 + IDCT_64x32_PASS1_END 7, 14, 15 +.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21 + punpcklwd m8, m3, m2 + punpckhwd m3, m2 + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m7, m6 + punpckhwd m7, m6 + punpcklwd m6, m5, m4 + punpckhwd m5, m4 + punpckldq m4, m7, m5 + punpckhdq m7, m5 + punpckldq m5, m8, m2 + punpckhdq m8, m2 + punpckhdq m2, m0, m6 + punpckldq m0, m6 + punpckldq m6, m3, m1 + punpckhdq m3, m1 + punpckhqdq m1, m0, m5 + punpcklqdq m0, m5 + punpckhqdq m5, m4, m6 + punpcklqdq m4, m6 + punpcklqdq m6, m7, m3 + punpckhqdq m7, m3 + punpckhqdq m3, m2, m8 + punpcklqdq m2, m8 + punpckhwd m8, m18, m19 + punpcklwd m18, m19 + punpckhwd m19, m14, m15 + punpcklwd m14, m15 + punpckhwd m15, m20, m21 + punpcklwd m20, m21 + punpckhwd m21, m16, m17 + punpcklwd m16, m17 + punpckhdq m17, m14, m16 + punpckldq m14, m16 + punpckldq m16, m18, m20 + punpckhdq m18, m20 + punpckhdq m20, m19, m21 + punpckldq m19, m21 + punpckldq m21, m8, m15 + punpckhdq m8, m15 + punpckhqdq m15, m14, m16 + punpcklqdq m14, m16 + punpcklqdq m16, m17, m18 + punpckhqdq m17, m18 + punpcklqdq m18, m19, m21 + punpckhqdq m19, m21 + punpckhqdq m21, m20, m8 + punpcklqdq m20, m8 + ret +.pass2_fast: + vshufi32x4 m24, m9, m15, q3131 ; 5 + vshufi32x4 m22, m9, m15, q2020 ; 1 + vshufi32x4 m15, m1, m16, q3131 ; 6 + vshufi32x4 m14, m1, m16, q2020 ; 2 + vshufi32x4 m1, m0, m3, q3131 ; 4 + vshufi32x4 m0, m3, q2020 ; 0 + vshufi32x4 m3, m8, m2, q3131 ; 12 + vshufi32x4 m2, m8, m2, q2020 ; 8 + vshufi32x4 m25, m11, m17, q3131 ; 7 + vshufi32x4 m23, m11, m17, q2020 ; 3 + vshufi32x4 m17, m5, m19, q3131 ; 14 + vshufi32x4 m16, m5, m19, q2020 ; 10 + vshufi32x4 m29, m6, m20, q3131 ; 15 + vshufi32x4 m27, m6, m20, q2020 ; 11 + vshufi32x4 m28, m4, m18, q3131 ; 13 + vshufi32x4 m26, m4, m18, q2020 ; 9 + jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + +cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob + lea r5, [o_base] + test eobd, eobd + jz .dconly + PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob +%undef cmp + cmp eobd, 136 + jb .fast + mova m0, [cq+64* 1] + mova m1, [cq+64*31] + mova m2, [cq+64*17] + mova m3, [cq+64*15] + vpbroadcastd m10, [o(pd_2048)] + mov r4, rsp + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, [cq+64* 7] + mova m1, [cq+64*25] + mova m2, [cq+64*23] + mova m3, [cq+64* 9] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, [cq+64* 5] + mova m1, [cq+64*27] + mova m2, [cq+64*21] + mova m3, [cq+64*11] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + mova m0, [cq+64* 3] + mova m1, [cq+64*29] + mova m2, [cq+64*19] + mova m3, [cq+64*13] + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + mova m0, [cq+64* 0] + mova m1, [cq+64* 8] + mova m2, [cq+64*16] + mova m3, [cq+64*24] + mova m14, [cq+64* 4] + mova m15, [cq+64*12] + mova m16, [cq+64*20] + mova m17, [cq+64*28] + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + mova m22, [cq+64* 2] + mova m29, [cq+64*30] + mova m26, [cq+64*18] + mova m25, [cq+64*14] + mova m24, [cq+64*10] + mova m27, [cq+64*22] + mova m28, [cq+64*26] + mova m23, [cq+64* 6] + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + vpbroadcastd m13, [o(pw_8192)] + call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1 + mova [r4+64*36], m1 + mova [r4+64*37], m3 + mova [r4+64*38], m5 + mova [r4+64*39], m7 + mova [r4+64*44], m23 + mova [r4+64*45], m25 + mova [r4+64*46], m27 + mova [r4+64*47], m29 + pmulhrsw m23, m13, m0 ; a0 + pmulhrsw m25, m13, m2 ; a2 + pmulhrsw m27, m13, m4 ; a4 + pmulhrsw m29, m13, m6 ; a6 + call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2 + lea r6, [r4-64*4] + add r4, 64*28 + call .pass2_end + mov r4, rsp + mova m0, [r4+64*23] + mova m1, [r4+64*22] + mova m2, [r4+64*21] + mova m3, [r4+64*20] + mova m4, [r4+64*19] + mova m5, [r4+64*18] + mova m6, [r4+64*17] + mova m7, [r4+64*16] + mova m22, [r4+64*15] + mova m23, [r4+64*14] + mova m24, [r4+64*13] + mova m25, [r4+64*12] + mova m26, [r4+64*11] + mova m27, [r4+64*10] + mova m28, [r4+64* 9] + mova m29, [r4+64* 8] + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi + vpbroadcastd m13, [o(pw_8192)] + mova [r4+64* 8], m1 + mova [r4+64* 9], m3 + mova [r4+64*10], m5 + mova [r4+64*11], m7 + mova [r4+64*16], m23 + mova [r4+64*17], m25 + mova [r4+64*18], m27 + mova [r4+64*19], m29 + pmulhrsw m23, m13, m0 ; b0 + pmulhrsw m25, m13, m2 ; b2 + pmulhrsw m27, m13, m4 ; b4 + pmulhrsw m29, m13, m6 ; b6 + mova m0, [r4+64*31] + mova m1, [r4+64*30] + mova m2, [r4+64*29] + mova m3, [r4+64*28] + mova m4, [r4+64*27] + mova m5, [r4+64*26] + mova m6, [r4+64*25] + mova m7, [r4+64*24] + mova m14, [r4+64* 7] + mova m15, [r4+64* 6] + mova m16, [r4+64* 5] + mova m17, [r4+64* 4] + mova m18, [r4+64* 3] + mova m19, [r4+64* 2] + mova m20, [r4+64* 1] + mova m21, [r4+64* 0] + call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo + mov r6, cq + call .pass2_end + jmp .end +.fast: ; bottom/right halves are zero + mova m28, [o(dup16_perm)] + pmovzxwd m9, [cq+64* 0] + vpermb m8, m28, [cq+64* 4] + vpermb ym1, ym28, [cq+64*12] + vpermb m7, m28, [cq+64* 8] + pslld m9, 16 + call m(idct_16x16_internal_8bpc).main_fast2 + vpermb m21, m28, [cq+64* 2] + vpermb ym15, ym28, [cq+64*14] + vpermb ym18, ym28, [cq+64*10] + vpermb m14, m28, [cq+64* 6] + call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2 + vpermb m22, m28, [cq+64* 1] + vpermb ym29, ym28, [cq+64*15] + vpermb ym26, ym28, [cq+64* 9] + vpermb m25, m28, [cq+64* 7] + vpermb m24, m28, [cq+64* 5] + vpermb ym27, ym28, [cq+64*11] + vpermb m23, m28, [cq+64* 3] + vpermb ym28, ym28, [cq+64*13] + mova [cq+64* 0], m14 + mova [cq+64* 1], m15 + mova [cq+64* 2], m16 + mova [cq+64* 3], m17 + mova [cq+64* 4], m18 + mova [cq+64* 5], m19 + mova [cq+64* 6], m20 + mova [cq+64* 7], m21 + call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast + vpbroadcastd m13, [o(pw_8192)] + mova [cq+64*16], m4 + mova [cq+64*17], m5 + mova [cq+64*18], m6 + mova [cq+64*19], m7 + mova [cq+64*28], m26 + mova [cq+64*29], m27 + mova [cq+64*30], m28 + mova [cq+64*31], m29 + call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end + mova [cq+64*20], m22 + mova [cq+64*21], m23 + mova [cq+64*22], m24 + mova [cq+64*23], m25 + mova [cq+64*24], m26 + mova [cq+64*25], m27 + mova [cq+64*26], m28 + mova [cq+64*27], m29 + lea r4, [rsp+64*64] + lea r3, [rsp+64*32] + call .pass2_fast + pmulhrsw m0, m13, [cq+64*16] + pmulhrsw m1, m13, [cq+64*17] + pmulhrsw m2, m13, [cq+64*18] + pmulhrsw m3, m13, [cq+64*19] + pmulhrsw m4, m13, [cq+64*20] + pmulhrsw m5, m13, [cq+64*21] + pmulhrsw m6, m13, [cq+64*22] + pmulhrsw m7, m13, [cq+64*23] + pmulhrsw m14, m13, [cq+64*24] + pmulhrsw m15, m13, [cq+64*25] + pmulhrsw m16, m13, [cq+64*26] + pmulhrsw m17, m13, [cq+64*27] + pmulhrsw m18, m13, [cq+64*28] + pmulhrsw m19, m13, [cq+64*29] + pmulhrsw m20, m13, [cq+64*30] + pmulhrsw m21, m13, [cq+64*31] + call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round + mov r4, rsp + mov r3, cq + call .pass2_fast +.end: + vpbroadcastd m17, [o(pw_2048)] + lea r5, [strideq*8] + mov r3, dstq + pxor m16, m16 + sub r4, 64*5 ; rsp+64*31 + mov r6, rsp +.end_loop: + mova m2, [r6+64*32] ; idct16 0+n lo + mova m7, [r6+64*48] ; idct32 31-n lo + mova m6, [cq+64* 0] ; idct16 0+n hi + mova m0, [cq+64*16] ; idct32 31-n hi + mova m4, [r4+64*64] ; idct64 63-n lo + mova m1, [r4+64* 0] ; idct64 63-n hi + mova m5, [r6+64*64] ; idct64 32+n lo + mova m8, [r6+64* 0] ; idct64 32+n hi + sub r3, strideq + paddsw m3, m2, m7 ; idct32 0+n lo + mova m12, [dstq+r5*0] + psubsw m2, m7 ; idct32 31-n lo + mova m15, [r3 +r5*8] + paddsw m7, m6, m0 ; idct32 0+n hi + mova m13, [r3 +r5*4] + psubsw m6, m0 ; idct32 31-n hi + mova m14, [dstq+r5*4] + paddsw m0, m3, m4 ; out 0+n lo + add r6, 64 + psubsw m3, m4 ; out 63-n lo + sub r4, 64 + paddsw m4, m7, m1 ; out 0+n hi + mova [cq+64* 0], m16 + psubsw m7, m1 ; out 63-n hi + mova [cq+64*16], m16 + paddsw m1, m2, m5 ; out 31-n lo + add cq, 64 + psubsw m2, m5 ; out 32+n lo + paddsw m5, m6, m8 ; out 31-n hi + psubsw m6, m8 ; out 32+n hi + pmulhrsw m0, m17 + punpcklbw m8, m12, m16 + pmulhrsw m4, m17 + punpckhbw m12, m16 + pmulhrsw m3, m17 + punpcklbw m11, m15, m16 + pmulhrsw m7, m17 + punpckhbw m15, m16 + pmulhrsw m1, m17 + punpcklbw m9, m13, m16 + pmulhrsw m5, m17 + punpckhbw m13, m16 + pmulhrsw m2, m17 + punpcklbw m10, m14, m16 + pmulhrsw m6, m17 + punpckhbw m14, m16 + paddw m0, m8 + paddw m4, m12 + packuswb m0, m4 + paddw m3, m11 + paddw m7, m15 + packuswb m3, m7 + paddw m1, m9 + paddw m5, m13 + packuswb m1, m5 + paddw m2, m10 + paddw m6, m14 + packuswb m2, m6 + mova [dstq+r5*0], m0 + mova [r3 +r5*8], m3 + mova [r3 +r5*4], m1 + mova [dstq+r5*4], m2 + add dstq, strideq + cmp r6, r4 + jb .end_loop + RET +.dconly: + movsx r6d, word [cq] + mov [cq], eobd + mov r3d, 64 + jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly +ALIGN function_align +.pass2_end: + REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6 + mova [r4+64*20], m1 + mova [r4+64*21], m3 + mova [r4+64*22], m5 + mova [r4+64*23], m7 + vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01 + vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03 + vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01 + vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03 + mova [r4+64*12], m15 + mova [r4+64*13], m17 + mova [r4+64*14], m19 + mova [r4+64*15], m21 + vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41 + vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43 + vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41 + vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43 + vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21 + vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23 + vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21 + vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23 + vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61 + vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63 + vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63 + vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61 + vshufi32x4 m0, m1, m5, q2020 ; 0 + vshufi32x4 m1, m5, q3131 ; 8 + vshufi32x4 m2, m3, m14, q2020 ; 16 + vshufi32x4 m3, m14, q3131 ; 24 + vshufi32x4 m14, m15, m18, q2020 ; 4 + vshufi32x4 m15, m18, q3131 ; 12 + vshufi32x4 m16, m17, m19, q2020 ; 20 + vshufi32x4 m17, m19, q3131 ; 28 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast + vshufi32x4 m24, m22, m25, q3131 ; 10 + vshufi32x4 m22, m25, q2020 ; 2 + vshufi32x4 m25, m23, m28, q3131 ; 14 + vshufi32x4 m23, m28, q2020 ; 6 + vshufi32x4 m28, m26, m27, q3131 ; 26 + vshufi32x4 m26, m27, q2020 ; 18 + vshufi32x4 m27, m29, m13, q2020 ; 22 + vshufi32x4 m29, m13, q3131 ; 30 + mova [r6+64* 0], m0 + mova [r6+64* 1], m1 + mova [r6+64* 2], m2 + mova [r6+64* 3], m3 + mova [r6+64* 4], m4 + mova [r6+64* 5], m5 + mova [r6+64* 6], m6 + mova [r6+64* 7], m7 + mova [r6+64* 8], m14 + mova [r6+64* 9], m15 + mova [r6+64*10], m16 + mova [r6+64*11], m17 + mova [r6+64*12], m18 + mova [r6+64*13], m19 + mova [r6+64*14], m20 + mova [r6+64*15], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast + vpbroadcastd m13, [o(pw_8192)] + mova [r6+64*16], m29 + mova [r6+64*17], m28 + mova [r6+64*18], m27 + mova [r6+64*19], m26 + mova [r6+64*20], m25 + mova [r6+64*21], m24 + mova [r6+64*22], m23 + mova [r6+64*23], m22 + mova [r6+64*24], m21 + mova [r6+64*25], m20 + mova [r6+64*26], m19 + mova [r6+64*27], m18 + mova [r6+64*28], m17 + mova [r6+64*29], m16 + mova [r6+64*30], m15 + mova [r6+64*31], m14 + pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25 + pmulhrsw m16, m13, [r4+64*12] + pmulhrsw m17, m13, [r4+64*16] + pmulhrsw m18, m13, [r4+64*20] + pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31 + pmulhrsw m20, m13, [r4+64*15] + pmulhrsw m21, m13, [r4+64*19] + pmulhrsw m22, m13, [r4+64*23] + vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9 + vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25 + vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9 + vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25 + pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29 + pmulhrsw m24, m13, [r4+64*14] + pmulhrsw m25, m13, [r4+64*18] + pmulhrsw m26, m13, [r4+64*22] + vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15 + vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31 + vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15 + vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31 + pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27 + pmulhrsw m28, m13, [r4+64*13] + pmulhrsw m29, m13, [r4+64*17] + pmulhrsw m13, [r4+64*21] + vshufi32x4 m0, m14, m16, q2020 ; 1 + vshufi32x4 m1, m19, m21, q3131 ; 31 + vshufi32x4 m2, m15, m17, q2020 ; 17 + vshufi32x4 m3, m18, m20, q3131 ; 15 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + vshufi32x4 m0, m18, m20, q2020 ; 7 + vshufi32x4 m1, m15, m17, q3131 ; 25 + vshufi32x4 m2, m19, m21, q2020 ; 23 + vshufi32x4 m3, m14, m16, q3131 ; 9 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13 + vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29 + vinserti32x8 m24, m25, ym26, 1 ; e5 e13 g5 g13 + vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29 + vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11 + vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27 + vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11 + vshufi32x4 m29, m13, q3232 ; e19 e17 g19 g27 + vshufi32x4 m0, m22, m24, q2020 ; 5 + vshufi32x4 m1, m27, m29, q3131 ; 27 + vshufi32x4 m2, m23, m25, q2020 ; 21 + vshufi32x4 m3, m26, m28, q3131 ; 11 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + vshufi32x4 m0, m26, m28, q2020 ; 3 + vshufi32x4 m1, m23, m25, q3131 ; 29 + vshufi32x4 m2, m27, m29, q2020 ; 19 + vshufi32x4 m3, m22, m24, q3131 ; 13 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1 + jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 +ALIGN function_align +.pass2_fast: + vshufi32x4 m23, m1, m16, q3131 ; 6 + vshufi32x4 m22, m1, m16, q2020 ; 2 + vshufi32x4 m14, m0, m3, q3131 ; 4 + vshufi32x4 m26, m0, m3, q2020 ; 0 + vshufi32x4 m28, m9, m15, q3131 ; 5 + vshufi32x4 m0, m9, m15, q2020 ; 1 + vshufi32x4 m16, m11, m17, q3131 ; 7 + vshufi32x4 m29, m11, m17, q2020 ; 3 + vshufi32x4 m15, m8, m2, q3131 ; 12 + vshufi32x4 m27, m8, m2, q2020 ; 8 + vshufi32x4 m25, m5, m19, q3131 ; 14 + vshufi32x4 m24, m5, m19, q2020 ; 10 + vshufi32x4 m3, m6, m20, q3131 ; 15 + vshufi32x4 m19, m6, m20, q2020 ; 11 + vshufi32x4 m17, m4, m18, q3131 ; 13 + vshufi32x4 m18, m4, m18, q2020 ; 9 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m16 + mova m3, m18 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m28 + mova m3, m19 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + mova m0, m29 + mova m3, m17 + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast + call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2 + mova m0, m26 + mova m1, m27 + call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2 + mova [r3+64* 0], m0 + mova [r3+64* 1], m1 + mova [r3+64* 2], m2 + mova [r3+64* 3], m3 + mova [r3+64* 4], m4 + mova [r3+64* 5], m5 + mova [r3+64* 6], m6 + mova [r3+64* 7], m7 + mova [r3+64* 8], m14 + mova [r3+64* 9], m15 + mova [r3+64*10], m16 + mova [r3+64*11], m17 + mova [r3+64*12], m18 + mova [r3+64*13], m19 + mova [r3+64*14], m20 + mova [r3+64*15], m21 + call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2 + mova [r3+64*16], m29 + mova [r3+64*17], m28 + mova [r3+64*18], m27 + mova [r3+64*19], m26 + mova [r3+64*20], m25 + mova [r3+64*21], m24 + mova [r3+64*22], m23 + mova [r3+64*23], m22 + mova [r3+64*24], m21 + mova [r3+64*25], m20 + mova [r3+64*26], m19 + mova [r3+64*27], m18 + mova [r3+64*28], m17 + mova [r3+64*29], m16 + mova [r3+64*30], m15 + mova [r3+64*31], m14 + ret + +%endif ; ARCH_X86_64 |