summaryrefslogtreecommitdiffstats
path: root/third_party/dav1d/src/x86/itx_avx512.asm
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/dav1d/src/x86/itx_avx512.asm')
-rw-r--r--third_party/dav1d/src/x86/itx_avx512.asm7324
1 files changed, 7324 insertions, 0 deletions
diff --git a/third_party/dav1d/src/x86/itx_avx512.asm b/third_party/dav1d/src/x86/itx_avx512.asm
new file mode 100644
index 0000000000..a3d4ebdfb4
--- /dev/null
+++ b/third_party/dav1d/src/x86/itx_avx512.asm
@@ -0,0 +1,7324 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 64
+const \
+int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
+ db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
+ db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
+ db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
+int8_permB: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
+ db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
+ db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
+ db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
+int16_perm: db 0, 1, 32, 33, 2, 3, 34, 35, 4, 5, 36, 37, 6, 7, 38, 39
+ db 8, 9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
+ db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
+ db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
+dup16_perm: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
+ db 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15
+ db 16, 17, 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23
+ db 24, 25, 24, 25, 26, 27, 26, 27, 28, 29, 28, 29, 30, 31, 30, 31
+idtx_16x4p: db 0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23
+ db 32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55
+ db 8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31
+ db 40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63
+idct_8x32p: db 60, 61, 4, 5, 32, 33, 0, 1, 28, 29, 36, 37, 56, 57, 8, 9
+ db 12, 13, 52, 53, 24, 25, 40, 41, 44, 45, 20, 21, 48, 49, 16, 17
+ db 62, 63, 2, 3, 6, 7, 58, 59, 54, 55, 10, 11, 14, 15, 50, 51
+ db 46, 47, 18, 19, 22, 23, 42, 43, 38, 39, 26, 27, 30, 31, 34, 35
+idct_16x32p: db 6, 7, 58, 59, 38, 39, 26, 27, 32, 33, 0, 1, 30, 31, 34, 35
+ db 46, 47, 18, 19, 22, 23, 42, 43, 24, 25, 40, 41, 44, 45, 20, 21
+ db 62, 63, 2, 3, 48, 49, 16, 17, 56, 57, 8, 9, 14, 15, 50, 51
+ db 54, 55, 10, 11, 60, 61, 4, 5, 12, 13, 52, 53, 28, 29, 36, 37
+end_16x32p: db 0, 32, 1, 48, 2, 36, 3, 52, 16, 40, 17, 56, 18, 44, 19, 60
+ db 4, 33, 5, 49, 6, 37, 7, 53, 20, 41, 21, 57, 22, 45, 23, 61
+ db 8, 35, 9, 51, 10, 39, 11, 55, 24, 43, 25, 59, 26, 47, 27, 63
+ db 12, 34, 13, 50, 14, 38, 15, 54, 28, 42, 29, 58, 30, 46, 31, 62
+
+; packed 4-bit qword shuffle indices
+permA: dq 0x1c0d0d1ce0d94040, 0x5849495868fb6262
+ dq 0x3e2f2f3ef1c85151, 0x7a6b6b7a79ea7373
+ dq 0x94858594a451c8d9, 0xd0c1c1d02c73eafb
+ dq 0xb6a7a7b6b540d9c8, 0xf2e3e3f23d62fbea
+permB: dq 0x40acbd0fcadb0f40, 0x518e9f3ce8f99604
+ dq 0xc824352d56128751, 0xd906171e74301e15
+ dq 0x6271604b03472d62, 0x735342782165b426
+ dq 0xeaf9e8699f8ea573, 0xfbdbca5abdac3c37
+permC: dq 0x9d409d041551c2e0, 0xbf62bf263773a486
+ dq 0xc88c8c15409dd3f1, 0xeaaeae3762bfb597
+ dq 0x04d9158c8cc84a68, 0x26fb37aeaeea2c0e
+ dq 0x5115049dd9045b79, 0x733726bffb263d1f
+permD: dq 0x0cda098800041504, 0x0edb09b2028c3726
+ dq 0x0f11fa9c01150415, 0x0988f326039d2637
+ dq 0x05640f1108269d8c, 0x05290edb0aaebfae
+ dq 0x0005000509378c9d, 0xffffffff0bbfaebf
+
+pd_0to15: dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+gather8a: dd 0, 2, 1, 3, 8, 10, 9, 11
+gather8b: dd 0, 1, 4, 5, 8, 9, 12, 13
+gather8c: dd 0, 4, 2, 6, 12, 8, 14, 10
+gather8d: dd 0, 3, 1, 2, 8, 11, 9, 10
+
+int_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+int_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
+int_shuf3: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
+int_shuf4: db 8, 9, 0, 1, 12, 13, 4, 5, 10, 11, 2, 3, 14, 15, 6, 7
+deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+int_mshift: db 12, 20, 0, 0, 44, 52, 0, 0
+
+pb_32: times 4 db 32
+pw_2048: times 2 dw 2048
+pw_4096: times 2 dw 4096
+pw_8192: times 2 dw 8192
+pw_16384: times 2 dw 16384
+pw_1697x16: times 2 dw 1697*16
+pw_1697x8: times 2 dw 1697*8
+pw_2896x8: times 2 dw 2896*8
+pd_2048: dd 2048
+
+%define pw_5 (permD+52)
+%define pd_m1 (permD+60)
+%define pw_3803_1321 (permD+44)
+%define pw_2482_3803 (permD+12)
+%define pw_2440_3290 (permD+ 4)
+%define pw_m3290_2440 (permD+28)
+%define pw_3857_1380 (permD+36)
+%define pw_m1380_3857 (permD+20)
+
+pw_8192_m8192: dw 8192, -8192
+pw_m8192_8192: dw -8192, 8192
+pw_16384_m16384: dw 16384, -16384
+pw_m16384_16384: dw -16384, 16384
+
+pw_m1321_2482: dw -1321, 2482
+pw_m3344_3344: dw -3344, 3344
+pw_2482_3344: dw 2482, 3344
+pw_m3803_3344: dw -3803, 3344
+pd_3344: dd 3344
+pw_m1321_m3344: dw -1321, -3344
+pw_2896_m2896: dw 2896, -2896
+
+pw_1567_m3784: dw 1567, -3784
+pw_3784_m1567: dw 3784, -1567
+pw_4017_m799: dw 4017, -799
+pw_2276_m3406: dw 2276, -3406
+pw_m799_m4017: dw -799, -4017
+pw_m3406_m2276: dw -3406, -2276
+
+%macro COEF_PAIR 2-3 0
+pw_%1_%2: dw %1, %2
+pw_m%2_%1: dw -%2, %1
+%if %3
+pw_m%1_m%2: dw -%1, -%2
+%endif
+%endmacro
+
+COEF_PAIR 2896, 2896
+COEF_PAIR 1567, 3784, 1
+COEF_PAIR 3784, 1567
+COEF_PAIR 201, 4091
+COEF_PAIR 995, 3973
+COEF_PAIR 1751, 3703
+COEF_PAIR 3035, 2751
+COEF_PAIR 3513, 2106
+COEF_PAIR 4052, 601
+COEF_PAIR 3166, 2598, 1
+COEF_PAIR 3920, 1189, 1
+COEF_PAIR 2276, 3406
+COEF_PAIR 4017, 799
+
+%macro COEF_X8 1-*
+%rep %0
+ dw %1*8, %1*8
+ %rotate 1
+%endrep
+%endmacro
+
+pw_m2276x8: COEF_X8 -2276
+pw_3406x8: COEF_X8 3406
+pw_4017x8: COEF_X8 4017
+pw_799x8: COEF_X8 799
+pw_3784x8: COEF_X8 3784
+pw_1567x8: COEF_X8 1567
+
+pw_4076x8: COEF_X8 4076
+pw_401x8: COEF_X8 401
+pw_m2598x8: COEF_X8 -2598
+pw_3166x8: COEF_X8 3166
+pw_3612x8: COEF_X8 3612
+pw_1931x8: COEF_X8 1931
+pw_m1189x8: COEF_X8 -1189
+pw_3920x8: COEF_X8 3920
+
+pw_4091x8: COEF_X8 4091
+pw_201x8: COEF_X8 201
+pw_m2751x8: COEF_X8 -2751
+pw_3035x8: COEF_X8 3035
+pw_3703x8: COEF_X8 3703
+pw_1751x8: COEF_X8 1751
+pw_m1380x8: COEF_X8 -1380
+pw_3857x8: COEF_X8 3857
+pw_3973x8: COEF_X8 3973
+pw_995x8: COEF_X8 995
+pw_m2106x8: COEF_X8 -2106
+pw_3513x8: COEF_X8 3513
+pw_3290x8: COEF_X8 3290
+pw_2440x8: COEF_X8 2440
+pw_m601x8: COEF_X8 -601
+pw_4052x8: COEF_X8 4052
+
+pw_401_4076x8: dw 401*8, 4076*8
+pw_m2598_3166x8: dw -2598*8, 3166*8
+pw_1931_3612x8: dw 1931*8, 3612*8
+pw_m1189_3920x8: dw -1189*8, 3920*8
+pw_799_4017x8: dw 799*8, 4017*8
+pw_m2276_3406x8: dw -2276*8, 3406*8
+
+pw_201_4091x8: dw 201*8, 4091*8
+pw_m601_4052x8: dw -601*8, 4052*8
+pw_995_3973x8: dw 995*8, 3973*8
+pw_m1380_3857x8: dw -1380*8, 3857*8
+pw_1751_3703x8: dw 1751*8, 3703*8
+pw_m2106_3513x8: dw -2106*8, 3513*8
+pw_2440_3290x8: dw 2440*8, 3290*8
+pw_m2751_3035x8: dw -2751*8, 3035*8
+
+pw_101_4095x8: dw 101*8, 4095*8
+pw_m2824_2967x8: dw -2824*8, 2967*8
+pw_1660_3745x8: dw 1660*8, 3745*8
+pw_m1474_3822x8: dw -1474*8, 3822*8
+pw_897_3996x8: dw 897*8, 3996*8
+pw_m2191_3461x8: dw -2191*8, 3461*8
+pw_2359_3349x8: dw 2359*8, 3349*8
+pw_m700_4036x8: dw -700*8, 4036*8
+pw_501_4065x8: dw 501*8, 4065*8
+pw_m2520_3229x8: dw -2520*8, 3229*8
+pw_2019_3564x8: dw 2019*8, 3564*8
+pw_m1092_3948x8: dw -1092*8, 3948*8
+pw_1285_3889x8: dw 1285*8, 3889*8
+pw_m1842_3659x8: dw -1842*8, 3659*8
+pw_2675_3102x8: dw 2675*8, 3102*8
+pw_m301_4085x8: dw -301*8, 4085*8
+
+idct64_mul: COEF_X8 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474
+COEF_PAIR 401, 4076, 1
+COEF_PAIR 799, 4017
+ COEF_X8 -700, 4036, 2359, 3349, -2191, 3461, 897, 3996
+dw -2598, -3166, 3166, -2598, 2598, 3166, -4017, -799, 799, -4017
+ COEF_X8 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092
+COEF_PAIR 1931, 3612, 1
+COEF_PAIR 3406, 2276
+ COEF_X8 -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889
+dw -1189, -3920, 3920, -1189, 1189, 3920, -2276, -3406, 3406, -2276
+
+SECTION .text
+
+%define o_base int8_permA+64*18
+%define o(x) (r5 - (o_base) + (x))
+%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
+
+; flags: 1 = swap, 2 = interleave (l), 4 = interleave (t), 8 = no_pack,
+; 16 = special_mul1, 32 = special_mul2
+%macro ITX_MUL2X_PACK 6-7 0 ; dst/src, tmp[1-2], rnd, coef[1-2], flags
+ mova m%2, m%4
+%if %7 & 16
+ vpdpwssd m%2, m%1, [o(pw_%5)] {bcstd}
+ mova m%3, m%4
+%if %7 & 32
+ vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%else
+ vpdpwssd m%3, m%1, m%6
+%endif
+%elif %7 & 32
+ vpdpwssd m%2, m%1, m%5
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_%6)] {bcstd}
+%elif %6 < 32
+ vpdpwssd m%2, m%1, m%5
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, m%6
+%elif %7 & 1
+ vpdpwssd m%2, m%1, [o(pw_%5_%6)] {bcstd}
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_m%6_%5)] {bcstd}
+%else
+ vpdpwssd m%2, m%1, [o(pw_m%6_%5)] {bcstd}
+ mova m%3, m%4
+ vpdpwssd m%3, m%1, [o(pw_%5_%6)] {bcstd}
+%endif
+%if %7 & 2
+ psrld m%2, 12
+ pslld m%3, 4
+ vpshrdd m%1, m%3, m%2, 16
+%elif %7 & 4
+ ; compared to using shifts (as above) this has better throughput,
+ ; but worse latency and requires setting up the opmask/index
+ ; registers, so only use this method for the larger transforms
+ pslld m%1, m%2, 4
+ vpmultishiftqb m%1{k7}, m13, m%3
+%else
+ psrad m%2, 12
+ psrad m%3, 12
+%if %7 & 8 == 0
+ packssdw m%1, m%3, m%2
+%endif
+%endif
+%endmacro
+
+; flags: same as ITX_MUL2X_PACK
+%macro ITX_MUL4X_PACK 10-11 0 ; dst/src, tmp[1-2], coef_tmp[1-2], rnd, coef[1-4], flags
+%if %11 & 1
+ vpbroadcastd m%4, [o(pw_%9_%10)]
+ vpbroadcastd m%4{k1}, [o(pw_%7_%8)]
+ vpbroadcastd m%5, [o(pw_m%10_%9)]
+ vpbroadcastd m%5{k1}, [o(pw_m%8_%7)]
+%else
+ vpbroadcastd m%4, [o(pw_m%10_%9)]
+ vpbroadcastd m%4{k1}, [o(pw_m%8_%7)]
+ vpbroadcastd m%5, [o(pw_%9_%10)]
+ vpbroadcastd m%5{k1}, [o(pw_%7_%8)]
+%endif
+ ITX_MUL2X_PACK %1, %2, %3, %6, %4, %5, %11
+%endmacro
+
+; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
+; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
+%macro ITX_MULSUB_2W 7-8 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2], dst2
+ punpcklwd m%3, m%2, m%1
+ punpckhwd m%2, m%1
+%if %7 < 32
+ mova m%1, m%5
+ vpdpwssd m%1, m%3, m%7
+ mova m%4, m%5
+ vpdpwssd m%4, m%2, m%7
+%else
+ mova m%1, m%5
+ vpdpwssd m%1, m%3, [o(pw_m%7_%6)] {bcstd}
+ mova m%4, m%5
+ vpdpwssd m%4, m%2, [o(pw_m%7_%6)] {bcstd}
+%endif
+ psrad m%1, 12
+ psrad m%4, 12
+ packssdw m%1, m%4
+ mova m%4, m%5
+%if %7 < 32
+ vpdpwssd m%4, m%2, m%6
+ mova m%2, m%5
+ vpdpwssd m%2, m%3, m%6
+%else
+ vpdpwssd m%4, m%2, [o(pw_%6_%7)] {bcstd}
+ mova m%2, m%5
+ vpdpwssd m%2, m%3, [o(pw_%6_%7)] {bcstd}
+%endif
+ psrad m%4, 12
+ psrad m%2, 12
+%if %0 == 8
+ packssdw m%8, m%2, m%4
+%else
+ packssdw m%2, m%4
+%endif
+%endmacro
+
+%macro WRAP_XMM 1+
+ %xdefine %%reset RESET_MM_PERMUTATION
+ INIT_XMM cpuname
+ DEFINE_MMREGS xmm
+ AVX512_MM_PERMUTATION
+ %1
+ %%reset
+%endmacro
+
+%macro WRAP_YMM 1+
+ INIT_YMM cpuname
+ %1
+ INIT_ZMM cpuname
+%endmacro
+
+%macro ITX4_END 4-5 2048 ; row[1-4], rnd
+%if %5
+ vpbroadcastd m2, [o(pw_%5)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+%endif
+ lea r2, [dstq+strideq*2]
+%assign %%i 1
+%rep 4
+ %if %1 & 2
+ CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
+ %else
+ CAT_XDEFINE %%row_adr, %%i, dstq + strideq*(%1&1)
+ %endif
+ %assign %%i %%i + 1
+ %rotate 1
+%endrep
+ movd m2, [%%row_adr1]
+ pinsrd m2, [%%row_adr2], 1
+ movd m3, [%%row_adr3]
+ pinsrd m3, [%%row_adr4], 1
+ pmovzxbw m2, m2
+ pmovzxbw m3, m3
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ movd [%%row_adr1], m0
+ pextrd [%%row_adr2], m0, 1
+ pextrd [%%row_adr3], m0, 2
+ pextrd [%%row_adr4], m0, 3
+ ret
+%endmacro
+
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, 0, dst, stride, c, eob, tx2, base
+ %define %%p1 m(i%1_%3_internal_8bpc)
+ lea baseq, [o_base]
+ ; Jump to the 1st txfm function if we're not taking the fast path, which
+ ; in turn performs an indirect jump to the 2nd txfm function.
+ lea tx2q, [m(i%2_%3_internal_8bpc).pass2]
+%ifidn %1_%2, dct_dct
+ test eobd, eobd
+ jnz %%p1
+%else
+ ; jump to the 1st txfm function unless it's located directly after this
+ times ((%%end - %%p1) >> 31) & 1 jmp %%p1
+ALIGN function_align
+%%end:
+%endif
+%endmacro
+
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x4
+%ifidn %1_%2, dct_dct
+ vpbroadcastw m0, [cq]
+ vpbroadcastd m1, [o(pw_2896x8)]
+ pmulhrsw m0, m1
+ mov [cq], eobd
+ pmulhrsw m0, m1
+ mova m1, m0
+ jmp m(iadst_4x4_internal_8bpc).end2
+%endif
+%endmacro
+
+%macro IDCT4_1D_PACKED 0
+ vpbroadcastd m4, [o(pd_2048)]
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896
+ paddsw m0, m1, m2 ; out0 out1
+ psubsw m1, m2 ; out3 out2
+%endmacro
+
+%macro IADST4_1D_PACKED 0
+ punpcklwd m4, m1, m0 ; in2 in0
+ punpckhwd m5, m1, m0 ; in3 in1
+.main2:
+ vpbroadcastd m3, [o(pd_2048)]
+ mova m0, m3
+ vpdpwssd m0, m4, [o(pw_3803_1321)] {bcstd}
+ mova m2, m3
+ vpdpwssd m2, m4, [o(pw_m1321_2482)] {bcstd}
+ mova m1, m3
+ vpdpwssd m1, m4, [o(pw_m3344_3344)] {bcstd}
+ vpdpwssd m3, m4, [o(pw_2482_3803)] {bcstd}
+ vpdpwssd m0, m5, [o(pw_2482_3344)] {bcstd}
+ vpdpwssd m2, m5, [o(pw_m3803_3344)] {bcstd}
+ vpdpwssd m1, m5, [o(pd_3344)] {bcstd}
+ vpdpwssd m3, m5, [o(pw_m1321_m3344)] {bcstd}
+ REPX {psrad x, 12}, m0, m2, m1, m3
+ packssdw m0, m2 ; out0 out1
+ packssdw m1, m3 ; out2 out3
+%endmacro
+
+INIT_XMM avx512icl
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
+
+cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ IDCT4_1D_PACKED
+ mova m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+ ITX4_END 0, 1, 3, 2
+
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
+INV_TXFM_4X4_FN adst, identity
+
+cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call .main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+.end2:
+ ITX4_END 0, 1, 2, 3
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
+INV_TXFM_4X4_FN flipadst, identity
+
+cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ call m(iadst_4x4_internal_8bpc).main
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x4_internal_8bpc).main
+.end:
+ pxor ymm16, ymm16
+ mova [cq], ymm16
+.end2:
+ ITX4_END 3, 2, 1, 0
+
+INV_TXFM_4X4_FN identity, dct
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
+
+cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+16*0]
+ mova m1, [cq+16*1]
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x4_internal_8bpc).end
+
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x8
+%ifidn %1_%2, dct_dct
+ movd xmm1, [o(pw_2896x8)]
+ pmulhrsw xmm0, xmm1, [cq]
+ movd xmm2, [o(pw_2048)]
+ pmulhrsw xmm0, xmm1
+ pmulhrsw xmm0, xmm1
+ pmulhrsw xmm0, xmm2
+ vpbroadcastw ym0, xmm0
+ mova ym1, ym0
+ jmp m(iadst_4x8_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT8_1D_PACKED 0
+ punpckhwd m5, m3, m0 ; in7 in1
+ punpckhwd m4, m1, m2 ; in3 in5
+ punpcklwd m3, m1 ; in6 in2
+ punpcklwd m2, m0 ; in4 in0
+.main2:
+ vpbroadcastd m6, [o(pd_2048)]
+ ITX_MUL2X_PACK 5, 0, 1, 6, 799, 4017, 3 ; t4a t7a
+ ITX_MUL2X_PACK 4, 0, 1, 6, 3406, 2276, 3 ; t5a t6a
+ ITX_MUL2X_PACK 3, 0, 1, 6, 1567, 3784 ; t3 t2
+ psubsw m0, m5, m4 ; t5a t6a (interleaved)
+ paddsw m4, m5 ; t4 t7 (interleaved)
+ ITX_MUL2X_PACK 2, 1, 5, 6, 2896, 2896 ; t0 t1
+ ITX_MUL2X_PACK 0, 1, 5, 6, 2896, 2896, 1 ; t6 t5
+%if mmsize > 16
+ vbroadcasti32x4 m1, [o(deint_shuf)]
+ pshufb m4, m1
+%else
+ pshufb m4, [o(deint_shuf)]
+%endif
+ psubsw m1, m2, m3 ; tmp3 tmp2
+ paddsw m3, m2 ; tmp0 tmp1
+ punpckhqdq m2, m4, m0 ; t7 t6
+ punpcklqdq m4, m0 ; t4 t5
+ paddsw m0, m3, m2 ; out0 out1
+ psubsw m3, m2 ; out7 out6
+ psubsw m2, m1, m4 ; out4 out5
+ paddsw m1, m4 ; out3 out2
+%endmacro
+
+%macro IADST8_1D_PACKED 1 ; pass
+ vpbroadcastd m6, [o(pd_2048)]
+%if %1 == 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
+ psubsw m4, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
+%if mmsize > 16
+ vbroadcasti32x4 m2, [o(deint_shuf)]
+%else
+ mova m2, [o(deint_shuf)]
+%endif
+ vprord m1, 16
+ psubsw m3, m0, m1 ; t3 t2
+ paddsw m0, m1 ; -out7 out0
+ psubsw m1, m4, m5 ; t7 t6
+ paddsw m4, m5 ; out6 -out1
+ pshufb m0, m2
+ pshufb m4, m2
+ mova m2, m6
+ vpdpwssd m2, m3, [o(pw_m2896_2896)] {bcstd}
+ mova m5, m6
+ vpdpwssd m5, m1, [o(pw_m2896_2896)] {bcstd}
+ psrad m2, 12
+ psrad m5, 12
+ packssdw m2, m5 ; out4 -out5
+ mova m5, m6
+ vpdpwssd m5, m3, [o(pw_2896_2896)] {bcstd}
+ mova m3, m6
+ vpdpwssd m3, m1, [o(pw_2896_2896)] {bcstd}
+ psrad m5, 12
+ psrad m3, 12
+ packssdw m1, m3, m5 ; out2 -out3
+%else
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+ ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
+ ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
+ ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
+ ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
+ psubsw m4, m0, m2 ; t4 t5
+ paddsw m0, m2 ; t0 t1
+ psubsw m5, m1, m3 ; t6 t7
+ paddsw m1, m3 ; t2 t3
+ shufps m2, m5, m4, q1032
+ punpckhwd m4, m2
+ punpcklwd m5, m2
+ ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784 ; t4a t5a
+ ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 1 ; t6a t7a
+ psubsw m2, m0, m1 ; t2 t3
+ paddsw m0, m1 ; out0 -out7
+ psubsw m1, m4, m5 ; t6 t7
+ paddsw m4, m5 ; -out1 out6
+ vpbroadcastd m5, [o(pw_2896x8)]
+ punpckhqdq m3, m2, m1 ; t3 t7
+ punpcklqdq m2, m1 ; t2 t6
+ paddsw m1, m2, m3 ; t2+t3 t6+t7
+ psubsw m2, m3 ; t2-t3 t6-t7
+ punpckhqdq m3, m4, m0 ; out6 -out7
+ punpcklqdq m0, m4 ; out0 -out1
+ pmulhrsw m2, m5 ; out4 -out5
+ pshufd m1, m1, q1032
+ pmulhrsw m1, m5 ; out2 -out3
+%endif
+%endmacro
+
+INIT_YMM avx512icl
+INV_TXFM_4X8_FN dct, dct
+INV_TXFM_4X8_FN dct, identity
+INV_TXFM_4X8_FN dct, adst
+INV_TXFM_4X8_FN dct, flipadst
+
+cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ IDCT4_1D_PACKED
+ vbroadcasti32x4 m2, [o(deint_shuf)]
+ shufps m3, m0, m1, q1331
+ shufps m0, m0, m1, q0220
+ pshufb m0, m2
+ pshufb m1, m3, m2
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti32x4 m0, m0, xm2, 1
+ vinserti32x4 m1, m1, xm3, 1
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x8_internal_8bpc).end2
+ALIGN function_align
+.main:
+ WRAP_XMM IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_4X8_FN adst, dct
+INV_TXFM_4X8_FN adst, adst
+INV_TXFM_4X8_FN adst, flipadst
+INV_TXFM_4X8_FN adst, identity
+
+cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpckhwd m3, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call .main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vinserti32x4 m0, xm2, 1
+ vinserti32x4 m1, xm3, 1
+ pxor m5, m5
+ psubw m5, m4
+.end:
+ punpcklqdq m4, m5
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ vpbroadcastd m3, strided
+ pmulld m5, m3, [o(pd_0to15)]
+ kxnorb k1, k1, k1
+ kmovb k2, k1
+ vpgatherdd m3{k1}, [dstq+m5]
+ pxor m4, m4
+ mova [cq], zmm20
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpscatterdd [dstq+m5]{k2}, m0
+ RET
+ALIGN function_align
+.main_pass1:
+ punpckhwd xm0, xm4, xm3 ; 0 7
+ punpckhwd xm1, xm5, xm2 ; 2 5
+ punpcklwd xm2, xm5 ; 4 3
+ punpcklwd xm3, xm4 ; 6 1
+ WRAP_XMM IADST8_1D_PACKED 1
+ punpcklqdq xm3, xm4, xm0 ; out6 -out7
+ punpckhqdq xm0, xm4 ; out0 -out1
+ ret
+ALIGN function_align
+.main_pass2:
+ WRAP_XMM IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_4X8_FN flipadst, dct
+INV_TXFM_4X8_FN flipadst, adst
+INV_TXFM_4X8_FN flipadst, flipadst
+INV_TXFM_4X8_FN flipadst, identity
+
+cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120
+ vpermq m1, [cq+32*1], q3120
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ call m(iadst_8x4_internal_8bpc).main
+ punpcklwd m3, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m1, m3
+ punpckhwd m1, m3
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, m0, 1
+ vextracti32x4 xm3, m1, 1
+ pshufd xm4, xm0, q1032
+ pshufd xm5, xm1, q1032
+ call m(iadst_4x8_internal_8bpc).main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vinserti32x4 m3, xm1, 1
+ vinserti32x4 m2, xm0, 1
+ pxor m4, m4
+ psubw m4, m5
+ pshufd m0, m3, q1032
+ pshufd m1, m2, q1032
+ jmp m(iadst_4x8_internal_8bpc).end
+
+INIT_ZMM avx512icl
+INV_TXFM_4X8_FN identity, dct
+INV_TXFM_4X8_FN identity, adst
+INV_TXFM_4X8_FN identity, flipadst
+INV_TXFM_4X8_FN identity, identity
+
+cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m0, [o(pw_2896x8)]
+ pmulhrsw m0, [cq]
+ mova m1, [o(int8_permB)]
+ vpbroadcastd m2, [o(pw_1697x8)]
+ vpermb m0, m1, m0
+ pmulhrsw m2, m0
+ paddsw m0, m2
+ vextracti32x8 ym1, m0, 1
+ jmp tx2q
+.pass2:
+ vpbroadcastd ym4, [o(pw_4096)]
+ jmp m(iadst_4x8_internal_8bpc).end2
+
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 4x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ vpbroadcastw m0, r6d
+ mova m1, m0
+ jmp m(iadst_4x16_internal_8bpc).end3
+%endif
+%endmacro
+
+%macro IDCT16_1D_PACKED 0
+ punpckhwd m8, m7, m0 ; dct16 in15 in1
+ punpcklwd m9, m4, m0 ; dct4 in2 in0
+ punpckhwd m0, m3, m4 ; dct16 in7 in9
+ punpcklwd m7, m1 ; dct8 in7 in1
+ punpckhwd m1, m6 ; dct16 in3 in13
+ punpcklwd m3, m5 ; dct8 in3 in5
+ punpckhwd m5, m2 ; dct16 in11 in5
+ punpcklwd m6, m2 ; dct4 in3 in1
+cglobal_label .main2
+ vpbroadcastd m10, [o(pd_2048)]
+.main3:
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6 ; 0x33...
+ ITX_MUL2X_PACK 8, 2, 4, 10, 401, 4076, 5 ; t8a t15a
+ ITX_MUL2X_PACK 0, 2, 4, 10, 3166, 2598, 5 ; t9a t14a
+ ITX_MUL2X_PACK 5, 2, 4, 10, 1931, 3612, 5 ; t10a t13a
+ ITX_MUL2X_PACK 1, 2, 4, 10, 3920, 1189, 5 ; t11a t12a
+ ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 5 ; t4a t7a
+ ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 5 ; t5a t6a
+.main4:
+ psubsw m2, m8, m0 ; t9 t14
+ paddsw m8, m0 ; t8 t15
+ psubsw m4, m1, m5 ; t10 t13
+ paddsw m1, m5 ; t11 t12
+ ITX_MUL2X_PACK 6, 0, 5, 10, 1567, 3784 ; t3 t2
+ psubsw m0, m8, m1 ; t11a t12a
+ paddsw m8, m1 ; t8a t15a
+ psubsw m1, m7, m3 ; t5a t6a
+ paddsw m7, m3 ; t4 t7
+.main5:
+ ITX_MUL2X_PACK 2, 3, 5, 10, 1567, 3784, 5 ; t9a t14a
+ ITX_MUL2X_PACK 4, 3, 5, 10, m3784, 1567, 5 ; t10a t13a
+%if mmsize > 16
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+%else
+ mova m5, [o(deint_shuf)]
+%endif
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ paddsw m3, m2, m4 ; t9 t14
+ psubsw m2, m4 ; t10 t13
+ pshufb m8, m5
+ pshufb m7, m5
+ pshufb m3, m5
+ ITX_MUL2X_PACK 9, 4, 5, 10, 11, 12 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 5, 10, 12, 11 ; t5 t6
+ ITX_MUL2X_PACK 0, 4, 5, 10, 11, 12, 8 ; t11 t12
+ ITX_MUL2X_PACK 2, 0, 11, 10, 11, 12, 8 ; t10a t13a
+ punpckhqdq m2, m7, m1 ; t7 t6
+ punpcklqdq m7, m1 ; t4 t5
+ psubsw m1, m9, m6 ; dct4 out3 out2
+ paddsw m9, m6 ; dct4 out0 out1
+ packssdw m5, m11 ; t12 t13a
+ packssdw m4, m0 ; t11 t10a
+ punpckhqdq m0, m8, m3 ; t15a t14
+ punpcklqdq m8, m3 ; t8a t9
+ psubsw m3, m9, m2 ; dct8 out7 out6
+ paddsw m9, m2 ; dct8 out0 out1
+ psubsw m2, m1, m7 ; dct8 out4 out5
+ paddsw m1, m7 ; dct8 out3 out2
+ psubsw m7, m9, m0 ; out15 out14
+ paddsw m0, m9 ; out0 out1
+ psubsw m6, m1, m5 ; out12 out13
+ paddsw m1, m5 ; out3 out2
+ psubsw m5, m2, m4 ; out11 out10
+ paddsw m2, m4 ; out4 out5
+ psubsw m4, m3, m8 ; out8 out9
+ paddsw m3, m8 ; out7 out6
+%endmacro
+
+INV_TXFM_4X16_FN dct, dct
+INV_TXFM_4X16_FN dct, identity
+INV_TXFM_4X16_FN dct, adst
+INV_TXFM_4X16_FN dct, flipadst
+
+cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova ym1, [cq+32*2]
+ vinserti32x8 m1, [cq+32*0], 1
+ mova m0, [o(int16_perm)]
+ mova ym2, [cq+32*3]
+ vinserti32x8 m2, [cq+32*1], 1
+ vpbroadcastd m4, [o(pd_2048)]
+ vpermb m1, m0, m1 ; c0 a0 c1 a1 c2 a2 c3 a3
+ vpermb m2, m0, m2 ; d0 b0 d1 b1 d2 b2 d3 b3
+ ITX_MUL2X_PACK 1, 0, 3, 4, 2896, 2896, 2
+ ITX_MUL2X_PACK 2, 0, 3, 4, 1567, 3784, 2
+ vpbroadcastd m4, [o(pw_16384)]
+ psubsw m3, m1, m2
+ paddsw m1, m2 ; out0 out1
+ vprord m3, 16 ; out2 out3
+ punpckldq m0, m1, m3
+ punpckhdq m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ jmp tx2q
+.pass2:
+ vextracti32x4 xm2, ym0, 1
+ vextracti32x4 xm3, ym1, 1
+ vextracti32x4 xm4, m0, 2
+ vextracti32x4 xm5, m1, 2
+ vextracti32x4 xm6, m0, 3
+ vextracti32x4 xm7, m1, 3
+ call .main
+ vinserti32x4 ym0, xm2, 1
+ vinserti32x4 ym1, xm3, 1
+ vinserti32x4 ym4, xm6, 1
+ vinserti32x4 ym5, xm7, 1
+ vinserti32x8 m0, ym4, 1
+ vinserti32x8 m1, ym5, 1
+ vpbroadcastd m5, [o(pw_2048)]
+ pshufd m1, m1, q1032
+ jmp m(iadst_4x16_internal_8bpc).end2
+ALIGN function_align
+.main:
+ WRAP_XMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_4X16_FN adst, dct
+INV_TXFM_4X16_FN adst, adst
+INV_TXFM_4X16_FN adst, flipadst
+INV_TXFM_4X16_FN adst, identity
+
+cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [o(permB)]
+ vpermq m0, m1, [cq+64*0]
+ vpermq m1, m1, [cq+64*1]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m3, [o(pw_16384)]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ pmulhrsw m2, m3
+ pmulhrsw m0, m3
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m5, [o(pw_2048)]
+ psrlq m10, 4
+ psubw m6, m8, m5
+.end:
+ vpbroadcastd m7, [o(pw_2896x8)]
+ paddsw ym1, ym2, ym4
+ psubsw ym2, ym4
+ vinserti32x8 m1, ym2, 1
+ pmulhrsw m1, m7 ; -out7 out4 out6 -out5 out8 -out11 -out9 out10
+ psrlq m0, m10, 4
+ vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d
+ vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f
+ punpcklqdq m5, m6
+.end2:
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+.end3:
+ vpbroadcastd m3, strided
+ pmulld m5, m3, [o(pd_0to15)]
+ kxnorw k1, k1, k1
+ kmovw k2, k1
+ vpgatherdd m3{k1}, [dstq+m5]
+ pxor m4, m4
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ punpcklbw m2, m3, m4
+ punpckhbw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ vpscatterdd [dstq+m5]{k2}, m0
+ RET
+ALIGN function_align
+.main:
+ movu m3, [o(permB+1)]
+ psrlq m10, m3, 4
+.main2:
+ vpermi2q m3, m0, m1 ; in15 in12 in13 in14 in11 in8 in9 in10
+ vpermt2q m0, m10, m1 ; in0 in3 in2 in1 in4 in7 in6 in5
+ vpbroadcastd m9, [o(pd_2048)]
+ vpbroadcastq ym13, [o(int_mshift)]
+ kxnorb k1, k1, k1
+ punpckhwd m4, m3, m0 ; in12 in3 in14 in1
+ punpcklwd m0, m3 ; in0 in15 in2 in13
+ kshiftrb k1, k1, 4
+ vextracti32x8 ym3, m4, 1 ; in8 in7 in10 in5
+ vextracti32x8 ym1, m0, 1 ; in4 in11 in6 in9
+INIT_YMM avx512icl
+ vpcmpub k7, m13, m9, 6 ; 0x33...
+ pxor m8, m8
+ ITX_MUL4X_PACK 0, 2, 5, 6, 7, 9, 201, 4091, 995, 3973, 5
+ ITX_MUL4X_PACK 1, 2, 5, 6, 7, 9, 1751, 3703, 2440, 3290, 5
+ ITX_MUL4X_PACK 3, 2, 5, 6, 7, 9, 3035, 2751, 3513, 2106, 5
+ ITX_MUL4X_PACK 4, 2, 5, 6, 7, 9, 3857, 1380, 4052, 601, 5
+ psubsw m2, m0, m3 ; t9a t8a t11a t10a
+ paddsw m0, m3 ; t1a t0a t3a t2a
+ psubsw m3, m1, m4 ; t13a t12a t15a t14a
+ paddsw m4, m1 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 1, 5, 6, 7, 9, 799, 4017, 3406, 2276, 5
+ psubw m7, m8, m7
+ ITX_MUL2X_PACK 3, 1, 5, 9, 7, 6, 4
+ vpbroadcastd m6, [o(pw_3784_m1567)]
+ vpbroadcastd m6{k1}, [o(pw_m3784_1567)]
+ psubsw m1, m0, m4 ; t5 t4 t7 t6
+ paddsw m0, m4 ; t1 t0 t3 t2
+ psubsw m4, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t4a t5a t7a t6a
+ ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t12 t13 t15 t14
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vshufi32x4 m3, m0, m2, 0x03 ; t3 t2 t11a t10a
+ vinserti32x4 m0, xm2, 1 ; t1 t0 t9a t8a
+ vshufi32x4 m2, m1, m4, 0x03 ; t7a t6a t15 t14
+ vinserti32x4 m1, xm4, 1 ; t4a t5a t12 t13
+ pshufd m2, m2, q1032 ; t6a t7a t14 t15
+ psubsw m4, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m1, m2 ; out12 -out3 -out13 out2
+ psubsw m1, m2 ; t7 t6 t15a t14a
+ punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a
+ punpcklqdq m4, m1 ; t3a t7 t11 t15a
+INIT_ZMM avx512icl
+ vinserti32x8 m3, ym0, 1 ; out12 -out3 -out13 out2 -out15 out0 out14 -out1
+ ret
+
+INV_TXFM_4X16_FN flipadst, dct
+INV_TXFM_4X16_FN flipadst, adst
+INV_TXFM_4X16_FN flipadst, flipadst
+INV_TXFM_4X16_FN flipadst, identity
+
+cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [o(permB)]
+ vpermq m0, m1, [cq+64*0]
+ vpermq m1, m1, [cq+64*1]
+ call m(iadst_16x4_internal_8bpc).main
+ vpbroadcastd m3, [o(pw_16384)]
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ jmp tx2q
+.pass2:
+ call m(iadst_4x16_internal_8bpc).main
+ vpbroadcastd m6, [o(pw_2048)]
+ psrlq m10, 12
+ psubw m5, m8, m6
+ jmp m(iadst_4x16_internal_8bpc).end
+
+INV_TXFM_4X16_FN identity, dct
+INV_TXFM_4X16_FN identity, adst
+INV_TXFM_4X16_FN identity, flipadst
+INV_TXFM_4X16_FN identity, identity
+
+cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m2, [o(int16_perm)]
+ vpermb m1, m2, [cq+64*0]
+ vpermb m2, m2, [cq+64*1]
+ vpbroadcastd m4, [o(pw_1697x8)]
+ vpbroadcastd m0, [o(pd_m1)]
+ pmulhrsw m3, m4, m1 ; we want to do a signed avg, but pavgw is
+ vpcmpw k1, m1, m0, 4 ; unsigned. as long as both signs are equal
+ pmulhrsw m4, m2 ; it still works, but if the input is -1 the
+ vpcmpw k2, m2, m0, 4 ; pmulhrsw result will become 0 which causes
+ vpavgw m1{k1}{z}, m3 ; pavgw to output -32768 instead of 0 unless
+ vpavgw m2{k2}{z}, m4 ; we explicitly deal with that case here.
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x16)]
+ vpbroadcastd m5, [o(pw_2048)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m0
+ paddsw m1, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_4x16_internal_8bpc).end2
+
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r6 ; coefs[1-2], tmp[1-2], off[1-3]
+ movq xm%3, [dstq ]
+ movhps xm%3, [dstq+%5]
+ movq xm%4, [dstq+%6]
+ movhps xm%4, [dstq+%7]
+ pmovzxbw m%3, xm%3
+ pmovzxbw m%4, xm%4
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vextracti32x4 xm%4, m%3, 1
+ movq [dstq ], xm%3
+ movhps [dstq+%6], xm%3
+ movq [dstq+%5], xm%4
+ movhps [dstq+%7], xm%4
+%endmacro
+
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x4
+%ifidn %1_%2, dct_dct
+ movd xm1, [o(pw_2896x8)]
+ pmulhrsw xm0, xm1, [cq]
+ movd xm2, [o(pw_2048)]
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm1
+ pmulhrsw xm0, xm2
+ vpbroadcastw m0, xm0
+ mova m1, m0
+ jmp m(iadst_8x4_internal_8bpc).end3
+%endif
+%endmacro
+
+INIT_YMM avx512icl
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
+
+cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm3, [o(pw_2896x8)]
+ pmulhrsw xm0, xm3, [cq+16*0]
+ pmulhrsw xm1, xm3, [cq+16*1]
+ pmulhrsw xm2, xm3, [cq+16*2]
+ pmulhrsw xm3, [cq+16*3]
+ call m(idct_4x8_internal_8bpc).main
+ vbroadcasti32x4 m4, [o(deint_shuf)]
+ vinserti32x4 m3, m1, xm3, 1
+ vinserti32x4 m1, m0, xm2, 1
+ shufps m0, m1, m3, q0220
+ shufps m1, m3, q1331
+ pshufb m0, m4
+ pshufb m1, m4
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN adst, dct
+INV_TXFM_8X4_FN adst, adst
+INV_TXFM_8X4_FN adst, flipadst
+INV_TXFM_8X4_FN adst, identity
+
+cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti32x4 m0, xm2, 1
+ vinserti32x4 m1, xm3, 1
+ pxor m3, m3
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ psubsw m3, m2
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call .main
+.end:
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q3120
+.end2:
+ vpbroadcastd m2, [o(pw_2048)]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+.end3:
+ pxor m2, m2
+ mova [cq], zmm18
+ lea r6, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ RET
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_8X4_FN flipadst, dct
+INV_TXFM_8X4_FN flipadst, adst
+INV_TXFM_8X4_FN flipadst, flipadst
+INV_TXFM_8X4_FN flipadst, identity
+
+cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd xm0, [o(pw_2896x8)]
+ pshufd xm4, [cq+16*0], q1032
+ pmulhrsw xm3, xm0, [cq+16*3]
+ pshufd xm5, [cq+16*1], q1032
+ pmulhrsw xm2, xm0, [cq+16*2]
+ pmulhrsw xm4, xm0
+ pmulhrsw xm5, xm0
+ call m(iadst_4x8_internal_8bpc).main_pass1
+ vinserti32x4 m3, m3, xm1, 1
+ vinserti32x4 m2, m2, xm0, 1
+ punpckhwd m1, m3, m2
+ punpcklwd m3, m2
+ pxor m0, m0
+ psubsw m0, m1
+ punpckhwd m1, m0, m3
+ punpcklwd m0, m3
+ jmp tx2q
+.pass2:
+ call m(iadst_8x4_internal_8bpc).main
+ mova m2, m1
+ vpermq m1, m0, q2031
+ vpermq m0, m2, q2031
+ jmp m(iadst_8x4_internal_8bpc).end2
+
+INV_TXFM_8X4_FN identity, dct
+INV_TXFM_8X4_FN identity, adst
+INV_TXFM_8X4_FN identity, flipadst
+INV_TXFM_8X4_FN identity, identity
+
+cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm2, [cq+16*0]
+ mova xm0, [cq+16*1]
+ vinserti32x4 m2, [cq+16*2], 1
+ vinserti32x4 m0, [cq+16*3], 1
+ vpbroadcastd m3, [o(pw_2896x8)]
+ punpcklwd m1, m2, m0
+ punpckhwd m2, m0
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ paddsw m0, m0
+ paddsw m1, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ jmp m(iadst_8x4_internal_8bpc).end
+
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x8
+%ifidn %1_%2, dct_dct
+INIT_ZMM avx512icl
+ movsx r6d, word [cq]
+ mov [cq], eobd
+.dconly:
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+.dconly2:
+ vpbroadcastd ym2, strided
+ imul r6d, 181
+ pmulld ym5, ym2, [o(pd_0to15)]
+ kxnorb k1, k1, k1
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m3, m3
+ vpbroadcastw m4, r6d
+.dconly_loop:
+ kmovb k2, k1
+ vpgatherdq m2{k1}, [dstq+ym5]
+ punpcklbw m0, m2, m3
+ punpckhbw m1, m2, m3
+ paddw m0, m4
+ paddw m1, m4
+ packuswb m0, m1
+ kmovb k1, k2
+ vpscatterdq [dstq+ym5]{k2}, m0
+ lea dstq, [dstq+strideq*8]
+ sub r3d, 8
+ jg .dconly_loop
+ RET
+INIT_YMM avx512icl
+%endif
+%endmacro
+
+INV_TXFM_8X8_FN dct, dct
+INV_TXFM_8X8_FN dct, identity
+INV_TXFM_8X8_FN dct, adst
+INV_TXFM_8X8_FN dct, flipadst
+
+cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m0, [cq+32*0], q3120 ; 0 1
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ vpermq m1, [cq+32*1], q3120 ; 2 3
+ call .main
+ shufps m4, m0, m1, q0220
+ shufps m5, m0, m1, q1331
+ shufps m1, m2, m3, q0220
+ shufps m3, m2, m3, q1331
+ vbroadcasti32x4 m0, [o(deint_shuf)]
+ vpbroadcastd m2, [o(pw_16384)]
+ REPX {pshufb x, m0}, m4, m5, m1, m3
+ REPX {pmulhrsw x, m2}, m4, m5, m1, m3
+ vinserti32x4 m0, m4, xm1, 1
+ vshufi32x4 m2, m4, m1, 0x03
+ vinserti32x4 m1, m5, xm3, 1
+ vshufi32x4 m3, m5, m3, 0x03
+ jmp tx2q
+.pass2:
+ call .main
+ vpbroadcastd m4, [o(pw_2048)]
+ vpermq m0, m0, q3120
+ vpermq m1, m1, q2031
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q2031
+ jmp m(iadst_8x8_internal_8bpc).end2
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_8X8_FN adst, dct
+INV_TXFM_8X8_FN adst, adst
+INV_TXFM_8X8_FN adst, flipadst
+INV_TXFM_8X8_FN adst, identity
+
+cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call .main_pass1
+ vpbroadcastd m5, [o(pw_16384_m16384)]
+ punpcklwd m4, m0, m1
+ punpckhwd m0, m1
+ punpcklwd m1, m2, m3
+ punpckhwd m2, m3
+ punpcklwd m3, m4, m0
+ punpckhwd m4, m0
+ punpcklwd m0, m1, m2
+ punpckhwd m1, m2
+ REPX {pmulhrsw x, m5}, m3, m4, m0, m1
+ vshufi32x4 m2, m3, m0, 0x03
+ vinserti32x4 m0, m3, xm0, 1
+ vshufi32x4 m3, m4, m1, 0x03
+ vinserti32x4 m1, m4, xm1, 1
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call .main_pass2
+ vpbroadcastd m5, [o(pw_2048)]
+ vpbroadcastd xm4, [o(pw_4096)]
+ psubw m4, m5 ; lower half = 2048, upper half = -2048
+.end:
+ REPX {vpermq x, x, q3120}, m0, m1, m2, m3
+.end2:
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+.end3:
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+.end4:
+ pxor m4, m4
+ mova [cq+32*0], m4
+ mova [cq+32*1], m4
+ mova [cq+32*2], m4
+ mova [cq+32*3], m4
+ lea r6, [strideq*3]
+ WRITE_8X4 0, 1, 4, 5
+ lea dstq, [dstq+strideq*4]
+ WRITE_8X4 2, 3, 4, 5
+ RET
+ALIGN function_align
+.main_pass1:
+ punpckhwd m0, m4, m3 ; 0 7
+ punpckhwd m1, m5, m2 ; 2 5
+ punpcklwd m2, m5 ; 4 3
+ punpcklwd m3, m4 ; 6 1
+ IADST8_1D_PACKED 1
+ punpcklqdq m3, m4, m0 ; out6 -out7
+ punpckhqdq m0, m4 ; out0 -out1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ ret
+
+INV_TXFM_8X8_FN flipadst, dct
+INV_TXFM_8X8_FN flipadst, adst
+INV_TXFM_8X8_FN flipadst, flipadst
+INV_TXFM_8X8_FN flipadst, identity
+
+cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpermq m4, [cq+32*0], q1302 ; 1 0
+ vpermq m3, [cq+32*3], q3120 ; 6 7
+ vpermq m5, [cq+32*1], q1302 ; 3 2
+ vpermq m2, [cq+32*2], q3120 ; 4 5
+ call m(iadst_8x8_internal_8bpc).main_pass1
+ vpbroadcastd m5, [o(pw_m16384_16384)]
+ punpckhwd m4, m3, m2
+ punpcklwd m3, m2
+ punpckhwd m2, m1, m0
+ punpcklwd m1, m0
+ punpckhwd m0, m4, m3
+ punpcklwd m4, m3
+ punpckhwd m3, m2, m1
+ punpcklwd m2, m1
+ REPX {pmulhrsw x, m5}, m0, m4, m3, m2
+ vinserti32x4 m1, m0, xm3, 1
+ vshufi32x4 m3, m0, m3, 0x03
+ vinserti32x4 m0, m4, xm2, 1
+ vshufi32x4 m2, m4, m2, 0x03
+ jmp tx2q
+.pass2:
+ pshufd m4, m0, q1032
+ pshufd m5, m1, q1032
+ call m(iadst_8x8_internal_8bpc).main_pass2
+ vpbroadcastd m4, [o(pw_2048)]
+ vpbroadcastd xm5, [o(pw_4096)]
+ psubw m4, m5 ; lower half = -2048, upper half = 2048
+ vpermq m5, m3, q2031
+ vpermq m3, m0, q2031
+ vpermq m0, m2, q2031
+ vpermq m2, m1, q2031
+ pmulhrsw m1, m0, m4
+ pmulhrsw m0, m5, m4
+ jmp m(iadst_8x8_internal_8bpc).end3
+
+INV_TXFM_8X8_FN identity, dct
+INV_TXFM_8X8_FN identity, adst
+INV_TXFM_8X8_FN identity, flipadst
+INV_TXFM_8X8_FN identity, identity
+
+cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm3, [cq+16*0]
+ mova xm2, [cq+16*1]
+ vinserti32x4 m3, [cq+16*4], 1
+ vinserti32x4 m2, [cq+16*5], 1
+ mova xm4, [cq+16*2]
+ mova xm0, [cq+16*3]
+ vinserti32x4 m4, [cq+16*6], 1
+ vinserti32x4 m0, [cq+16*7], 1
+ punpcklwd m1, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m4, m0
+ punpckhwd m4, m0
+ punpckldq m0, m1, m2
+ punpckhdq m1, m2
+ punpckldq m2, m3, m4
+ punpckhdq m3, m4
+ jmp tx2q
+.pass2:
+ vpbroadcastd m4, [o(pw_4096)]
+ jmp m(iadst_8x8_internal_8bpc).end
+
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 8x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ imul r6d, 181
+ mov r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly
+%endif
+%endmacro
+
+%macro ITX_8X16_LOAD_COEFS 0
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m0, m4, [cq+32*0]
+ add cq, 32*4
+ pmulhrsw m7, m4, [cq+32*3]
+ pmulhrsw m1, m4, [cq-32*3]
+ pmulhrsw m6, m4, [cq+32*2]
+ pmulhrsw m2, m4, [cq-32*2]
+ pmulhrsw m5, m4, [cq+32*1]
+ pmulhrsw m3, m4, [cq-32*1]
+ pmulhrsw m4, [cq+32*0]
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_8X16_FN dct, dct
+INV_TXFM_8X16_FN dct, identity
+INV_TXFM_8X16_FN dct, adst
+INV_TXFM_8X16_FN dct, flipadst
+
+cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m3, [o(permB)]
+ vpermq m0, m3, [cq+64*0]
+ vpbroadcastd m4, [o(pw_2896x8)]
+ vpermq m1, m3, [cq+64*1]
+ vpermq m2, m3, [cq+64*2]
+ vpermq m3, m3, [cq+64*3]
+ REPX {pmulhrsw x, m4}, m0, m1, m2, m3
+ call m(idct_16x8_internal_8bpc).main
+ vpbroadcastd m5, [o(pw_16384)]
+ punpckhwd m4, m0, m2 ; b0 f0 b1 f1 b2 f2 b3 f3
+ punpcklwd m0, m2 ; a0 e0 a1 e1 a2 e2 a3 e3
+ punpckhwd m2, m1, m3 ; c0 g0 c1 g1 c2 g2 c3 g3
+ punpcklwd m1, m3 ; d0 h0 d1 h1 d2 h2 d3 h3
+ REPX {pmulhrsw x, m5}, m4, m0, m2, m1
+ punpckhwd m3, m0, m4 ; a2 b2 e2 f2 a3 b3 e3 f3
+ punpcklwd m0, m4 ; a0 b0 e0 f0 a1 b1 e1 f1
+ punpckhwd m4, m2, m1 ; c2 d2 g2 h2 c3 d3 g3 h3
+ punpcklwd m2, m1 ; c0 d0 g0 h0 c1 d1 g1 h1
+ punpckhdq m1, m0, m2 ; 1 5 9 13
+ punpckldq m0, m2 ; 0 4 8 12
+ punpckldq m2, m3, m4 ; 2 6 10 14
+ punpckhdq m3, m4 ; 3 7 11 15
+ jmp tx2q
+.pass2:
+ vprord m5, [o(int16_perm)], 16
+ vshufi32x4 m2, m2, q1320 ; 2 10 14 6
+ vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11
+ vshufi32x4 m1, m3, q0132 ; 9 13 7 3
+ vpermb m9, m5, m0
+ vpermb m7, m5, m2
+ vpermb m8, m5, m4
+ vpermb m0, m5, m1
+ vextracti32x8 ym6, m9, 1
+ vextracti32x8 ym3, m7, 1
+ vextracti32x8 ym5, m8, 1
+ vextracti32x8 ym1, m0, 1
+ call .main2
+ mova ym8, [o(gather8a)]
+ lea r3, [dstq+strideq*4]
+ pmovzxdq m9, ym8
+ pshufd ym8, ym8, q1230
+ vpermt2q m0, m9, m4
+ vpermt2q m1, m9, m5
+ vpermt2q m2, m9, m6
+ vpermt2q m3, m9, m7
+.end:
+ vpbroadcastd m7, [o(pw_2048)]
+.end2:
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
+.end3:
+ pmulhrsw m2, m7
+ pmulhrsw m3, m7
+.end4:
+ vpbroadcastd ym6, strided
+ kxnorb k1, k1, k1
+ pxor m4, m4
+ pmulld ym8, ym6
+ kmovb k2, k1
+ vpgatherdq m6{k1}, [dstq+ym8]
+ kmovb k1, k2
+ vpgatherdq m7{k2}, [r3+ym8]
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ kmovb k2, k1
+ mova [cq+64*2], m4
+ mova [cq+64*3], m4
+ punpcklbw m5, m6, m4
+ punpckhbw m6, m4
+ paddw m0, m5
+ paddw m1, m6
+ packuswb m0, m1
+ vpscatterdq [dstq+ym8]{k1}, m0
+ punpcklbw m6, m7, m4
+ punpckhbw m7, m4
+ paddw m2, m6
+ paddw m3, m7
+ packuswb m2, m3
+ vpscatterdq [r3+ym8]{k2}, m2
+ RET
+ALIGN function_align
+cglobal_label .main
+ WRAP_YMM IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_8X16_FN adst, dct
+INV_TXFM_8X16_FN adst, adst
+INV_TXFM_8X16_FN adst, flipadst
+INV_TXFM_8X16_FN adst, identity
+
+cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_8bpc).main_pass1
+ vbroadcasti32x4 m6, [o(int_shuf1)]
+ vpbroadcastd m7, [o(pw_16384_m16384)]
+ punpckhwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpcklwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3
+ pshufb m5, m1, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m2, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
+.pass1_end:
+ REPX {pmulhrsw x, m7}, m3, m5, m4, m2
+ punpckldq m0, m3, m5 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m5 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckhdq m5, m2, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m2, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m5
+ punpckhqdq m3, m5
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ vpbroadcastd m6, [o(pw_2048)]
+ psrlq m10, 4
+ psubw m7, m8, m6
+.pass2_end:
+ vpbroadcastd m5, [o(pw_2896x8)]
+ paddsw m1, m2, m4
+ psubsw m2, m4
+ pmulhrsw m1, m5 ; -out7 out4 out6 -out5
+ pmulhrsw m5, m2 ; out8 -out11 -out9 out10
+ mova ym8, [o(gather8c)]
+ lea r3, [dstq+strideq]
+ psrlq m2, m10, 4
+ vpermi2q m2, m0, m3 ; 1 3 13 15
+ vpermt2q m0, m10, m3 ; 0 2 12 14
+ psrlq m3, m10, 8
+ vpermi2q m3, m1, m5 ; 5 7 9 11
+ psrlq m10, 12
+ vpermt2q m1, m10, m5 ; 4 6 8 10
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ jmp m(idct_8x16_internal_8bpc).end3
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m2, [o(pw_2896x8)]
+ pmulhrsw m5, m2, [cq+64*0]
+ pmulhrsw m3, m2, [cq+64*3]
+ pmulhrsw m1, m2, [cq+64*1]
+ pmulhrsw m2, [cq+64*2]
+ movu m4, [o(permA+3)]
+ psrlq m10, m4, 4
+ mova m6, m4
+ vpermi2q m4, m5, m3 ; in0 in12 in2 in14
+ vpermt2q m5, m10, m3 ; in15 in3 in13 in1
+ vpermi2q m6, m1, m2 ; in4 in8 in6 in10
+ vpermt2q m1, m10, m2 ; in11 in7 in9 in5
+ jmp .main
+ALIGN function_align
+.main_pass2:
+ mova m4, [o(permC)]
+ psrlq m5, m4, 4
+ vpermi2q m4, m0, m2 ; in0 in12 in2 in14
+ psrlq m6, m5, 4
+ vpermi2q m5, m1, m3 ; in15 in3 in13 in1
+ psrlq m10, m6, 4
+ vpermi2q m6, m0, m2 ; in4 in8 in6 in10
+ vpermt2q m1, m10, m3 ; in11 in7 in9 in5
+.main:
+ punpcklwd m0, m4, m5 ; in0 in15 in2 in13
+ punpckhwd m4, m5 ; in12 in3 in14 in1
+ punpcklwd m5, m6, m1 ; in4 in11 in6 in9
+ punpckhwd m6, m1 ; in8 in7 in10 in5
+cglobal_label .main2
+ vpbroadcastd m9, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ kxnorb k1, k1, k1
+ vpcmpub k7, m13, m9, 6 ; 0x33...
+ pxor m8, m8
+ ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5
+ ITX_MUL4X_PACK 6, 1, 2, 3, 7, 9, 3035, 2751, 3513, 2106, 5
+ ITX_MUL4X_PACK 4, 1, 2, 3, 7, 9, 3857, 1380, 4052, 601, 5
+ ITX_MUL4X_PACK 5, 1, 2, 3, 7, 9, 1751, 3703, 2440, 3290, 5
+ psubsw m2, m0, m6 ; t9a t8a t11a t10a
+ paddsw m0, m6 ; t1a t0a t3a t2a
+ psubsw m3, m5, m4 ; t13a t12a t15a t14a
+ paddsw m5, m4 ; t5a t4a t7a t6a
+ ITX_MUL4X_PACK 2, 4, 1, 6, 7, 9, 799, 4017, 3406, 2276, 5
+ psubw m7, m8, m7
+ ITX_MUL2X_PACK 3, 4, 1, 9, 7, 6, 4
+ vpbroadcastd m6, [o(pw_3784_m1567)]
+ vpbroadcastd m6{k1}, [o(pw_m3784_1567)]
+ psubsw m1, m0, m5 ; t5 t4 t7 t6
+ paddsw m0, m5 ; t1 t0 t3 t2
+ psubsw m4, m2, m3 ; t13a t12a t15a t14a
+ paddsw m2, m3 ; t9a t8a t11a t10a
+ ITX_MUL2X_PACK 1, 3, 5, 9, 1567_3784, 6, 16 ; t5a t4a t6a t7a
+ ITX_MUL2X_PACK 4, 3, 5, 9, 1567_3784, 6, 16 ; t13 t12 t14 t15
+ vbroadcasti32x4 m5, [o(deint_shuf)]
+ pshufb m0, m5
+ pshufb m2, m5
+ vshufi32x4 m3, m0, m2, q3232 ; t3 t2 t11a t10a
+ vinserti32x8 m0, ym2, 1 ; t1 t0 t9a t8a
+ vshufi32x4 m2, m1, m4, q3232 ; t6a t7a t14 t15
+ vinserti32x8 m1, ym4, 1 ; t5a t4a t13 t12
+ pshufd m2, m2, q1032 ; t7a t6a t15 t14
+ psubsw m4, m0, m3 ; t3a t2a t11 t10
+ paddsw m0, m3 ; -out15 out0 out14 -out1
+ paddsw m3, m1, m2 ; out12 -out3 -out13 out2
+ psubsw m1, m2 ; t7 t6 t15a t14a
+ punpckhqdq m2, m4, m1 ; t2a t6 t10 t14a
+ punpcklqdq m4, m1 ; t3a t7 t11 t15a
+ ret
+
+INV_TXFM_8X16_FN flipadst, dct
+INV_TXFM_8X16_FN flipadst, adst
+INV_TXFM_8X16_FN flipadst, flipadst
+INV_TXFM_8X16_FN flipadst, identity
+
+cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x8_internal_8bpc).main_pass1
+ vbroadcasti32x4 m6, [o(int_shuf2)]
+ vpbroadcastd m7, [o(pw_m16384_16384)]
+ punpcklwd m3, m0, m4 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m4, m0 ; g0 h0 g1 h1 g2 h2 g3 h3
+ pshufb m5, m2, m6 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
+ jmp m(iadst_8x16_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_8x16_internal_8bpc).main_pass2
+ vpbroadcastd m7, [o(pw_2048)]
+ psrlq m10, 36
+ psubw m6, m8, m7
+ jmp m(iadst_8x16_internal_8bpc).pass2_end
+
+INV_TXFM_8X16_FN identity, dct
+INV_TXFM_8X16_FN identity, adst
+INV_TXFM_8X16_FN identity, flipadst
+INV_TXFM_8X16_FN identity, identity
+
+cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [o(int16_perm)]
+ vpermb m3, m0, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
+ vpermb m2, m0, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
+ vpermb m4, m0, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
+ vpermb m0, m0, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
+ vpbroadcastd m5, [o(pw_2896x8)]
+ punpckldq m1, m3, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m2, m4, m0 ; e0 f0 g0 h0 a1 f1 g1 h1
+ punpckhdq m4, m0 ; e2 f2 g2 h2 e3 f3 g3 h3
+ REPX {pmulhrsw x, m5}, m1, m2, m3, m4
+ punpcklqdq m0, m1, m2 ; a0 b0 c0 d0 e0 f0 g0 h0
+ punpckhqdq m1, m2 ; a1 b1 c1 d1 e1 f1 g1 h1
+ punpcklqdq m2, m3, m4 ; a2 b2 c2 d2 e2 f2 g2 h2
+ punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3
+ jmp tx2q
+.pass2:
+ vpbroadcastd m7, [o(pw_1697x16)]
+ mova ym8, [o(gather8b)]
+ lea r3, [dstq+strideq*2]
+ pmulhrsw m4, m7, m0
+ pmulhrsw m5, m7, m1
+ pmulhrsw m6, m7, m2
+ pmulhrsw m7, m3
+ REPX {paddsw x, x}, m0, m1, m2, m3
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m2, m6
+ paddsw m3, m7
+ jmp m(idct_8x16_internal_8bpc).end
+
+%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
+ pmovzxbw m%3, [dstq+%5]
+%ifnum %1
+ paddw m%3, m%1
+%else
+ paddw m%3, %1
+%endif
+ pmovzxbw m%4, [dstq+%6]
+%ifnum %2
+ paddw m%4, m%2
+%else
+ paddw m%4, %2
+%endif
+ packuswb m%3, m%4
+ vpermq m%3, m%3, q3120
+ mova [dstq+%5], xm%3
+ vextracti32x4 [dstq+%6], m%3, 1
+%endmacro
+
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x4
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly2
+%endif
+%endmacro
+
+INIT_ZMM avx512icl
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
+
+cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova xm0, [cq+16*0]
+ mova xm1, [cq+16*1]
+ mova xm2, [cq+16*2]
+ mova xm3, [cq+16*3]
+ mova xm4, [cq+16*4]
+ mova xm5, [cq+16*5]
+ mova xm6, [cq+16*6]
+ mova xm7, [cq+16*7]
+ call m(idct_4x16_internal_8bpc).main
+ vpbroadcastd m8, [o(pw_16384)]
+ vinserti32x4 ym1, xm3, 1 ; 3 2 7 6
+ vinserti32x4 ym5, xm7, 1 ; b a f e
+ vinserti32x4 ym0, xm2, 1 ; 0 1 4 5
+ vinserti32x4 ym4, xm6, 1 ; 8 9 c d
+ vinserti32x8 m1, ym5, 1 ; 3 2 7 6 b a f e
+ vinserti32x8 m0, ym4, 1 ; 0 1 4 5 8 9 c d
+ pmulhrsw m1, m8
+ pmulhrsw m0, m8
+ pshufd m1, m1, q1032
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ jmp tx2q
+.pass2:
+ IDCT4_1D_PACKED
+ mova m2, [o(permA)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+INV_TXFM_16X4_FN adst, dct
+INV_TXFM_16X4_FN adst, adst
+INV_TXFM_16X4_FN adst, flipadst
+INV_TXFM_16X4_FN adst, identity
+
+cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+64*0]
+ mova m1, [cq+64*1]
+ movshdup m3, [o(permB)]
+ psrlq m10, m3, 4
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m6, [o(pw_16384_m16384)]
+ psrlq m0, m10, 4
+ psrlq m10, 8
+.pass1_end:
+ punpcklwd ym5, ym4, ym2
+ punpckhwd ym4, ym2
+ vinserti32x8 m5, ym4, 1
+ mova m1, m9
+ vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16}
+ mova m4, m9
+ vpdpwssd m4, m5, [o(pw_2896_2896)] {1to16}
+ psrad m1, 12
+ psrad m4, 12
+ packssdw m1, m4 ; out8 -out7 -out9 out6 -out11 out4 out10 -out5
+ vpermi2q m0, m1, m3 ; 0 1 4 5 8 9 c d
+ vpermt2q m1, m10, m3 ; 2 3 6 7 a b e f
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m0, m2
+ punpcklwd m0, m2
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ jmp tx2q
+.pass2:
+ call .main
+ movu m2, [o(permA+1)]
+.end:
+ vpbroadcastd m3, [o(pw_2048)]
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+.end2:
+ psrlq m3, m2, 4
+ vpermi2q m2, m0, m1
+ vpermi2q m3, m0, m1
+.end3:
+ lea r3, [dstq+strideq*2]
+ mova xm1, [dstq+strideq*0]
+ vinserti32x4 ym1, [dstq+strideq*1], 1
+ vinserti32x4 m1, [r3 +strideq*0], 2
+ vinserti32x4 m1, [r3 +strideq*1], 3
+ pxor m4, m4
+ mova [cq+64*0], m4
+ mova [cq+64*1], m4
+ punpcklbw m0, m1, m4
+ punpckhbw m1, m4
+ paddw m0, m2
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [r3 +strideq*0], m0, 2
+ vextracti32x4 [r3 +strideq*1], m0, 3
+ RET
+ALIGN function_align
+.main:
+ IADST4_1D_PACKED
+ ret
+
+INV_TXFM_16X4_FN flipadst, dct
+INV_TXFM_16X4_FN flipadst, adst
+INV_TXFM_16X4_FN flipadst, flipadst
+INV_TXFM_16X4_FN flipadst, identity
+
+cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m0, [cq+64*0]
+ mova m1, [cq+64*1]
+ movshdup m3, [o(permB)]
+ psrlq m10, m3, 4
+ call m(iadst_4x16_internal_8bpc).main2
+ vpbroadcastd m6, [o(pw_m16384_16384)]
+ psrlq m0, m10, 12
+ psrlq m10, 16
+ jmp m(iadst_16x4_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_16x4_internal_8bpc).main
+ movu m2, [o(permA+2)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+INV_TXFM_16X4_FN identity, dct
+INV_TXFM_16X4_FN identity, adst
+INV_TXFM_16X4_FN identity, flipadst
+INV_TXFM_16X4_FN identity, identity
+
+cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m1, [cq+64*0]
+ mova m2, [cq+64*1]
+ vpbroadcastd m3, [o(pw_1697x16)]
+ vpbroadcastd m4, [o(pw_16384)]
+ mova m5, [o(idtx_16x4p)]
+ shufps m0, m1, m2, q2020
+ shufps m1, m2, q3131
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ pmulhrsw m2, m4
+ pmulhrsw m3, m4
+ paddsw m0, m2
+ paddsw m1, m3
+ vpermb m0, m5, m0
+ vpermb m1, m5, m1
+ jmp tx2q
+.pass2:
+ vpbroadcastd m3, [o(pw_1697x8)]
+ pmulhrsw m2, m3, m0
+ pmulhrsw m3, m1
+ paddsw m0, m2
+ paddsw m1, m3
+ movu m2, [o(permA+1)]
+ jmp m(iadst_16x4_internal_8bpc).end
+
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x8
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ mov r3d, 8
+.dconly:
+ imul r6d, 181
+ add r6d, 128
+ sar r6d, 8
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+.dconly3:
+ imul r6d, 181
+ lea r2, [strideq*3]
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova xm1, [dstq+strideq*0]
+ vinserti32x4 ym1, [dstq+strideq*1], 1
+ vinserti32x4 m1, [dstq+strideq*2], 2
+ vinserti32x4 m1, [dstq+r2 ], 3
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [dstq+strideq*2], m0, 2
+ vextracti32x4 [dstq+r2 ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub r3d, 4
+ jg .dconly_loop
+ RET
+%endif
+%endmacro
+
+%macro ITX_16X8_LOAD_COEFS 1 ; shuf_odd
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m0, [cq+32*0], q3120
+ add cq, 32*4
+ vpermq m7, [cq+32*3], q%1
+ vpermq m1, [cq-32*3], q%1
+ vpermq m6, [cq+32*2], q3120
+ vpermq m2, [cq-32*2], q3120
+ vpermq m5, [cq+32*1], q%1
+ vpermq m3, [cq-32*1], q%1
+ vpermq m4, [cq+32*0], q3120
+ REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
+%endmacro
+
+INV_TXFM_16X8_FN dct, dct
+INV_TXFM_16X8_FN dct, identity
+INV_TXFM_16X8_FN dct, adst
+INV_TXFM_16X8_FN dct, flipadst
+
+cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m1, [o(pw_2896x8)]
+ vpermq m0, [cq+64*0], q3120
+ vpermq m2, [cq+64*1], q3120
+ vpermq m4, [cq+64*2], q3120
+ vpermq m6, [cq+64*3], q3120
+ REPX {pmulhrsw x, m1}, m0, m2, m4, m6
+ vextracti32x8 ym1, m0, 1
+ vextracti32x8 ym3, m2, 1
+ vextracti32x8 ym5, m4, 1
+ vextracti32x8 ym7, m6, 1
+ call m(idct_8x16_internal_8bpc).main
+ vbroadcasti32x4 m8, [o(int_shuf1)]
+ vbroadcasti32x4 m9, [o(int_shuf2)]
+ vinserti32x8 m0, ym2, 1 ; a0 a1 a2 a3 b0 b1 b2 b3
+ vinserti32x8 m1, ym3, 1 ; d0 d1 d2 d3 c0 c1 c2 c3
+ vinserti32x8 m4, ym6, 1 ; i0 i1 i2 i3 j0 j1 j2 j3
+ vinserti32x8 m5, ym7, 1 ; l0 l1 l2 l3 k0 k1 k2 k3
+ vpbroadcastd m2, [o(pw_16384)]
+ pshufb m0, m8 ; a0 b0 a1 b1 a2 b2 a3 b3
+ pshufb m1, m9 ; c0 d0 c1 d1 c2 d2 c3 d3
+ pshufb m6, m4, m8 ; i0 j0 i1 j1 i2 j2 i3 j3
+ pshufb m7, m5, m9 ; m0 n0 m1 n1 m2 n2 m3 n3
+ REPX {pmulhrsw x, m2}, m0, m1, m6, m7
+ punpckldq m2, m0, m1 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m3, m0, m1 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m4, m6, m7 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
+ jmp tx2q
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020 ; 0 1
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020 ; 2 3
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ call .main
+ movshdup m4, [o(permC)]
+ psrlq m6, m4, 4
+ vpermq m5, m4, q1032
+ vpermi2q m4, m0, m2 ; a2 a3 b2 b3 e2 e3 f2 f3
+ vpermt2q m0, m6, m2 ; a0 a1 b0 b1 e0 e1 f0 f1
+ psrlq m6, m5, 4
+ vpermi2q m5, m1, m3 ; c2 c3 d2 d3 g2 g3 h2 h3
+ vpermt2q m1, m6, m3 ; c0 c1 d0 d1 g0 g1 h0 h1
+ vpbroadcastd m6, [o(pw_2048)]
+.end:
+ REPX {pmulhrsw x, m6}, m0, m4, m1, m5
+.end2:
+ lea r3, [dstq+strideq*4]
+ lea r4, [strideq*3]
+ mova xm3, [dstq+strideq*0]
+ mova xm6, [dstq+strideq*2]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ vinserti32x4 ym6, [dstq+r4 ], 1
+ vinserti32x4 m3, [r3 +strideq*0], 2
+ vinserti32x4 m6, [r3 +strideq*2], 2
+ vinserti32x4 m3, [r3 +strideq*1], 3
+ vinserti32x4 m6, [r3 +r4 ], 3
+ pxor m7, m7
+ mova [cq+64*0], m7
+ mova [cq+64*1], m7
+ mova [cq+64*2], m7
+ mova [cq+64*3], m7
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m4, m3
+ packuswb m0, m4
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ vextracti32x4 [r3 +strideq*0], m0, 2
+ vextracti32x4 [r3 +strideq*1], m0, 3
+ punpcklbw m3, m6, m7
+ punpckhbw m6, m7
+ paddw m1, m3
+ paddw m5, m6
+ packuswb m1, m5
+ mova [dstq+strideq*2], xm1
+ vextracti32x4 [dstq+r4 ], ym1, 1
+ vextracti32x4 [r3 +strideq*2], m1, 2
+ vextracti32x4 [r3 +r4 ], m1, 3
+ RET
+ALIGN function_align
+cglobal_label .main
+ IDCT8_1D_PACKED
+ ret
+
+INV_TXFM_16X8_FN adst, dct
+INV_TXFM_16X8_FN adst, adst
+INV_TXFM_16X8_FN adst, flipadst
+INV_TXFM_16X8_FN adst, identity
+
+cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_8x16_internal_8bpc).main_pass1
+ vpbroadcastd m7, [o(pw_16384_m16384)]
+ psrlq m10, 4
+.pass1_end:
+ punpcklwd m5, m4, m2
+ punpckhwd m4, m2
+ mova m1, m9
+ vpdpwssd m1, m5, [o(pw_m2896_2896)] {1to16}
+ mova m6, m9
+ vpdpwssd m6, m5, [o(pw_2896_2896)] {1to16}
+ mova m2, m9
+ vpdpwssd m2, m4, [o(pw_m2896_2896)] {1to16}
+ vpdpwssd m9, m4, [o(pw_2896_2896)] {1to16}
+ psrad m1, 12
+ psrad m6, 12
+ packssdw m1, m6 ; out8 -out7 -out9 out6
+ psrad m2, 12
+ psrad m9, 12
+ packssdw m2, m9 ; -out11 out4 out10 -out5
+ psrlq m4, m10, 4
+ vpermi2q m4, m0, m2
+ vpermt2q m0, m10, m2
+ psrlq m5, m10, 8
+ vpermi2q m5, m1, m3
+ psrlq m10, 12
+ vpermt2q m1, m10, m3
+ punpcklwd m3, m4, m5 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpckhwd m4, m5 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpcklwd m5, m1, m0 ; i0 k0 i1 k1 2i k2 i3 k3
+ punpckhwd m1, m0 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpcklwd m2, m3, m4 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhwd m3, m4 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m4, m5, m1 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m5, m1 ; i2 j2 k2 l2 i3 j3 k3 l3
+ REPX {pmulhrsw x, m7}, m2, m3, m4, m5
+ jmp tx2q
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ pshufd m4, m0, q1032 ; 1 0
+ pshufd m5, m1, q1032 ; 3 2
+ call .main_pass2
+ movshdup m4, [o(permC)]
+ pmulhrsw m0, m6
+ pmulhrsw m1, m6
+ psrlq m6, m4, 4
+ mova m5, m4
+ vpermi2q m4, m0, m2
+ vpermt2q m0, m6, m2
+ vpermi2q m5, m1, m3
+ vpermt2q m1, m6, m3
+ jmp m(idct_16x8_internal_8bpc).end2
+ALIGN function_align
+.main_pass1:
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m3, m4, [cq+64*0]
+ pmulhrsw m1, m4, [cq+64*3]
+ pmulhrsw m2, m4, [cq+64*1]
+ pmulhrsw m4, [cq+64*2]
+ mova m5, [o(int16_perm)]
+ kxnorb k1, k1, k1
+ vpblendmd m0{k1}, m1, m3 ; 0 7
+ vmovdqa32 m3{k1}, m1 ; 6 1
+ vpblendmd m1{k1}, m4, m2 ; 2 5
+ vmovdqa32 m2{k1}, m4 ; 4 3
+ REPX {vpermb x, m5, x}, m0, m1, m2, m3
+ IADST8_1D_PACKED 1
+ ret
+ALIGN function_align
+cglobal_label .main_pass2
+ IADST8_1D_PACKED 2
+ pxor m5, m5
+ psubd m5, m6
+ packssdw m6, m5
+ pmulhrsw m2, m6
+ pmulhrsw m3, m6
+ ret
+
+INV_TXFM_16X8_FN flipadst, dct
+INV_TXFM_16X8_FN flipadst, adst
+INV_TXFM_16X8_FN flipadst, flipadst
+INV_TXFM_16X8_FN flipadst, identity
+
+cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_8x16_internal_8bpc).main_pass1
+ vpbroadcastd m7, [o(pw_m16384_16384)]
+ psrlq m10, 20
+ jmp m(iadst_16x8_internal_8bpc).pass1_end
+.pass2:
+ vshufi32x4 m0, m2, m4, q2020
+ vshufi32x4 m2, m4, q3131 ; 4 5
+ vshufi32x4 m1, m3, m5, q2020
+ vshufi32x4 m3, m5, q3131 ; 6 7
+ pshufd m4, m0, q1032 ; 1 0
+ pshufd m5, m1, q1032 ; 3 2
+ call m(iadst_16x8_internal_8bpc).main_pass2
+ movshdup m4, [o(permC)]
+ pmulhrsw m5, m6, m0
+ pmulhrsw m0, m6, m1
+ psrlq m1, m4, 12
+ psrlq m4, 8
+ mova m7, m4
+ vpermi2q m4, m0, m3
+ vpermt2q m0, m1, m3
+ vpermi2q m1, m5, m2
+ vpermt2q m5, m7, m2
+ jmp m(idct_16x8_internal_8bpc).end2
+
+INV_TXFM_16X8_FN identity, dct
+INV_TXFM_16X8_FN identity, adst
+INV_TXFM_16X8_FN identity, flipadst
+INV_TXFM_16X8_FN identity, identity
+
+cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ vpbroadcastd m0, [o(pw_2896x8)]
+ pmulhrsw m3, m0, [cq+64*0]
+ pmulhrsw m4, m0, [cq+64*1]
+ pmulhrsw m5, m0, [cq+64*2]
+ pmulhrsw m0, [cq+64*3]
+ vpbroadcastd m7, [o(pw_1697x16)]
+ vpbroadcastd m8, [o(pw_16384)]
+ shufps m2, m3, m4, q2020 ; a0 a1 a4 a5 e0 e1 e4 e5
+ shufps m3, m4, q3131 ; a2 a3 a6 a7 e2 e3 e6 e7
+ shufps m4, m5, m0, q2020 ; i0 i1 i4 i5 m0 m1 m4 m5
+ shufps m5, m0, q3131 ; i2 i3 i6 i7 m2 m3 m6 m7
+ mova m9, [o(int8_permA)]
+ pmulhrsw m0, m7, m2
+ pmulhrsw m1, m7, m3
+ pmulhrsw m6, m7, m4
+ pmulhrsw m7, m5
+ REPX {pmulhrsw x, m8}, m0, m1, m6, m7
+ paddsw m2, m0
+ paddsw m3, m1
+ paddsw m4, m6
+ paddsw m5, m7
+ REPX {vpermb x, m9, x}, m2, m3, m4, m5
+ jmp tx2q
+.pass2:
+ mova m7, [o(permB)]
+ vpbroadcastd m6, [o(pw_4096)]
+ vpermq m0, m7, m2
+ vpermq m4, m7, m4
+ vpermq m1, m7, m3
+ vpermq m5, m7, m5
+ jmp m(idct_16x8_internal_8bpc).end
+
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+ INV_TXFM_FN %1, %2, 16x16
+%ifidn %1_%2, dct_dct
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ imul r6d, 181
+ mov r3d, 16
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
+%endif
+%endmacro
+
+INV_TXFM_16X16_FN dct, dct
+INV_TXFM_16X16_FN dct, identity
+INV_TXFM_16X16_FN dct, adst
+INV_TXFM_16X16_FN dct, flipadst
+
+cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m7, [o(permB)]
+ vpermq m0, m7, [cq+64*0]
+ vpermq m1, m7, [cq+64*1]
+ vpermq m2, m7, [cq+64*2]
+ vpermq m3, m7, [cq+64*3]
+ vpermq m4, m7, [cq+64*4]
+ vpermq m5, m7, [cq+64*5]
+ vpermq m6, m7, [cq+64*6]
+ vpermq m7, m7, [cq+64*7]
+ call .main
+ vbroadcasti32x4 m12, [o(int_shuf1)]
+ vbroadcasti32x4 m11, [o(int_shuf2)]
+ vpbroadcastd m13, [o(pw_8192)]
+ pshufb m0, m12
+ pshufb m8, m1, m11
+ pshufb m2, m12
+ pshufb m9, m3, m11
+ pshufb m4, m12
+ pshufb m10, m5, m11
+ pshufb m6, m12
+ pshufb m11, m7, m11
+ REPX {pmulhrsw x, m13}, m0, m8, m2, m9, m4, m10, m6, m11
+ punpckhdq m1, m0, m8
+ punpckldq m0, m8
+ punpckhdq m3, m2, m9
+ punpckldq m2, m9
+ punpckhdq m5, m4, m10
+ punpckldq m4, m10
+ punpckhdq m7, m6, m11
+ punpckldq m6, m11
+ jmp tx2q
+.pass2:
+ vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc
+ vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4
+ vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec
+ vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4
+ vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me
+ vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6
+ vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee
+ vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ vshufi32x4 m3, m1, m5, q3131 ; 6 7
+ vshufi32x4 m1, m5, q2020 ; 2 3
+ vshufi32x4 m5, m7, m9, q2020 ; 10 11
+ vshufi32x4 m7, m9, q3131 ; 14 15
+ call .main
+ mova m8, [o(permD)]
+ psrlq m12, m8, 4
+ psrlq m9, m8, 8
+ psrlq m13, m8, 12
+ mova m10, m8
+ vpermi2q m8, m0, m2 ; 0 1 4 5
+ vpermt2q m0, m12, m2
+ mova m11, m9
+ vpermi2q m9, m1, m3 ; 2 3 6 7
+ vpermt2q m1, m13, m3
+ vpermi2q m10, m4, m6 ; 8 9 12 13
+ vpermt2q m4, m12, m6
+ vpermi2q m11, m5, m7 ; 10 11 14 15
+ vpermt2q m5, m13, m7
+.end:
+ vpbroadcastd m12, [o(pw_2048)]
+.end2:
+ REPX {pmulhrsw x, m12}, m0, m1, m4, m5
+.end3:
+ REPX {pmulhrsw x, m12}, m8, m9, m10, m11
+ lea r3, [strideq*3]
+ lea r4, [dstq+strideq*4]
+ lea r5, [dstq+strideq*8]
+ lea r6, [r4 +strideq*8]
+ mova xm3, [dstq+strideq*0]
+ mova xm6, [dstq+strideq*2]
+ vinserti32x4 ym3, [dstq+strideq*1], 1
+ vinserti32x4 ym6, [dstq+r3 ], 1
+ vinserti32x4 m3, [r4+strideq*0], 2
+ vinserti32x4 m6, [r4+strideq*2], 2
+ vinserti32x4 m3, [r4+strideq*1], 3
+ vinserti32x4 m6, [r4+r3 ], 3
+ mova xm12, [r5+strideq*0]
+ mova xm13, [r5+strideq*2]
+ vinserti32x4 ym12, [r5+strideq*1], 1
+ vinserti32x4 ym13, [r5+r3 ], 1
+ vinserti32x4 m12, [r6+strideq*0], 2
+ vinserti32x4 m13, [r6+strideq*2], 2
+ vinserti32x4 m12, [r6+strideq*1], 3
+ vinserti32x4 m13, [r6+r3 ], 3
+ pxor m7, m7
+ REPX {mova [cq+64*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m2, m3, m7
+ punpckhbw m3, m7
+ paddw m0, m2
+ paddw m8, m3
+ packuswb m0, m8
+ punpcklbw m2, m6, m7
+ punpckhbw m6, m7
+ paddw m1, m2
+ paddw m9, m6
+ packuswb m1, m9
+ punpcklbw m2, m12, m7
+ punpckhbw m12, m7
+ paddw m2, m4
+ paddw m10, m12
+ packuswb m2, m10
+ punpcklbw m3, m13, m7
+ punpckhbw m13, m7
+ paddw m3, m5
+ paddw m11, m13
+ packuswb m3, m11
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], ym0, 1
+ mova [dstq+strideq*2], xm1
+ vextracti32x4 [dstq+r3 ], ym1, 1
+ vextracti32x4 [r4+strideq*0], m0, 2
+ vextracti32x4 [r4+strideq*1], m0, 3
+ vextracti32x4 [r4+strideq*2], m1, 2
+ vextracti32x4 [r4+r3 ], m1, 3
+ mova [r5+strideq*0], xm2
+ vextracti32x4 [r5+strideq*1], ym2, 1
+ mova [r5+strideq*2], xm3
+ vextracti32x4 [r5+r3 ], ym3, 1
+ vextracti32x4 [r6+strideq*0], m2, 2
+ vextracti32x4 [r6+strideq*1], m2, 3
+ vextracti32x4 [r6+strideq*2], m3, 2
+ vextracti32x4 [r6+r3 ], m3, 3
+ RET
+ALIGN function_align
+.main_fast2: ; bottom three-quarters are zero
+ vpbroadcastd m10, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6
+.main_fast4:
+ vpbroadcastd m2, [o(pw_401_4076x8)]
+ vpbroadcastd m4, [o(pw_m1189_3920x8)]
+ vpbroadcastd m3, [o(pw_799_4017x8)]
+ pmulhrsw m2, m8 ; t8a t15a
+ pmulhrsw m4, m1 ; t11a t12a
+ pmulhrsw m7, m3 ; t4a t7a
+ pxor m6, m6
+ psubsw m0, m2, m4 ; t11a t12a
+ paddsw m8, m2, m4 ; t8a t15a
+ mova m1, m7
+ jmp .main5
+ALIGN function_align
+.main_fast: ; bottom half is zero
+ vpbroadcastd m10, [o(pd_2048)]
+.main_fast3:
+ vpbroadcastq m13, [o(int_mshift)]
+ vpcmpub k7, m13, m10, 6
+.main_fast5:
+ vpbroadcastd m2, [o(pw_401_4076x8)]
+ vpbroadcastd m4, [o(pw_m2598_3166x8)]
+ vpbroadcastd m11, [o(pw_1931_3612x8)]
+ vpbroadcastd m12, [o(pw_m1189_3920x8)]
+ pmulhrsw m8, m2 ; t8a t15a
+ vpbroadcastd m2, [o(pw_799_4017x8)]
+ pmulhrsw m0, m4 ; t9a t14a
+ vpbroadcastd m4, [o(pw_m2276_3406x8)]
+ pmulhrsw m5, m11 ; t10a t13a
+ pmulhrsw m1, m12 ; t11a t12a
+ pmulhrsw m7, m2 ; t4a t7a
+ pmulhrsw m3, m4 ; t5a t6a
+ jmp .main4
+ALIGN function_align
+cglobal_label .main
+ IDCT16_1D_PACKED
+ ret
+
+INV_TXFM_16X16_FN adst, dct
+INV_TXFM_16X16_FN adst, adst
+INV_TXFM_16X16_FN adst, flipadst
+
+cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call .main_pass1
+ vpbroadcastd m10, [o(pw_8192_m8192)]
+ punpcklwd m8, m0, m1 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpckhwd m0, m1 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpckhwd m1, m0, m8 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m0, m8 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpcklwd m8, m2, m3 ; f0 h0 f1 h1 f2 h2 f3 h3
+ punpckhwd m2, m3 ; e0 g0 e1 g1 e2 g2 e3 g3
+ punpckhwd m3, m2, m8 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpcklwd m2, m8 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhwd m8, m4, m5 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpcklwd m4, m5 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpckhwd m5, m4, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpcklwd m4, m8 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m8, m6, m7 ; m0 o0 m1 o1 m2 o2 m3 o3
+ punpcklwd m6, m7 ; n0 p0 n1 p1 n2 p2 n3 p3
+ punpckhwd m7, m6, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpcklwd m6, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
+.pass1_end:
+ REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
+ jmp tx2q
+.pass2:
+ call .main_pass2
+ mova m10, [o(permD)]
+ psrlq m8, m10, 8
+ psrlq m12, m10, 12
+ psrlq m13, m10, 4
+ mova m9, m8
+ vpermi2q m8, m0, m2 ; 0 1 4 5
+ vpermt2q m0, m12, m2
+ vpermi2q m9, m1, m3 ; 2 3 6 7
+ vpermt2q m1, m12, m3
+ vpbroadcastd m12, [o(pw_2048)]
+ mov r3d, 0xff00ff00
+ mova m11, m10
+ vpermi2q m10, m4, m6 ; 8 9 12 13
+ vpermt2q m4, m13, m6
+ kmovd k1, r3d
+ vpermi2q m11, m5, m7 ; 10 11 14 15
+ vpermt2q m5, m13, m7
+ pxor m7, m7
+ vpsubw m12{k1}, m7, m12
+ jmp m(idct_16x16_internal_8bpc).end2
+ALIGN function_align
+.main_pass1:
+ mova m4, [o(permB)]
+ psrlq m3, m4, 4
+ vpermq m0, m4, [cq+64*0]
+ vpermq m7, m3, [cq+64*7]
+ vpermq m6, m4, [cq+64*6]
+ vpermq m1, m3, [cq+64*1]
+ vpermq m2, m4, [cq+64*2]
+ vpermq m5, m3, [cq+64*5]
+ vpermq m4, m4, [cq+64*4]
+ vpermq m3, m3, [cq+64*3]
+ call .main
+ vpbroadcastd m13, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ mova m2, m10
+ vpdpwssd m2, m5, m13 ; -out5
+ mova m8, m10
+ vpdpwssd m8, m11, m13 ; out4
+ mova m9, m10
+ vpdpwssd m9, m5, m12 ; out10
+ mova m5, m10
+ vpdpwssd m5, m11, m12 ; -out11
+ mova m11, m10
+ vpdpwssd m11, m3, m13 ; -out7
+ mova m14, m10
+ vpdpwssd m14, m4, m13 ; out6
+ mova m13, m10
+ vpdpwssd m13, m3, m12 ; out8
+ vpdpwssd m10, m4, [o(pw_2896_m2896)] {1to16} ; -out9
+ REPX {psrad x, 12}, m2, m8, m9, m5, m11, m14, m13, m10
+ packssdw m2, m8 ; -out5 out4
+ packssdw m5, m9, m5 ; out10 -out11
+ packssdw m3, m11, m14 ; -out7 out6
+ packssdw m4, m13, m10 ; out8 -out9
+ ret
+ALIGN function_align
+.main_pass2:
+ vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc
+ vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4
+ vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec
+ vinserti32x8 m0, ym2, 1 ; a0 a4 e0 e4
+ vshufi32x4 m9, m5, m7, q3232 ; ia ie ma me
+ vinserti32x8 m5, ym7, 1 ; i2 i6 m2 m6
+ vshufi32x4 m7, m1, m3, q3232 ; aa ae ea ee
+ vinserti32x8 m1, ym3, 1 ; a2 a6 e2 e6
+ vshufi32x4 m2, m0, m4, q3131 ; 4 5
+ vshufi32x4 m0, m4, q2020 ; 0 1
+ vshufi32x4 m4, m6, m8, q2020 ; 8 9
+ vshufi32x4 m6, m8, q3131 ; 12 13
+ vshufi32x4 m3, m1, m5, q3131 ; 6 7
+ vshufi32x4 m1, m5, q2020 ; 2 3
+ vshufi32x4 m5, m7, m9, q2020 ; 10 11
+ vshufi32x4 m7, m9, q3131 ; 14 15
+cglobal_label .main_pass2b
+ REPX {pshufd x, x, q1032}, m1, m3, m5, m7
+ call .main
+ vpbroadcastd m8, [o(pw_2896x8)]
+ pshufb m2, m11, m12
+ pshufb m5, m12
+ pshufb m3, m12
+ pshufb m4, m12
+ punpcklqdq m9, m5, m2 ; t15a t7
+ punpckhqdq m5, m2 ; t14a t6
+ shufps m2, m3, m4, q1032 ; t2a t10
+ shufps m3, m4, q3210 ; t3a t11
+ psubsw m4, m2, m3 ; out8 -out9
+ paddsw m3, m2 ; -out7 out6
+ paddsw m2, m5, m9 ; -out5 out4
+ psubsw m5, m9 ; out10 -out11
+ REPX {pmulhrsw x, m8}, m2, m3, m4, m5
+ ret
+ALIGN function_align
+.main:
+ vpbroadcastd m10, [o(pd_2048)]
+ vpbroadcastq m13, [o(int_mshift)]
+ punpckhwd m8, m7, m0 ; in14 in1
+ punpcklwd m0, m7 ; in0 in15
+ punpcklwd m7, m6, m1 ; in12 in3
+ punpckhwd m1, m6 ; in2 in13
+ punpckhwd m6, m5, m2 ; in10 in5
+ punpcklwd m2, m5 ; in4 in11
+ punpcklwd m5, m4, m3 ; in8 in7
+ punpckhwd m3, m4 ; in6 in9
+ vpcmpub k7, m13, m10, 6 ; 0x33...
+ ITX_MUL2X_PACK 0, 4, 9, 10, 201, 4091, 5 ; t0 t1
+ ITX_MUL2X_PACK 1, 4, 9, 10, 995, 3973, 5 ; t2 t3
+ ITX_MUL2X_PACK 2, 4, 9, 10, 1751, 3703, 5 ; t4 t5
+ ITX_MUL2X_PACK 3, 4, 9, 10, 2440, 3290, 5 ; t6 t7
+ ITX_MUL2X_PACK 5, 4, 9, 10, 3035, 2751, 5 ; t8 t9
+ ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 5 ; t10 t11
+ ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 5 ; t12 t13
+ ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 5 ; t14 t15
+ psubsw m4, m0, m5 ; t9a t8a
+ paddsw m0, m5 ; t1a t0a
+ psubsw m5, m1, m6 ; t11a t10a
+ paddsw m1, m6 ; t3a t2a
+ psubsw m6, m2, m7 ; t13a t12a
+ paddsw m2, m7 ; t5a t4a
+ psubsw m7, m3, m8 ; t15a t14a
+ paddsw m3, m8 ; t7a t6a
+ ITX_MUL2X_PACK 4, 8, 9, 10, 799, 4017, 4 ; t8 t9
+ ITX_MUL2X_PACK 6, 8, 9, 10, 799_4017, 4017_m799, 52 ; t12 t13
+ ITX_MUL2X_PACK 5, 8, 9, 10, 3406, 2276, 4 ; t10 t11
+ ITX_MUL2X_PACK 7, 8, 9, 10, 3406_2276, 2276_m3406, 52 ; t14 t15
+ psubsw m8, m1, m3 ; t7 t6
+ paddsw m1, m3 ; t3 t2
+ psubsw m3, m0, m2 ; t5 t4
+ paddsw m0, m2 ; t1 t0
+ psubsw m2, m5, m7 ; t14a t15a
+ paddsw m7, m5 ; t10a t11a
+ psubsw m5, m4, m6 ; t12a t13a
+ paddsw m4, m6 ; t8a t9a
+ ITX_MUL2X_PACK 3, 6, 9, 10, 1567, 3784, 5 ; t5a t4a
+ ITX_MUL2X_PACK 8, 6, 9, 10, 3784_m1567, 1567_3784, 52 ; t7a t6a
+ ITX_MUL2X_PACK 2, 6, 9, 10, 3784, 1567, 4 ; t15 t14
+ ITX_MUL2X_PACK 5, 6, 9, 10, 3784_1567, 1567_m3784, 52 ; t13 t12
+ vbroadcasti32x4 m12, [o(deint_shuf)]
+ paddsw m6, m4, m7 ; -out1 out14
+ psubsw m4, m7 ; t10 t11
+ psubsw m11, m3, m8 ; t7 t6
+ paddsw m8, m3 ; out12 -out3
+ psubsw m3, m0, m1 ; t3a t2a
+ paddsw m0, m1 ; -out15 out0
+ paddsw m1, m2, m5 ; -out13 out2
+ psubsw m5, m2 ; t15a t14a
+ pshufb m0, m12
+ pshufb m6, m12
+ pshufb m8, m12
+ pshufb m1, m12
+ shufps m7, m6, m0, q1032 ; out14 -out15
+ shufps m0, m6, m0, q3210 ; -out1 out0
+ punpcklqdq m6, m8, m1 ; out12 -out13
+ punpckhqdq m1, m8, m1 ; -out3 out2
+ ret
+
+INV_TXFM_16X16_FN flipadst, dct
+INV_TXFM_16X16_FN flipadst, adst
+INV_TXFM_16X16_FN flipadst, flipadst
+
+cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ call m(iadst_16x16_internal_8bpc).main_pass1
+ vpbroadcastd m10, [o(pw_m8192_8192)]
+ punpcklwd m8, m1, m0 ; m0 o0 m1 o1 m2 o2 m3 o3
+ punpckhwd m9, m1, m0 ; n0 p0 n1 p1 n2 p2 n3 p3
+ punpckhwd m1, m7, m6 ; a0 c0 a1 c1 a2 c2 a3 c3
+ punpcklwd m7, m6 ; b0 d0 b1 d1 b2 d2 b3 d3
+ punpcklwd m0, m1, m7 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhwd m1, m7 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpcklwd m6, m8, m9 ; m0 n0 o0 p0 m1 n1 o1 p1
+ punpckhwd m7, m8, m9 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpcklwd m8, m3, m2 ; i0 k0 i1 k1 i2 k2 i3 k3
+ punpckhwd m9, m3, m2 ; j0 l0 j1 l1 j2 l2 j3 l3
+ punpckhwd m3, m5, m4 ; e0 g0 e1 g1 e2 g2 e3 g3
+ punpcklwd m5, m4 ; f0 h0 f1 h1 f2 h2 f3 h3
+ punpcklwd m2, m3, m5 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhwd m3, m5 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpcklwd m4, m8, m9 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3
+ jmp m(iadst_16x16_internal_8bpc).pass1_end
+.pass2:
+ call m(iadst_16x16_internal_8bpc).main_pass2
+ mova m10, [o(permD)]
+ psrlq m8, m10, 8
+ psrlq m12, m10, 12
+ psrlq m13, m10, 4
+ mova m9, m8
+ vpermi2q m8, m7, m5 ; 0 1 4 5
+ vpermt2q m7, m12, m5
+ vpermi2q m9, m6, m4 ; 2 3 6 7
+ vpermt2q m6, m12, m4
+ vpbroadcastd m12, [o(pw_2048)]
+ mov r3d, 0x00ff00ff
+ mova m11, m10
+ vpermi2q m10, m3, m1 ; 8 9 12 13
+ vpermt2q m3, m13, m1
+ kmovd k1, r3d
+ vpermi2q m11, m2, m0 ; 10 11 14 15
+ vpermt2q m2, m13, m0
+ pxor m0, m0
+ vpsubw m12{k1}, m0, m12
+ pmulhrsw m0, m7, m12
+ pmulhrsw m1, m6, m12
+ pmulhrsw m4, m3, m12
+ pmulhrsw m5, m2, m12
+ jmp m(idct_16x16_internal_8bpc).end3
+
+INV_TXFM_16X16_FN identity, dct
+INV_TXFM_16X16_FN identity, identity
+
+cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
+ mova m8, [o(int16_perm)]
+ vpermb m1, m8, [cq+64*0] ; a0 b0 a1 b1 a2 b2 a3 b3
+ vpermb m2, m8, [cq+64*1] ; c0 d0 c1 d1 c2 d2 c3 d3
+ vpbroadcastd m0, [o(pw_1697x16)]
+ vpermb m3, m8, [cq+64*2] ; e0 f0 e1 f1 e2 f2 e3 f3
+ vpermb m4, m8, [cq+64*3] ; g0 h0 g1 h1 g2 h2 g3 h3
+ vpermb m5, m8, [cq+64*4] ; i0 j0 i1 j1 i2 j2 i3 j3
+ vpermb m6, m8, [cq+64*5] ; k0 l0 k1 l1 k2 l2 k3 l3
+ vpermb m7, m8, [cq+64*6] ; m0 n0 m1 n1 m2 n2 m3 n3
+ vpermb m8, m8, [cq+64*7] ; o0 p0 o1 p1 o2 p2 o3 p3
+ pmulhrsw m9, m0, m1
+ pmulhrsw m10, m0, m2
+ pmulhrsw m11, m0, m3
+ pmulhrsw m12, m0, m4
+ pmulhrsw m13, m0, m5
+ pmulhrsw m14, m0, m6
+ pmulhrsw m15, m0, m7
+ pmulhrsw m0, m8
+ REPX {psraw x, 1}, m9, m10, m11, m12
+ pavgw m1, m9
+ pavgw m2, m10
+ pavgw m3, m11
+ pavgw m4, m12
+ REPX {psraw x, 1}, m13, m14, m15, m0
+ pavgw m5, m13
+ pavgw m6, m14
+ pavgw m7, m15
+ pavgw m8, m0
+ punpckldq m0, m1, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m1, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m2, m3, m4 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m3, m4 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m5, m6 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m5, m6 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m6, m7, m8 ; m0 n0 o0 p0 m1 n1 o1 p1
+ punpckhdq m7, m8 ; m2 n2 o2 p2 m3 n3 o3 p3
+ jmp tx2q
+ALIGN function_align
+.pass2:
+ vpbroadcastd m11, [o(pw_1697x16)]
+ pmulhrsw m12, m11, m0
+ pmulhrsw m13, m11, m1
+ pmulhrsw m14, m11, m2
+ pmulhrsw m15, m11, m3
+ pmulhrsw m8, m11, m4
+ pmulhrsw m9, m11, m5
+ pmulhrsw m10, m11, m6
+ pmulhrsw m11, m7
+ REPX {paddsw x, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ paddsw m0, m12
+ paddsw m1, m13
+ paddsw m2, m14
+ paddsw m3, m15
+ paddsw m8, m4
+ movu m4, [o(permD+2)]
+ paddsw m9, m5
+ paddsw m6, m10
+ paddsw m7, m11
+ psrlq m12, m4, 4
+ mova m5, m4
+ mova m10, m4
+ mova m11, m4
+ vpermi2q m4, m0, m2 ; 8 9 12 13
+ vpermt2q m0, m12, m2 ; 0 1 4 5
+ vpermi2q m5, m1, m3 ; 10 11 14 15
+ vpermt2q m1, m12, m3 ; 2 3 6 7
+ vpermi2q m10, m8, m6
+ vpermt2q m8, m12, m6
+ vpermi2q m11, m9, m7
+ vpermt2q m9, m12, m7
+ jmp m(idct_16x16_internal_8bpc).end
+
+%macro ITX_UNPACK_MULHRSW 7 ; dst1, dst2/src, tmp, coef[1-4]
+ vpbroadcastd m%3, [o(pw_%4_%5x8)]
+ punpcklwd m%1, m%2, m%2
+ pmulhrsw m%1, m%3
+ vpbroadcastd m%3, [o(pw_%6_%7x8)]
+ punpckhwd m%2, m%2
+ pmulhrsw m%2, m%3
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ cmp eobd, 107
+ jb .fast
+ mova m5, [cq+64*5]
+ mova m3, [cq+64*3]
+ mova m1, [cq+64*1]
+ mova m7, [cq+64*7]
+ mova m2, [cq+64*2]
+ mova m6, [cq+64*6]
+ mova m0, [cq+64*0]
+ mova m4, [cq+64*4]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m8, [o(idct_8x32p)]
+ vpbroadcastd m9, [o(pw_8192)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7
+ punpckldq m8, m0, m1 ; ab
+ punpckhdq m0, m1
+ punpckldq m1, m2, m3 ; cd
+ punpckhdq m2, m3
+ punpckldq m3, m4, m5 ; ef
+ punpckhdq m4, m5
+ punpckldq m5, m6, m7 ; gh
+ punpckhdq m6, m7
+ REPX {pmulhrsw x, m9}, m8, m0, m1, m2, m3, m4, m5, m6
+ punpcklqdq m18, m8, m1 ; 30 2 6 26 31 1 23 9
+ punpckhqdq m14, m8, m1 ; 16 0 12 20 3 29 11 21
+ punpcklqdq m21, m0, m2 ; 14 18 22 10 27 5 19 13
+ punpckhqdq m15, m0, m2 ; 18 4 24 8 7 25 15 17
+ punpcklqdq m20, m3, m5
+ punpckhqdq m16, m3, m5
+ punpcklqdq m19, m4, m6
+ punpckhqdq m17, m4, m6
+ vinserti32x4 ym8, ym18, xm20, 1
+ vshufi32x4 ym1, ym18, ym20, 0x03
+ vinserti32x4 ym9, ym14, xm16, 1
+ vshufi32x4 ym3, ym14, ym16, 0x03
+ vinserti32x4 ym0, ym21, xm19, 1
+ vshufi32x4 ym5, ym21, ym19, 0x03
+ vinserti32x4 ym7, ym15, xm17, 1
+ vshufi32x4 ym6, ym15, ym17, 0x03
+ call m(idct_8x16_internal_8bpc).main2
+ psrlq m12, [o(permB)], 60
+ vpermt2q m14, m12, m16
+ vpermt2q m21, m12, m19
+ vpermt2q m15, m12, m17
+ vpermi2q m12, m18, m20
+ vextracti32x8 ym16, m14, 1
+ vextracti32x8 ym19, m21, 1
+ vextracti32x8 ym17, m15, 1
+ vextracti32x8 ym20, m12, 1
+ call .main2
+ jmp .end
+.fast: ; right half is zero
+ mova m0, [o(int16_perm)]
+ mova ym2, [cq+64*4]
+ vinserti32x8 m2, [cq+64*0], 1
+ mova ym3, [cq+64*6]
+ vinserti32x8 m3, [cq+64*2], 1
+ mova ym4, [cq+64*3]
+ vinserti32x8 m4, [cq+64*5], 1
+ mova ym5, [cq+64*7]
+ vinserti32x8 m5, [cq+64*1], 1
+ REPX {vpermb x, m0, x}, m2, m3, m4, m5
+ call m(idct_16x8_internal_8bpc).main2
+ vbroadcasti32x4 m4, [o(int_shuf3)]
+ vbroadcasti32x4 m5, [o(int_shuf4)]
+ pshufb m2, m4 ; e0 f0 e2 f2 e1 f1 e3 f3
+ pshufb m3, m5 ; g0 h0 g2 h2 g1 h1 g3 h3
+ pshufb m0, m4 ; a0 b0 a2 b2 a1 b1 a3 b3
+ pshufb m1, m5 ; c0 d0 c2 d2 c1 d1 c3 d3
+ vpbroadcastd m4, [o(pw_8192)]
+ psrlq m5, [o(permB)], 60
+ punpckldq m6, m2, m3 ; e0 f0 g0 h0 e2 f2 g2 h2
+ punpckhdq m17, m2, m3 ; e1 f1 g1 h1 e3 f3 g3 h3
+ punpckldq m2, m0, m1 ; a0 b0 c0 d0 a2 b2 c2 d2
+ punpckhdq m16, m0, m1 ; a1 b1 c1 d1 a3 b3 c3 d3
+ REPX {pmulhrsw x, m4}, m6, m17, m2, m16
+ vinserti32x4 ym0, ym2, xm6, 1 ; 0 2
+ vshufi32x4 ym1, ym2, ym6, 0x03 ; 4 6
+ vinserti32x4 ym14, ym16, xm17, 1 ; 1 3
+ vshufi32x4 ym15, ym16, ym17, 0x03 ; 5 7
+ pxor ym4, ym4
+ vpermt2q m2, m5, m6 ; 8 10
+ vpermt2q m16, m5, m17 ; 9 11
+ mova ym5, ym4
+ mova ym6, ym4
+ mova ym7, ym4
+ vextracti32x8 ym3, m2, 1 ; 12 14
+ vextracti32x8 ym17, m16, 1 ; 13 15
+ call m(idct_8x16_internal_8bpc).main
+ call .main_fast
+.end:
+ vpbroadcastd ym12, strided
+ vpbroadcastd m13, [o(pw_2048)]
+ pmulld ym7, ym12, [o(gather8d)]
+ REPX {pmulhrsw x, m13}, m0, m1, m2, m3, m8, m9, m10, m11
+ lea r3, [dstq+strideq*4]
+ shl strideq, 4
+ lea r4, [dstq+strideq]
+ add r1, r3
+ kxnorb k1, k1, k1
+ pxor m6, m6
+ kmovb k2, k1
+ vpgatherdq m12{k1}, [r0+ym7]
+ kmovb k1, k2
+ vpgatherdq m13{k2}, [r3+ym7]
+ kmovb k2, k1
+ vpgatherdq m14{k1}, [r4+ym7]
+ kmovb k1, k2
+ vpgatherdq m15{k2}, [r1+ym7]
+ REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m4, m12, m6
+ punpckhbw m12, m6
+ paddw m0, m4
+ paddw m1, m12
+ packuswb m0, m1
+ kmovb k2, k1
+ vpscatterdq [r0+ym7]{k1}, m0
+ punpcklbw m4, m13, m6
+ punpckhbw m13, m6
+ paddw m2, m4
+ paddw m3, m13
+ packuswb m2, m3
+ kmovb k1, k2
+ vpscatterdq [r3+ym7]{k2}, m2
+ punpcklbw m4, m14, m6
+ punpckhbw m14, m6
+ paddw m8, m4
+ paddw m9, m14
+ packuswb m8, m9
+ kmovb k2, k1
+ vpscatterdq [r4+ym7]{k1}, m8
+ punpcklbw m4, m15, m6
+ punpckhbw m15, m6
+ paddw m10, m4
+ paddw m11, m15
+ packuswb m10, m11
+ vpscatterdq [r1+ym7]{k2}, m10
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ mov r3d, 32
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_8x8_8bpc).dconly2
+INIT_YMM avx512icl
+ALIGN function_align
+.main_fast: ; bottom half is zero
+ ITX_UNPACK_MULHRSW 12, 14, 8, 201, 4091, m601, 4052 ; t16a, t31a, t23a, t24a
+ ITX_UNPACK_MULHRSW 21, 15, 8, 995, 3973, m1380, 3857 ; t20a, t27a, t19a, t28a
+ ITX_UNPACK_MULHRSW 20, 16, 8, 1751, 3703, m2106, 3513 ; t18a, t29a, t21a, t26a
+ ITX_UNPACK_MULHRSW 19, 17, 8, 2440, 3290, m2751, 3035 ; t22a, t25a, t17a, t30a
+ jmp .main3
+ALIGN function_align
+.main:
+ punpcklwd m12, m21, m14 ; in31 in1
+ punpckhwd m14, m21 ; in3 in29
+ punpcklwd m21, m20, m15 ; in27 in5
+ punpckhwd m15, m20 ; in7 in25
+ punpcklwd m20, m19, m16 ; in23 in9
+ punpckhwd m16, m19 ; in11 in21
+ punpcklwd m19, m18, m17 ; in19 in13
+ punpckhwd m17, m18 ; in15 in17
+.main2:
+ ITX_MUL2X_PACK 12, 8, 9, 10, 201, 4091, 5 ; t16a, t31a
+ ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a
+ ITX_MUL2X_PACK 21, 8, 9, 10, 995, 3973, 5 ; t20a, t27a
+ ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
+ ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
+ ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
+ ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
+ ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
+.main3:
+ psubsw m11, m12, m17 ; t17 t30
+ paddsw m12, m17 ; t16 t31
+ psubsw m17, m15, m20 ; t18 t29
+ paddsw m20, m15 ; t19 t28
+ psubsw m15, m21, m16 ; t21 t26
+ paddsw m21, m16 ; t20 t27
+ psubsw m16, m14, m19 ; t22 t25
+ paddsw m14, m19 ; t23 t24
+ ITX_MUL2X_PACK 11, 18, 19, 10, 799, 4017, 5 ; t17a t30a
+ ITX_MUL2X_PACK 17, 18, 19, 10, m4017, 799, 5 ; t18a t29a
+ ITX_MUL2X_PACK 15, 18, 19, 10, 3406, 2276, 5 ; t21a t26a
+ ITX_MUL2X_PACK 16, 18, 19, 10, m2276, 3406, 5 ; t22a t25a
+ vpbroadcastd m8, [o(pw_m3784_1567)]
+ psubsw m19, m12, m20 ; t19a t28a
+ paddsw m20, m12 ; t16a t31a
+ psubsw m12, m14, m21 ; t20a t27a
+ paddsw m14, m21 ; t23a t24a
+ psubsw m21, m11, m17 ; t18 t29
+ paddsw m11, m17 ; t17 t30
+ psubsw m17, m16, m15 ; t21 t26
+ paddsw m16, m15 ; t22 t25
+ ITX_MUL2X_PACK 21, 18, 15, 10, 1567_3784, 8, 20 ; t18a t29a
+ ITX_MUL2X_PACK 19, 18, 15, 10, 1567_3784, 8, 20 ; t19 t28
+ ITX_MUL2X_PACK 12, 18, 15, 10, 8, m1567_m3784, 36 ; t20 t27
+ ITX_MUL2X_PACK 17, 18, 15, 10, 8, m1567_m3784, 36 ; t21a t26a
+ vbroadcasti32x4 m18, [o(deint_shuf)]
+ vpbroadcastd m8, [o(pw_m2896_2896)]
+ vpbroadcastd m9, [o(pw_2896_2896)]
+ psubsw m15, m20, m14 ; t23 t24
+ paddsw m20, m14 ; t16 t31
+ psubsw m14, m11, m16 ; t22a t25a
+ paddsw m11, m16 ; t17a t30a
+ psubsw m16, m21, m17 ; t21 t26
+ paddsw m21, m17 ; t18 t29
+ psubsw m17, m19, m12 ; t20a t27a
+ paddsw m19, m12 ; t19a t28a
+ REPX {pshufb x, m18}, m20, m11, m21, m19
+ ITX_MUL2X_PACK 15, 18, 12, 10, 8, 9, 8 ; t23a t22a
+ ITX_MUL2X_PACK 14, 13, 15, 10, 8, 9, 8 ; t22 t25
+ packssdw m18, m13 ; t23a t22
+ packssdw m12, m15 ; t24a t25
+ ITX_MUL2X_PACK 16, 13, 15, 10, 8, 9, 8 ; t21a t26a
+ ITX_MUL2X_PACK 17, 16, 14, 10, 8, 9, 8 ; t20 t27
+ packssdw m16, m13 ; t20 t21a
+ packssdw m14, m15 ; t27 t26a
+ punpcklqdq m13, m19, m21 ; t19a t18
+ punpckhqdq m19, m21 ; t28a t29
+ punpcklqdq m21, m20, m11 ; t16 t17a
+ punpckhqdq m20, m11 ; t31 t30a
+ psubsw m15, m1, m19 ; out28 out29
+ paddsw m1, m19 ; out3 out2
+ psubsw m9, m6, m13 ; out19 out18
+ paddsw m6, m13 ; out12 out13
+ psubsw m10, m5, m16 ; out20 out21
+ paddsw m5, m16 ; out11 out10
+ psubsw m19, m3, m12 ; out24 out25
+ paddsw m3, m12 ; out7 out6
+ psubsw m8, m7, m21 ; out16 out17
+ paddsw m7, m21 ; out15 out14
+ psubsw m21, m0, m20 ; out31 out30
+ paddsw m0, m20 ; out0 out1
+ psubsw m11, m4, m18 ; out23 out22
+ paddsw m4, m18 ; out8 out9
+ psubsw m18, m2, m14 ; out27 out26
+ paddsw m2, m14 ; out4 out5
+INIT_ZMM avx512icl
+ movu m16, [o(permD+3)]
+ vpermt2q m0, m16, m4 ; 0 1 8 9
+ vpermt2q m8, m16, m19 ; 16 17 24 25
+ vpermt2q m1, m16, m5 ; 3 2 11 10
+ vpermt2q m9, m16, m18 ; 19 18 27 26
+ vpermt2q m2, m16, m6 ; 4 5 12 13
+ vpermt2q m10, m16, m15 ; 20 21 28 29
+ vpermt2q m3, m16, m7 ; 7 6 15 14
+ vpermt2q m11, m16, m21 ; 23 22 31 30
+ vzeroupper
+ ret
+
+%macro LOAD_PACKED_16X2 3 ; dst, row[1-2]
+ vbroadcasti32x4 ym%1, [cq+16*%2]
+ vbroadcasti32x4 ym8, [cq+16*%3]
+ shufpd ym%1, ym8, 0x0c
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, stride, c, eob
+%undef cmp
+ test eobd, eobd
+ jz .dconly
+ lea r5, [o_base]
+ LOAD_PACKED_16X2 0, 0, 2 ; in0 in2
+ LOAD_PACKED_16X2 1, 4, 6 ; in4 in6
+ LOAD_PACKED_16X2 2, 8, 10 ; in8 in10
+ LOAD_PACKED_16X2 3, 12, 14 ; in12 in14
+ LOAD_PACKED_16X2 14, 1, 3 ; in1 in3
+ LOAD_PACKED_16X2 15, 5, 7 ; in5 in7
+ LOAD_PACKED_16X2 16, 9, 11 ; in9 in11
+ LOAD_PACKED_16X2 17, 13, 15 ; in13 in15
+ pxor m4, m4
+ REPX {mova [cq+64*x], m4}, 0, 1, 2, 3
+ cmp eobd, 107
+ jb .fast
+ LOAD_PACKED_16X2 4, 16, 18 ; in16 in18
+ LOAD_PACKED_16X2 5, 20, 22 ; in20 in22
+ LOAD_PACKED_16X2 6, 24, 26 ; in24 in26
+ LOAD_PACKED_16X2 7, 28, 30 ; in28 in30
+ call m(idct_8x16_internal_8bpc).main
+ LOAD_PACKED_16X2 18, 19, 17 ; in19 in17
+ LOAD_PACKED_16X2 19, 23, 21 ; in23 in21
+ LOAD_PACKED_16X2 20, 27, 25 ; in27 in25
+ LOAD_PACKED_16X2 21, 31, 29 ; in31 in29
+ pxor m8, m8
+ REPX {mova [cq+64*x], m8}, 4, 5, 6, 7
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main
+ jmp .pass2
+.fast: ; bottom half is zero
+ mova ym5, ym4
+ mova ym6, ym4
+ mova ym7, ym4
+ call m(idct_8x16_internal_8bpc).main
+ call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
+.pass2:
+ vpbroadcastd m12, [o(pw_8192)]
+ vshufi32x4 m7, m3, m11, q2020 ; 7 15 23 31
+ vshufi32x4 m6, m3, m11, q3131 ; 6 14 22 30
+ vshufi32x4 m5, m2, m10, q3131 ; 5 13 21 29
+ vshufi32x4 m4, m2, m10, q2020 ; 4 12 20 28
+ vshufi32x4 m3, m1, m9, q2020 ; 3 11 19 27
+ vshufi32x4 m2, m1, m9, q3131 ; 2 10 18 26
+ vshufi32x4 m1, m0, m8, q3131 ; 1 9 17 15
+ vshufi32x4 m0, m8, q2020 ; 0 8 16 24
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
+ call .main
+ vpbroadcastd m8, [o(pw_2048)]
+ REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
+ lea r2, [strideq*3]
+ lea r3, [dstq+strideq*4]
+ movshdup m12, [o(permD)]
+ pmovzxbw m8, [dstq+strideq*0]
+ pmovzxbw m9, [dstq+strideq*1]
+ pmovzxbw m10, [dstq+strideq*2]
+ pmovzxbw m11, [dstq+r2 ]
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ pmovzxbw m8, [r3+strideq*0]
+ pmovzxbw m9, [r3+strideq*1]
+ pmovzxbw m10, [r3+strideq*2]
+ pmovzxbw m11, [r3+r2 ]
+ paddw m4, m8
+ paddw m5, m9
+ paddw m6, m10
+ paddw m7, m11
+ packuswb m0, m1
+ packuswb m2, m3
+ vpermq m0, m12, m0
+ vpermq m2, m12, m2
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym2
+ vextracti32x8 [dstq+r2 ], m2, 1
+ packuswb m4, m5
+ packuswb m6, m7
+ vpermq m4, m12, m4
+ vpermq m6, m12, m6
+ mova [r3+strideq*0], ym4
+ vextracti32x8 [r3+strideq*1], m4, 1
+ mova [r3+strideq*2], ym6
+ vextracti32x8 [r3+r2 ], m6, 1
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ mov r3d, 8
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+.dconly3:
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova ym1, [dstq+strideq*0]
+ vinserti32x8 m1, [dstq+strideq*1], 1
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub r3d, 2
+ jg .dconly_loop
+ RET
+ALIGN function_align
+.main:
+ vpbroadcastd m10, [o(pd_2048)]
+.main2:
+ ITX_MULSUB_2W 5, 3, 8, 9, 10, 3406, 2276 ; t5a, t6a
+ ITX_MULSUB_2W 1, 7, 8, 9, 10, 799, 4017 ; t4a, t7a
+ ITX_MULSUB_2W 2, 6, 8, 9, 10, 1567, 3784 ; t2, t3
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ ITX_MULSUB_2W 0, 4, 8, 9, 10, 11, 12 ; t1, t0
+.main3:
+ paddsw m8, m1, m5 ; t4
+ psubsw m1, m5 ; t5a
+ paddsw m9, m7, m3 ; t7
+ psubsw m7, m3 ; t6a
+ ITX_MULSUB_2W 7, 1, 3, 5, 10, 11, 12 ; t5, t6
+ psubsw m5, m0, m2 ; dct4 out2
+ paddsw m2, m0 ; dct4 out1
+ paddsw m0, m4, m6 ; dct4 out0
+ psubsw m4, m6 ; dct4 out3
+ psubsw m6, m2, m1 ; out6
+ paddsw m1, m2 ; out1
+ paddsw m2, m5, m7 ; out2
+ psubsw m5, m7 ; out5
+ psubsw m7, m0, m9 ; out7
+ paddsw m0, m9 ; out0
+ paddsw m3, m4, m8 ; out3
+ psubsw m4, m8 ; out4
+ ret
+
+cglobal inv_txfm_add_identity_identity_8x32_8bpc, 3, 5, 0, dst, stride, c
+ vpbroadcastd m7, [pw_5]
+ paddsw m0, m7, [cq+64*0]
+ paddsw m1, m7, [cq+64*1]
+ vpbroadcastd ym9, strided
+ paddsw m2, m7, [cq+64*2]
+ paddsw m3, m7, [cq+64*3]
+ paddsw m4, m7, [cq+64*4]
+ paddsw m5, m7, [cq+64*5]
+ paddsw m6, m7, [cq+64*6]
+ paddsw m7, [cq+64*7]
+ pmulld ym14, ym9, [pd_0to15]
+ lea r3, [dstq+strideq*1]
+ lea r4, [dstq+strideq*2]
+ kxnorb k1, k1, k1
+ pxor m13, m13
+ add r1, r4 ; dstq+strideq*3
+ kmovb k2, k1
+ vpgatherdq m9{k1}, [r0+ym14*4]
+ kmovb k1, k2
+ vpgatherdq m10{k2}, [r3+ym14*4]
+ kmovb k2, k1
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_8x8
+ REPX {psraw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
+ vpgatherdq m11{k1}, [r4+ym14*4]
+ kmovb k1, k2
+ vpgatherdq m12{k2}, [r1+ym14*4]
+ REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklbw m8, m9, m13 ; 0 8 16 24
+ punpckhbw m9, m13 ; 4 12 20 28
+ paddw m0, m8
+ paddw m4, m9
+ packuswb m0, m4
+ kmovb k2, k1
+ vpscatterdq [r0+ym14*4]{k1}, m0
+ punpcklbw m8, m10, m13 ; 1 9 17 25
+ punpckhbw m10, m13 ; 5 13 21 29
+ paddw m1, m8
+ paddw m5, m10
+ packuswb m1, m5
+ kmovb k1, k2
+ vpscatterdq [r3+ym14*4]{k2}, m1
+ punpcklbw m8, m11, m13 ; 2 10 18 26
+ punpckhbw m11, m13 ; 6 14 22 30
+ paddw m2, m8
+ paddw m6, m11
+ packuswb m2, m6
+ kmovb k2, k1
+ vpscatterdq [r4+ym14*4]{k1}, m2
+ punpcklbw m8, m12, m13 ; 3 11 19 27
+ punpckhbw m12, m13 ; 7 15 23 31
+ paddw m3, m8
+ paddw m7, m12
+ packuswb m3, m7
+ vpscatterdq [r1+ym14*4]{k2}, m3
+ RET
+
+cglobal inv_txfm_add_identity_identity_32x8_8bpc, 3, 5, 0, dst, stride, c
+ vpbroadcastd m0, [pw_4096]
+ pmulhrsw m3, m0, [cq+64*0]
+ pmulhrsw m4, m0, [cq+64*4]
+ pmulhrsw m6, m0, [cq+64*1]
+ pmulhrsw m5, m0, [cq+64*5]
+ pmulhrsw m7, m0, [cq+64*2]
+ pmulhrsw m2, m0, [cq+64*6]
+ pmulhrsw m8, m0, [cq+64*3]
+ pmulhrsw m0, [cq+64*7]
+ mova m13, [int8_permA]
+ lea r3, [strideq*3]
+ lea r4, [dstq+strideq*4]
+ punpckldq m1, m3, m4
+ punpckhdq m3, m4
+ punpckldq m4, m6, m5
+ punpckhdq m6, m5
+ punpckldq m5, m7, m2
+ punpckhdq m7, m2
+ punpckldq m2, m8, m0
+ punpckhdq m8, m0
+ mova ym9, [dstq+strideq*0]
+ vinserti32x8 m9, [dstq+strideq*2], 1
+ mova ym10, [dstq+strideq*1]
+ vinserti32x8 m10, [dstq+r3 ], 1
+ mova ym11, [r4+strideq*0]
+ vinserti32x8 m11, [r4+strideq*2], 1
+ mova ym12, [r4+strideq*1]
+ vinserti32x8 m12, [r4+r3 ], 1
+ REPX {vpermb x, m13, x}, m1, m4, m5, m2, m3, m6, m7, m8
+ pxor m13, m13
+ REPX {mova [cq+64*x], m13}, 0, 1, 2, 3, 4, 5, 6, 7
+ punpcklqdq m0, m1, m4 ; a0 a2 c0 c2
+ punpckhqdq m1, m4 ; b0 b2 d0 d2
+ punpcklqdq m4, m5, m2 ; a1 a3 c1 c3
+ punpckhqdq m5, m2 ; b1 b3 d1 d3
+ punpcklqdq m2, m3, m6 ; e0 e2 g0 g2
+ punpckhqdq m3, m6 ; f0 f2 h0 h2
+ punpcklqdq m6, m7, m8 ; e1 e3 g1 g3
+ punpckhqdq m7, m8 ; f1 f3 h1 h3
+ punpcklbw m8, m9, m13
+ punpckhbw m9, m13
+ paddw m0, m8
+ paddw m4, m9
+ packuswb m0, m4
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*2], m0, 1
+ punpcklbw m8, m10, m13
+ punpckhbw m10, m13
+ paddw m1, m8
+ paddw m5, m10
+ packuswb m1, m5
+ mova [dstq+strideq*1], ym1
+ vextracti32x8 [dstq+r3 ], m1, 1
+ punpcklbw m8, m11, m13
+ punpckhbw m11, m13
+ paddw m2, m8
+ paddw m6, m11
+ packuswb m2, m6
+ mova [r4+strideq*0], ym2
+ vextracti32x8 [r4+strideq*2], m2, 1
+ punpcklbw m8, m12, m13
+ punpckhbw m12, m13
+ paddw m3, m8
+ paddw m7, m12
+ packuswb m3, m7
+ mova [r4+strideq*1], ym3
+ vextracti32x8 [r4+r3 ], m3, 1
+ RET
+
+%macro IDCT_16x32_END 3 ; src[1-2], row
+ mova xm8, [dstq+strideq*0]
+ vinserti32x4 ym8, [dstq+strideq*1], 1
+ mova xm9, [dstq+r3 ]
+ vinserti32x4 ym9, [dstq+strideq*2], 1
+ pmulhrsw m%1, m10
+ pmulhrsw m%2, m10
+ vpermb m8, m11, m8
+ vpermb m9, m11, m9
+ mova [cq+64*(%3*2+0)], m13
+ mova [cq+64*(%3*2+1)], m13
+ paddw m8, m%1
+ paddw m9, m%2
+ packuswb m8, m9
+ vpermd m8, m12, m8
+ mova [dstq+strideq*0], xm8
+ vextracti32x4 [dstq+strideq*1], ym8, 1
+ vextracti32x4 [dstq+strideq*2], m8, 2
+ vextracti32x4 [dstq+r3 ], m8, 3
+%if %1 != 20
+ lea dstq, [dstq+strideq*4]
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ vpbroadcastd m15, [o(pw_2896x8)]
+ cmp eobd, 151
+ jb .fast
+ pmulhrsw m5, m15, [cq+64*10]
+ pmulhrsw m3, m15, [cq+64* 6]
+ pmulhrsw m1, m15, [cq+64* 2]
+ pmulhrsw m7, m15, [cq+64*14]
+ pmulhrsw m2, m15, [cq+64* 4]
+ pmulhrsw m6, m15, [cq+64*12]
+ pmulhrsw m0, m15, [cq+64* 0]
+ pmulhrsw m4, m15, [cq+64* 8]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pmulhrsw m14, m15, [cq+64* 1]
+ pmulhrsw m21, m15, [cq+64*15]
+ pmulhrsw m18, m15, [cq+64* 9]
+ pmulhrsw m17, m15, [cq+64* 7]
+ pmulhrsw m16, m15, [cq+64* 5]
+ pmulhrsw m19, m15, [cq+64*11]
+ pmulhrsw m20, m15, [cq+64*13]
+ pmulhrsw m15, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova m8, [o(idct_16x32p)]
+ vpbroadcastd m9, [o(pw_16384)]
+ REPX {vpermb x, m8, x}, m0, m1, m2, m3, m4, m5, m6, m7, \
+ m14, m15, m16, m17, m18, m19, m20, m21
+ punpckldq m8, m0, m1
+ punpckhdq m0, m1
+ punpckldq m1, m2, m3
+ punpckhdq m2, m3
+ REPX {pmulhrsw x, m9}, m8, m0, m1, m2
+ punpckldq m3, m4, m5
+ punpckhdq m4, m5
+ punpckldq m5, m6, m7
+ punpckhdq m6, m7
+ REPX {pmulhrsw x, m9}, m3, m4, m5, m6
+ punpckldq m7, m14, m15
+ punpckhdq m14, m15
+ punpckldq m15, m16, m17
+ punpckhdq m16, m17
+ REPX {pmulhrsw x, m9}, m7, m14, m15, m16
+ punpckldq m17, m18, m19
+ punpckhdq m18, m19
+ punpckldq m19, m20, m21
+ punpckhdq m20, m21
+ REPX {pmulhrsw x, m9}, m17, m18, m19, m20
+ punpcklqdq m21, m8, m1
+ punpckhqdq m8, m1
+ punpcklqdq m1, m0, m2
+ punpckhqdq m0, m2
+ punpcklqdq m2, m3, m5
+ punpckhqdq m3, m5
+ punpcklqdq m5, m4, m6
+ punpckhqdq m4, m6
+ punpcklqdq m6, m7, m15
+ punpckhqdq m7, m15
+ punpcklqdq m15, m14, m16
+ punpckhqdq m14, m16
+ punpcklqdq m16, m17, m19
+ punpckhqdq m17, m19
+ punpcklqdq m19, m18, m20
+ punpckhqdq m18, m20
+ vinserti32x8 m20, m21, ym2, 1
+ vshufi32x4 m21, m2, q3232
+ vinserti32x8 m2, m8, ym3, 1
+ vshufi32x4 m8, m3, q3232
+ vinserti32x8 m3, m1, ym5, 1
+ vshufi32x4 m1, m5, q3232
+ vinserti32x8 m5, m0, ym4, 1
+ vshufi32x4 m0, m4, q3232
+ vinserti32x8 m4, m6, ym16, 1
+ vshufi32x4 m6, m16, q3232
+ vinserti32x8 m16, m7, ym17, 1
+ vshufi32x4 m7, m17, q3232
+ vinserti32x8 m17, m15, ym19, 1
+ vshufi32x4 m15, m19, q3232
+ vinserti32x8 m19, m14, ym18, 1
+ vshufi32x4 m14, m18, q3232
+ vshufi32x4 m18, m21, m6, q3131 ; 27 5
+ vshufi32x4 m21, m6, q2020 ; 31 1
+ vshufi32x4 m6, m8, m7, q2020 ; 24 8
+ vshufi32x4 m8, m7, q3131 ; 30 2
+ vshufi32x4 m7, m1, m15, q2020 ; 28 4
+ vshufi32x4 m1, m15, q3131 ; 6 26
+ vshufi32x4 m15, m0, m14, q2020 ; 7 25
+ vshufi32x4 m0, m14, q3131 ; 14 18
+ vshufi32x4 m14, m20, m4, q2020 ; 3 29
+ vshufi32x4 m20, m4, q3131 ; 23 9
+ vshufi32x4 m9, m3, m17, q2020 ; 16 0
+ vshufi32x4 m3, m17, q3131 ; 12 20
+ vshufi32x4 m17, m5, m19, q2020 ; 15 17
+ vshufi32x4 m5, m19, q3131 ; 22 10
+ vshufi32x4 m19, m2, m16, q2020 ; 19 13
+ vshufi32x4 m16, m2, m16, q3131 ; 11 21
+ call m(idct_16x16_internal_8bpc).main3
+ call .main_oddhalf
+ jmp .pass2
+.fast: ; right half is zero
+ mova ym8, [cq+64*15]
+ vinserti32x8 m8, [cq+64* 1], 1
+ mova m2, [o(int16_perm)]
+ mova ym9, [cq+64* 8]
+ vinserti32x8 m9, [cq+64* 0], 1
+ mova ym0, [cq+64* 7]
+ vinserti32x8 m0, [cq+64* 9], 1
+ mova ym7, [cq+64*14]
+ vinserti32x8 m7, [cq+64* 2], 1
+ mova ym1, [cq+64* 3]
+ vinserti32x8 m1, [cq+64*13], 1
+ mova ym3, [cq+64* 6]
+ vinserti32x8 m3, [cq+64*10], 1
+ mova ym5, [cq+64*11]
+ vinserti32x8 m5, [cq+64* 5], 1
+ mova ym6, [cq+64*12]
+ vinserti32x8 m6, [cq+64* 4], 1
+ REPX {pmulhrsw x, m15}, m8, m9, m0, m7, m1, m3, m5, m6
+ REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
+ call m(idct_16x16_internal_8bpc).main2
+ vbroadcasti32x4 m8, [o(int_shuf3)]
+ vbroadcasti32x4 m9, [o(int_shuf4)]
+ vpbroadcastd m11, [o(pw_16384)]
+ pshufb m0, m8
+ pshufb m1, m9
+ pshufb m2, m8
+ pshufb m3, m9
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ pshufb m4, m8
+ pshufb m5, m9
+ pshufb m6, m8
+ pshufb m7, m9
+ REPX {pmulhrsw x, m11}, m4, m5, m6, m7
+ punpckhdq m17, m0, m1
+ punpckldq m0, m1
+ punpckhdq m16, m2, m3
+ punpckldq m2, m3
+ punpckhdq m18, m4, m5
+ punpckldq m4, m5
+ punpckhdq m5, m6, m7
+ punpckldq m6, m7
+ vinserti32x8 m1, m0, ym2, 1
+ vshufi32x4 m3, m0, m2, q3232
+ vinserti32x8 m2, m4, ym6, 1
+ vshufi32x4 m4, m6, q3232
+ vinserti32x8 m15, m17, ym16, 1
+ vshufi32x4 m17, m16, q3232
+ vinserti32x8 m16, m18, ym5, 1
+ vshufi32x4 m18, m5, q3232
+ vshufi32x4 m0, m1, m2, q2020 ; 0 2
+ vshufi32x4 m1, m2, q3131 ; 4 6
+ vshufi32x4 m2, m3, m4, q2020 ; 8 10
+ vshufi32x4 m3, m4, q3131 ; 12 14
+ vshufi32x4 m14, m15, m16, q2020 ; 1 3
+ vshufi32x4 m15, m16, q3131 ; 5 7
+ vshufi32x4 m16, m17, m18, q2020 ; 9 11
+ vshufi32x4 m17, m18, q3131 ; 13 15
+ pxor m6, m6
+ punpckhwd m8, m0, m0
+ punpcklwd m9, m6, m0
+ punpckhwd m0, m3, m3
+ punpckhwd m5, m2, m2
+ punpcklwd m7, m1, m1
+ punpckhwd m1, m1
+ punpcklwd m3, m3
+ punpcklwd m6, m2
+ call m(idct_16x16_internal_8bpc).main_fast5
+ punpcklwd m21, m14, m14
+ punpckhwd m14, m14
+ punpcklwd m18, m15, m15
+ punpckhwd m15, m15
+ punpcklwd m20, m16, m16
+ punpckhwd m16, m16
+ punpcklwd m19, m17, m17
+ punpckhwd m17, m17
+ call .main_oddhalf_fast
+.pass2:
+ vpbroadcastd m10, [o(pw_2048)]
+ mova m11, [o(end_16x32p)]
+ lea r3, [strideq*3]
+ pxor m13, m13
+ psrld m12, m11, 8
+ IDCT_16x32_END 0, 1, 0
+ IDCT_16x32_END 2, 3, 1
+ IDCT_16x32_END 4, 5, 2
+ IDCT_16x32_END 6, 7, 3
+ IDCT_16x32_END 14, 15, 4
+ IDCT_16x32_END 16, 17, 5
+ IDCT_16x32_END 18, 19, 6
+ IDCT_16x32_END 20, 21, 7
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ mov r3d, 32
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly
+ALIGN function_align
+.main_oddhalf_fast2: ; bottom three-quarters are zero
+ vpbroadcastd m8, [o(pw_201_4091x8)]
+ vpbroadcastd m20, [o(pw_m1380_3857x8)]
+ vpbroadcastd m9, [o(pw_995_3973x8)]
+ vpbroadcastd m16, [o(pw_m601_4052x8)]
+ pmulhrsw m21, m8 ; t16a, t31a
+ pmulhrsw m20, m15 ; t19a, t28a
+ pmulhrsw m18, m9 ; t20a, t27a
+ pmulhrsw m14, m16 ; t23a, t24a
+ mova m8, m21
+ mova m17, m20
+ mova m15, m18
+ mova m16, m14
+ jmp .main3
+ALIGN function_align
+.main_oddhalf_fast: ; bottom half is zero
+ vpbroadcastd m8, [o(pw_201_4091x8)]
+ vpbroadcastd m9, [o(pw_m2751_3035x8)]
+ vpbroadcastd m11, [o(pw_1751_3703x8)]
+ vpbroadcastd m12, [o(pw_m1380_3857x8)]
+ pmulhrsw m21, m8 ; t16a, t31a
+ vpbroadcastd m8, [o(pw_995_3973x8)]
+ pmulhrsw m17, m9 ; t17a, t30a
+ vpbroadcastd m9, [o(pw_m2106_3513x8)]
+ pmulhrsw m20, m11 ; t18a, t29a
+ vpbroadcastd m11, [o(pw_2440_3290x8)]
+ pmulhrsw m15, m12 ; t19a, t28a
+ vpbroadcastd m12, [o(pw_m601_4052x8)]
+ pmulhrsw m18, m8 ; t20a, t27a
+ pmulhrsw m16, m9 ; t21a, t26a
+ pmulhrsw m19, m11 ; t22a, t25a
+ pmulhrsw m14, m12 ; t23a, t24a
+ jmp .main2
+ALIGN function_align
+.main_oddhalf:
+ ITX_MUL2X_PACK 21, 8, 9, 10, 201, 4091, 5 ; t16a, t31a
+ ITX_MUL2X_PACK 17, 8, 9, 10, 3035, 2751, 5 ; t17a, t30a
+ ITX_MUL2X_PACK 20, 8, 9, 10, 1751, 3703, 5 ; t18a, t29a
+ ITX_MUL2X_PACK 15, 8, 9, 10, 3857, 1380, 5 ; t19a, t28a
+ ITX_MUL2X_PACK 18, 8, 9, 10, 995, 3973, 5 ; t20a, t27a
+ ITX_MUL2X_PACK 16, 8, 9, 10, 3513, 2106, 5 ; t21a, t26a
+ ITX_MUL2X_PACK 19, 8, 9, 10, 2440, 3290, 5 ; t22a, t25a
+ ITX_MUL2X_PACK 14, 8, 9, 10, 4052, 601, 5 ; t23a, t24a
+.main2:
+ psubsw m8, m21, m17 ; t17 t30
+ paddsw m21, m17 ; t16 t31
+ psubsw m17, m15, m20 ; t18 t29
+ paddsw m20, m15 ; t19 t28
+ psubsw m15, m18, m16 ; t21 t26
+ paddsw m18, m16 ; t20 t27
+ psubsw m16, m14, m19 ; t22 t25
+ paddsw m14, m19 ; t23 t24
+.main3:
+ ITX_MUL2X_PACK 8, 9, 19, 10, 799, 4017, 5 ; t17a t30a
+ ITX_MUL2X_PACK 17, 9, 19, 10, m4017, 799, 5 ; t18a t29a
+ ITX_MUL2X_PACK 15, 9, 19, 10, 3406, 2276, 5 ; t21a t26a
+ ITX_MUL2X_PACK 16, 9, 19, 10, m2276, 3406, 5 ; t22a t25a
+ vpbroadcastd m11, [o(pw_m3784_1567)]
+ psubsw m19, m21, m20 ; t19a t28a
+ paddsw m21, m20 ; t16a t31a
+ psubsw m20, m14, m18 ; t20a t27a
+ paddsw m14, m18 ; t23a t24a
+ psubsw m18, m8, m17 ; t18 t29
+ paddsw m8, m17 ; t17 t30
+ psubsw m17, m16, m15 ; t21 t26
+ paddsw m15, m16 ; t22 t25
+ ITX_MUL2X_PACK 18, 9, 16, 10, 1567_3784, 11, 20 ; t18a t29a
+ ITX_MUL2X_PACK 19, 9, 16, 10, 1567_3784, 11, 20 ; t19 t28
+ ITX_MUL2X_PACK 20, 9, 16, 10, 11, m1567_m3784, 36 ; t20 t27
+ ITX_MUL2X_PACK 17, 9, 16, 10, 11, m1567_m3784, 36 ; t21a t26a
+ vbroadcasti32x4 m9, [o(deint_shuf)]
+ psubsw m16, m21, m14 ; t23 t24
+ paddsw m14, m21 ; t16 t31
+ psubsw m21, m8, m15 ; t22a t25a
+ paddsw m15, m8 ; t17a t30a
+ psubsw m8, m18, m17 ; t21 t26
+ paddsw m18, m17 ; t18 t29
+ paddsw m17, m19, m20 ; t19a t28a
+ psubsw m19, m20 ; t20a t27a
+ vpbroadcastd m11, [o(pw_m2896_2896)]
+ vpbroadcastd m12, [o(pw_2896_2896)]
+ REPX {pshufb x, m9}, m14, m15, m18, m17
+ mova m9, m10
+ vpdpwssd m9, m16, m11
+ mova m20, m10
+ vpdpwssd m20, m21, m11
+ psrad m9, 12
+ psrad m20, 12
+ packssdw m9, m20 ; t23a t22
+ mova m20, m10
+ vpdpwssd m20, m16, m12
+ mova m16, m10
+ vpdpwssd m16, m21, m12
+ psrad m20, 12
+ psrad m16, 12
+ packssdw m16, m20, m16 ; t24a t25
+ ITX_MUL2X_PACK 8, 21, 20, 10, 11, 12, 8 ; t21a t26a
+ ITX_MUL2X_PACK 19, 8, 11, 10, 11, 12, 8 ; t20 t27
+ packssdw m11, m20 ; t27 t26a
+ packssdw m8, m21 ; t20 t21a
+ punpcklqdq m20, m14, m15 ; t16 t17a
+ punpckhqdq m14, m15 ; t31 t30a
+ punpckhqdq m15, m17, m18 ; t28a t29
+ punpcklqdq m17, m18 ; t19a t18
+ psubsw m21, m0, m14 ; out31 out30
+ paddsw m0, m14 ; out0 out1
+ psubsw m14, m7, m20 ; out16 out17
+ paddsw m7, m20 ; out15 out14
+ psubsw m20, m1, m15 ; out28 out29
+ paddsw m1, m15 ; out3 out2
+ psubsw m15, m6, m17 ; out19 out18
+ paddsw m6, m17 ; out12 out13
+ psubsw m17, m4, m9 ; out23 out22
+ paddsw m4, m9 ; out8 out9
+ psubsw m18, m3, m16 ; out24 out25
+ paddsw m3, m16 ; out7 out6
+ psubsw m16, m5, m8 ; out20 out21
+ paddsw m5, m8 ; out11 out10
+ psubsw m19, m2, m11 ; out27 out26
+ paddsw m2, m11 ; out4 out5
+ ret
+
+cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ mova m21, [o(permB)]
+ vpermq m1, m21, [cq+64* 0] ; 0 1
+ vpermq m14, m21, [cq+64* 1] ; 2 3
+ vpermq m20, m21, [cq+64* 2] ; 4 5
+ vpermq m15, m21, [cq+64* 3] ; 6 7
+ vpbroadcastd m8, [o(pw_2896x8)]
+ vpermq m2, m21, [cq+64* 4] ; 8 9
+ vpermq m16, m21, [cq+64* 5] ; 10 11
+ vpermq m3, m21, [cq+64* 6] ; 12 13
+ vpermq m17, m21, [cq+64* 7] ; 14 15
+ REPX {pmulhrsw x, m8}, m1, m14, m20, m15, m2, m16, m3, m17
+ pxor m12, m12
+ REPX {mova [cq+64*x], m12}, 0, 1, 2, 3, 4, 5, 6, 7
+ cmp eobd, 151
+ jb .fast
+ vpermq m9, m21, [cq+64* 8] ; 16 17
+ vpermq m19, m21, [cq+64* 9] ; 18 19
+ vpermq m4, m21, [cq+64*10] ; 20 21
+ vpermq m5, m21, [cq+64*11] ; 22 23
+ vpermq m6, m21, [cq+64*12] ; 24 25
+ vpermq m18, m21, [cq+64*13] ; 26 27
+ vpermq m7, m21, [cq+64*14] ; 28 29
+ vpermq m21, m21, [cq+64*15] ; 30 31
+ REPX {pmulhrsw x, m8}, m9, m19, m4, m5, m6, m18, m7, m21
+ REPX {mova [cq+64*x], m12}, 8, 9, 10, 11, 12, 13, 14, 15
+ punpcklwd m8, m21, m14 ; 30 2
+ punpckhwd m21, m1 ; 31 1
+ punpcklwd m0, m17, m19 ; 14 18
+ punpckhwd m17, m9 ; 15 17
+ punpcklwd m9, m1 ; 16 0
+ punpckhwd m14, m7 ; 3 29
+ punpcklwd m1, m15, m18 ; 6 26
+ punpckhwd m15, m6 ; 7 25
+ punpcklwd m6, m2 ; 24 8
+ punpckhwd m19, m3 ; 19 13
+ punpcklwd m3, m4 ; 12 20
+ punpckhwd m18, m20 ; 27 5
+ punpcklwd m7, m20 ; 28 4
+ punpckhwd m20, m5, m2 ; 23 9
+ punpcklwd m5, m16 ; 22 10
+ punpckhwd m16, m4 ; 11 21
+ call m(idct_16x16_internal_8bpc).main2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
+ jmp .pass2
+.fast: ; bottom half zero
+ punpcklwd m8, m14, m14 ; 2
+ punpcklwd m0, m17, m17 ; 14
+ punpcklwd m5, m16, m16 ; 10
+ punpcklwd m9, m12, m1 ; __ 0
+ punpckhwd m21, m1, m1 ; 1
+ punpcklwd m1, m15, m15 ; 6
+ punpcklwd m7, m20, m20 ; 4
+ punpckhwd m19, m3, m3 ; 13
+ punpcklwd m3, m3 ; 12
+ punpcklwd m6, m12, m2 ; __ 8
+ punpckhwd m18, m20, m20 ; 5
+ punpckhwd m20, m2, m2 ; 9
+ call m(idct_16x16_internal_8bpc).main_fast
+ punpckhwd m15, m15 ; 7
+ punpckhwd m14, m14 ; 3
+ punpckhwd m16, m16 ; 11
+ punpckhwd m17, m17 ; 15
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+.pass2:
+ vpbroadcastd m9, [o(pw_16384)]
+ call .transpose_round
+ vshufi32x4 m16, m14, m2, q3131 ; 5
+ vshufi32x4 m14, m2, q2020 ; 1
+ vshufi32x4 m2, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m1, m18, q3131 ; 6
+ vshufi32x4 m1, m18, q2020 ; 2
+ vshufi32x4 m18, m20, m6, q2020 ; 9
+ vshufi32x4 m20, m6, q3131 ; 13
+ vshufi32x4 m6, m21, m4, q3131 ; 12
+ vshufi32x4 m4, m21, m4, q2020 ; 8
+ vshufi32x4 m21, m19, m7, q3131 ; 15
+ vshufi32x4 m19, m7, q2020 ; 11
+ vshufi32x4 m7, m5, m15, q3131 ; 14
+ vshufi32x4 m5, m15, q2020 ; 10
+ vshufi32x4 m15, m17, m9, q2020 ; 3
+ vshufi32x4 m17, m9, q3131 ; 7
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
+ call .main_oddhalf
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r2, [strideq*3]
+ pmovzxbw m8, [dstq+strideq*0]
+ pmovzxbw m9, [dstq+strideq*1]
+ pmovzxbw m10, [dstq+strideq*2]
+ pmovzxbw m11, [dstq+r2 ]
+ REPX {pmulhrsw x, m12}, m0, m1, m2, m3
+ lea r3, [dstq+strideq*4]
+ paddw m0, m8
+ paddw m1, m9
+ paddw m2, m10
+ paddw m3, m11
+ pmovzxbw m8, [r3+strideq*0]
+ pmovzxbw m9, [r3+strideq*1]
+ pmovzxbw m10, [r3+strideq*2]
+ pmovzxbw m11, [r3+r2 ]
+ REPX {pmulhrsw x, m12}, m4, m5, m6, m7
+ lea r4, [dstq+strideq*8]
+ packuswb m0, m1
+ paddw m4, m8
+ paddw m5, m9
+ packuswb m2, m3
+ paddw m6, m10
+ paddw m7, m11
+ pmovzxbw m8, [r4+strideq*0]
+ pmovzxbw m9, [r4+strideq*1]
+ pmovzxbw m10, [r4+strideq*2]
+ pmovzxbw m11, [r4+r2 ]
+ REPX {pmulhrsw x, m12}, m14, m15, m16, m17
+ lea r5, [r3+strideq*8]
+ packuswb m4, m5
+ paddw m14, m8
+ paddw m15, m9
+ packuswb m6, m7
+ paddw m16, m10
+ paddw m17, m11
+ pmovzxbw m8, [r5+strideq*0]
+ pmovzxbw m9, [r5+strideq*1]
+ pmovzxbw m10, [r5+strideq*2]
+ pmovzxbw m11, [r5+r2 ]
+ REPX {pmulhrsw x, m12}, m18, m19, m20, m21
+ packuswb m14, m15
+ paddw m18, m8
+ paddw m19, m9
+ packuswb m16, m17
+ paddw m20, m10
+ paddw m21, m11
+ packuswb m18, m19
+ packuswb m20, m21
+ REPX {vpermq x, m13, x}, m0, m2, m4, m6, m14, m16, m18, m20
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ mova [dstq+strideq*2], ym2
+ vextracti32x8 [dstq+r2 ], m2, 1
+ mova [r3+strideq*0], ym4
+ vextracti32x8 [r3+strideq*1], m4, 1
+ mova [r3+strideq*2], ym6
+ vextracti32x8 [r3+r2 ], m6, 1
+ mova [r4+strideq*0], ym14
+ vextracti32x8 [r4+strideq*1], m14, 1
+ mova [r4+strideq*2], ym16
+ vextracti32x8 [r4+r2 ], m16, 1
+ mova [r5+strideq*0], ym18
+ vextracti32x8 [r5+strideq*1], m18, 1
+ mova [r5+strideq*2], ym20
+ vextracti32x8 [r5+r2 ], m20, 1
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ imul r6d, 181
+ mov r3d, 16
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
+ALIGN function_align
+.main_oddhalf_fast2: ; bottom three-quarters are zero
+ vpbroadcastd m9, [o(pw_2896x8)]
+ vpbroadcastd m2, [o(pw_4017x8)]
+ vpbroadcastd m3, [o(pw_799x8)]
+ vpbroadcastd m18, [o(pw_4076x8)]
+ vpbroadcastd m19, [o(pw_401x8)]
+ vpbroadcastd m20, [o(pw_m1189x8)]
+ vpbroadcastd m16, [o(pw_3920x8)]
+ pmulhrsw m9, m0 ; t0
+ pmulhrsw m2, m1 ; t7a
+ pmulhrsw m1, m3 ; t4a
+ pmulhrsw m18, m14 ; t15a
+ pmulhrsw m14, m19 ; t8a
+ pmulhrsw m20, m15 ; t11a
+ pmulhrsw m15, m16 ; t12a
+ psubsw m7, m9, m2 ; idct8 out7
+ paddsw m0, m9, m2 ; idct8 out0
+ psubsw m4, m9, m1 ; idct8 out4
+ paddsw m3, m9, m1 ; idct8 out3
+ ITX_MULSUB_2W 2, 1, 5, 6, 10, 2896, 2896 ; t5, t6
+ mova m21, m18
+ mova m19, m14
+ mova m16, m15
+ mova m8, m20
+ psubsw m6, m9, m1 ; idct8 out6
+ paddsw m1, m9 ; idct8 out1
+ psubsw m5, m9, m2 ; idct8 out5
+ paddsw m2, m9 ; idct8 out2
+ jmp .main3
+ALIGN function_align
+.main_oddhalf_fast: ; bottom half is zero
+ vpbroadcastd m5, [o(pw_m2276x8)]
+ vpbroadcastd m11, [o(pw_3406x8)]
+ vpbroadcastd m7, [o(pw_4017x8)]
+ vpbroadcastd m12, [o(pw_799x8)]
+ vpbroadcastd m6, [o(pw_3784x8)]
+ vpbroadcastd m10, [o(pw_1567x8)]
+ vpbroadcastd m4, [o(pw_2896x8)]
+ pmulhrsw m5, m3 ; t5a
+ pmulhrsw m3, m11 ; t6a
+ pmulhrsw m7, m1 ; t7a
+ pmulhrsw m1, m12 ; t4a
+ pmulhrsw m6, m2 ; t3
+ pmulhrsw m2, m10 ; t2
+ pmulhrsw m4, m0 ; t0
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ vpbroadcastd m10, [o(pd_2048)]
+ mova m0, m4 ; t1
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main3
+ vpbroadcastd m21, [o(pw_4076x8)]
+ vpbroadcastd m8, [o(pw_401x8)]
+ vpbroadcastd m18, [o(pw_m2598x8)]
+ vpbroadcastd m9, [o(pw_3166x8)]
+ vpbroadcastd m19, [o(pw_3612x8)]
+ vpbroadcastd m11, [o(pw_1931x8)]
+ vpbroadcastd m20, [o(pw_m1189x8)]
+ vpbroadcastd m12, [o(pw_3920x8)]
+ pmulhrsw m21, m14 ; t15a
+ pmulhrsw m14, m8 ; t8a
+ pmulhrsw m18, m17 ; t9a
+ pmulhrsw m17, m9 ; t14a
+ pmulhrsw m19, m16 ; t13a
+ pmulhrsw m16, m11 ; t10a
+ pmulhrsw m20, m15 ; t11a
+ pmulhrsw m15, m12 ; t12a
+ jmp .main2
+ALIGN function_align
+.main_oddhalf:
+ ITX_MULSUB_2W 14, 21, 8, 9, 10, 401, 4076 ; t8a, t15a
+ ITX_MULSUB_2W 18, 17, 8, 9, 10, 3166, 2598 ; t9a, t14a
+ ITX_MULSUB_2W 16, 19, 8, 9, 10, 1931, 3612 ; t10a, t13a
+ ITX_MULSUB_2W 20, 15, 8, 9, 10, 3920, 1189 ; t11a, t12a
+.main2:
+ paddsw m8, m20, m16 ; t11
+ psubsw m20, m16 ; t10
+ paddsw m16, m15, m19 ; t12
+ psubsw m15, m19 ; t13
+ psubsw m19, m14, m18 ; t9
+ paddsw m14, m18 ; t8
+ psubsw m18, m21, m17 ; t14
+ paddsw m21, m17 ; t15
+.main3:
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ ITX_MULSUB_2W 18, 19, 9, 17, 10, 11, 12 ; t9a, t14a
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 15, 20, 9, 17, 10, 12, 11 ; t10a, t13a
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ psubsw m17, m14, m8 ; t11a
+ paddsw m8, m14 ; t8a
+ paddsw m14, m18, m15 ; t9
+ psubsw m18, m15 ; t10
+ psubsw m15, m19, m20 ; t13
+ paddsw m19, m20 ; t14
+ paddsw m20, m21, m16 ; t15a
+ psubsw m16, m21, m16 ; t12a
+ ITX_MULSUB_2W 15, 18, 9, 21, 10, 11, 12 ; t10a, t13a
+ ITX_MULSUB_2W 16, 17, 9, 21, 10, 11, 12 ; t11, t12
+ psubsw m21, m0, m20 ; out15
+ paddsw m0, m20 ; out0
+ psubsw m20, m1, m19 ; out14
+ paddsw m1, m19 ; out1
+ psubsw m19, m2, m18 ; out13
+ paddsw m2, m18 ; out2
+ psubsw m18, m3, m17 ; out12
+ paddsw m3, m17 ; out3
+ psubsw m17, m4, m16 ; out11
+ paddsw m4, m16 ; out4
+ psubsw m16, m5, m15 ; out10
+ paddsw m5, m15 ; out5
+ psubsw m15, m6, m14 ; out9
+ paddsw m6, m14 ; out6
+ psubsw m14, m7, m8 ; out8
+ paddsw m7, m8 ; out7
+ ret
+.transpose_round:
+ punpcklwd m8, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m1, m3
+ punpckhwd m1, m3
+ punpcklwd m3, m4, m6
+ punpckhwd m4, m6
+ punpcklwd m6, m5, m7
+ punpckhwd m5, m7
+ punpcklwd m7, m14, m16
+ punpckhwd m14, m16
+ punpcklwd m16, m15, m17
+ punpckhwd m15, m17
+ punpcklwd m17, m19, m21
+ punpckhwd m19, m21
+ punpckhwd m21, m18, m20
+ punpcklwd m18, m20
+ punpcklwd m20, m8, m1
+ punpckhwd m8, m1
+ punpcklwd m1, m0, m2
+ punpckhwd m0, m2
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ punpcklwd m5, m4, m6
+ punpckhwd m4, m6
+ REPX {pmulhrsw x, m9}, m20, m8, m1, m0
+ punpcklwd m6, m7, m15
+ punpckhwd m7, m15
+ punpcklwd m15, m14, m16
+ punpckhwd m14, m16
+ REPX {pmulhrsw x, m9}, m2, m3, m5, m4
+ punpckhwd m16, m18, m19
+ punpcklwd m18, m19
+ punpcklwd m19, m21, m17
+ punpckhwd m21, m17
+ REPX {pmulhrsw x, m9}, m6, m7, m15, m14
+ punpcklwd m17, m8, m0 ; a2 a6 aa ae
+ punpckhwd m8, m0 ; a3 a7 ab af
+ punpcklwd m0, m20, m1 ; a0 a4 a8 ac
+ punpckhwd m20, m1 ; a1 a5 a9 ad
+ REPX {pmulhrsw x, m9}, m16, m18, m19, m21
+ punpcklwd m1, m2, m5 ; b0 b4 b8 bc
+ punpckhwd m2, m5 ; b1 b5 b9 bd
+ punpcklwd m5, m3, m4 ; b2 b6 ba be
+ punpckhwd m3, m4 ; b3 b7 bb bf
+ punpcklwd m4, m6, m15 ; c0 c4 c8 cc
+ punpckhwd m6, m15 ; c1 c5 c9 cd
+ punpcklwd m15, m7, m14 ; c2 c6 ca ce
+ punpckhwd m7, m14 ; c3 c7 cb cf
+ punpcklwd m14, m18, m19 ; d0 d4 d8 dc
+ punpckhwd m18, m19 ; d1 d5 d9 dd
+ punpcklwd m9, m16, m21 ; d2 d6 da de
+ punpckhwd m16, m21 ; d3 d7 db df
+ vshufi32x4 m21, m0, m1, q3232 ; a8 ac b8 bc
+ vinserti32x8 m0, ym1, 1 ; a0 a4 b0 b4
+ vinserti32x8 m1, m17, ym5, 1 ; a2 a6 b2 b6
+ vshufi32x4 m5, m17, m5, q3232 ; aa ae ba be
+ vinserti32x8 m17, m8, ym3, 1 ; a3 a7 b3 b7
+ vshufi32x4 m19, m8, m3, q3232 ; ab af bb bf
+ vinserti32x8 m3, m4, ym14, 1 ; c0 c4 d0 d4
+ vshufi32x4 m4, m14, q3232 ; c8 cc d8 dc
+ vinserti32x8 m14, m20, ym2, 1 ; a1 a5 b1 b5
+ vshufi32x4 m20, m2, q3232 ; a9 ad b9 bd
+ vinserti32x8 m2, m6, ym18, 1 ; c1 c5 d1 d5
+ vshufi32x4 m6, m18, q3232 ; c9 cd d9 dd
+ vinserti32x8 m18, m15, ym9, 1 ; c2 c6 d2 d6
+ vshufi32x4 m15, m9, q3232 ; ca ce da de
+ vinserti32x8 m9, m7, ym16, 1 ; c3 c7 d3 d7
+ vshufi32x4 m7, m16, q3232 ; cb cf db df
+ ret
+
+%macro IDTX_16x32 4 ; src/dst[1-4]
+ pmulhrsw m%1, m15, [cq+64*%1]
+ pmulhrsw m%2, m15, [cq+64*%2]
+ pmulhrsw m%3, m15, [cq+64*%3]
+ pmulhrsw m%4, m15, [cq+64*%4]
+ pmulhrsw m18, m16, m%1
+ pmulhrsw m19, m16, m%2
+ pmulhrsw m20, m16, m%3
+ pmulhrsw m21, m16, m%4
+ REPX {pmulhrsw x, m17}, m18, m19, m20, m21
+ paddsw m%1, m18
+ paddsw m%2, m19
+ paddsw m%3, m20
+ paddsw m%4, m21
+%endmacro
+
+%macro IDTX_16x32_STORE 2 ; src[1-2]
+ mova xm17, [dstq+r3*0]
+ vinserti128 ym17, [dstq+r3*4], 1
+ vinserti32x4 m17, [dstq+r3*8], 2
+ vinserti32x4 m17, [dstq+r4*8], 3
+ mova [cq+64*(%1*2+0)], m18
+ mova [cq+64*(%1*2+1)], m18
+ punpcklbw m16, m17, m18
+ punpckhbw m17, m18
+ paddw m16, m%1
+ paddw m17, m%2
+ packuswb m16, m17
+ mova [dstq+r3*0], xm16
+ vextracti128 [dstq+r3*4], ym16, 1
+ vextracti32x4 [dstq+r3*8], m16, 2
+ vextracti32x4 [dstq+r4*8], m16, 3
+%if %1 != 7
+ add dstq, strideq
+%endif
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_16x32_8bpc, 3, 5, 22, dst, stride, c
+ vpbroadcastd m15, [pw_2896x8]
+ vpbroadcastd m16, [pw_1697x16]
+ vpbroadcastd m17, [pw_16384]
+ IDTX_16x32 0, 1, 2, 3
+ IDTX_16x32 4, 5, 6, 7
+ IDTX_16x32 8, 9, 10, 11
+ IDTX_16x32 12, 13, 14, 15
+ vpbroadcastd m16, [pw_8192]
+ call .transpose_2x8x8_round
+ lea r3, [strideq*2]
+ lea r4, [strideq*3]
+ pxor m18, m18
+ IDTX_16x32_STORE 0, 8
+ IDTX_16x32_STORE 1, 9
+ IDTX_16x32_STORE 2, 10
+ IDTX_16x32_STORE 3, 11
+ IDTX_16x32_STORE 4, 12
+ IDTX_16x32_STORE 5, 13
+ IDTX_16x32_STORE 6, 14
+ IDTX_16x32_STORE 7, 15
+ RET
+ALIGN function_align
+.transpose_2x8x8_round:
+ punpckhwd m17, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m17, m1
+ punpckhdq m17, m1
+ REPX {pmulhrsw x, m16}, m0, m2, m3, m4, m5, m7, m6, m17
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m17
+ punpcklqdq m6, m17
+ punpckhwd m17, m12, m13
+ punpcklwd m12, m13
+ punpckhwd m13, m8, m9
+ punpcklwd m8, m9
+ punpckhwd m9, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m10, m11
+ punpcklwd m10, m11
+ punpckhdq m11, m8, m10
+ punpckldq m8, m10
+ punpckldq m10, m12, m14
+ punpckhdq m12, m14
+ punpckhdq m14, m13, m15
+ punpckldq m13, m15
+ punpckldq m15, m17, m9
+ punpckhdq m17, m9
+ REPX {pmulhrsw x, m16}, m8, m10, m11, m12, m13, m15, m14, m17
+ punpckhqdq m9, m8, m10
+ punpcklqdq m8, m10
+ punpcklqdq m10, m11, m12
+ punpckhqdq m11, m12
+ punpcklqdq m12, m13, m15
+ punpckhqdq m13, m15
+ punpckhqdq m15, m14, m17
+ punpcklqdq m14, m17
+ ret
+
+%macro IDTX_32x16 4 ; dst[1-4]
+ pmulhrsw m%2, m12, [cq+32*(%1+ 0)]
+ pmulhrsw m18, m12, [cq+32*(%1+16)]
+ pmulhrsw m%4, m12, [cq+32*(%3+ 0)]
+ pmulhrsw m19, m12, [cq+32*(%3+16)]
+ REPX {paddsw x, x}, m%2, m18, m%4, m19
+ mova m%1, m14
+ vpermi2q m%1, m%2, m18
+ vpermt2q m%2, m16, m18
+%if %3 != 14
+ mova m%3, m14
+%endif
+ vpermi2q m%3, m%4, m19
+ vpermt2q m%4, m16, m19
+ pmulhrsw m18, m17, m%1
+ pmulhrsw m19, m17, m%2
+ pmulhrsw m20, m17, m%3
+ pmulhrsw m21, m17, m%4
+ REPX {paddsw x, x}, m%1, m%2, m%3, m%4
+ paddsw m%1, m18
+ paddsw m%2, m19
+ paddsw m%3, m20
+ paddsw m%4, m21
+%endmacro
+
+%macro IDTX_32x16_STORE 2-3 0 ; src[1-2], 32x32
+ mova ym19, [dstq+strideq*0]
+ vinserti32x8 m19, [dstq+strideq*8], 1
+%if %3 == 0
+ mova [cq+64*(%1*2+0)], m20
+ mova [cq+64*(%1*2+1)], m20
+%endif
+ punpcklbw m18, m19, m20
+ punpckhbw m19, m20
+ paddw m18, m%1
+ paddw m19, m%2
+ packuswb m18, m19
+ mova [dstq+strideq*0], ym18
+ vextracti32x8 [dstq+strideq*8], m18, 1
+%if %3 || %1 != 7
+ add dstq, strideq
+%endif
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_32x16_8bpc, 3, 3, 22, dst, stride, c
+ vpbroadcastd m12, [pw_2896x8]
+ movu m14, [permB+7]
+ vpbroadcastd m17, [pw_1697x16]
+ psrlq m16, m14, 4
+ IDTX_32x16 0, 1, 2, 3
+ IDTX_32x16 4, 5, 6, 7
+ IDTX_32x16 8, 9, 10, 11
+ IDTX_32x16 12, 13, 14, 15
+ vpbroadcastd m16, [pw_2048]
+ call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
+ pxor m20, m20
+ IDTX_32x16_STORE 0, 8
+ IDTX_32x16_STORE 1, 9
+ IDTX_32x16_STORE 2, 10
+ IDTX_32x16_STORE 3, 11
+ IDTX_32x16_STORE 4, 12
+ IDTX_32x16_STORE 5, 13
+ IDTX_32x16_STORE 6, 14
+ IDTX_32x16_STORE 7, 15
+ RET
+
+%macro IDCT_32x32_END 4 ; src, mem, stride[1-2]
+ pmovzxbw m10, [dstq+%3]
+ pmovzxbw m11, [r3 +%4]
+%if %2 < 8
+ paddsw m8, m%2, m%1
+ psubsw m9, m%2, m%1
+%else
+ mova m9, [cq+64*(%2*2-16)]
+ paddsw m8, m9, m%1
+ psubsw m9, m%1
+%endif
+ pmulhrsw m8, m12
+ pmulhrsw m9, m12
+%if %2 >= 8
+%if %2 == 8
+ pxor m0, m0
+%endif
+ mova [cq+64*(%2*2-16)], m0
+ mova [cq+64*(%2*2-15)], m0
+%endif
+ paddw m8, m10
+ paddw m9, m11
+ packuswb m8, m9
+ vpermq m8, m13, m8
+ mova [dstq+%3], ym8
+ vextracti32x8 [r3 +%4], m8, 1
+%if %2 == 3 || %2 == 7 || %2 == 11
+ add dstq, r5
+ sub r3, r5
+%endif
+%endmacro
+
+cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ WIN64_SPILL_XMM 30
+ cmp eobd, 136
+ jb .fast
+ mova m5, [cq+64*20]
+ mova m3, [cq+64*12]
+ mova m1, [cq+64* 4]
+ mova m7, [cq+64*28]
+ mova m2, [cq+64* 8]
+ mova m6, [cq+64*24]
+ mova m0, [cq+64* 0]
+ mova m4, [cq+64*16]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m14, [cq+64* 2]
+ mova m21, [cq+64*30]
+ mova m18, [cq+64*18]
+ mova m17, [cq+64*14]
+ mova m16, [cq+64*10]
+ mova m19, [cq+64*22]
+ mova m20, [cq+64*26]
+ mova m15, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m22, [cq+64* 1]
+ mova m21, [cq+64*31]
+ mova m14, [cq+64*17]
+ mova m29, [cq+64*15]
+ mova m26, [cq+64* 9]
+ mova m17, [cq+64*23]
+ mova m18, [cq+64*25]
+ mova m25, [cq+64* 7]
+ mova m24, [cq+64* 5]
+ mova m19, [cq+64*27]
+ mova m16, [cq+64*21]
+ mova m27, [cq+64*11]
+ mova m28, [cq+64*13]
+ mova m15, [cq+64*19]
+ mova m20, [cq+64*29]
+ mova m23, [cq+64* 3]
+ call .main_oddhalf
+ vpbroadcastd m10, [o(pw_8192)]
+ psubsw m13, m0, m29 ; 31
+ paddsw m0, m29 ; 0
+ psubsw m29, m1, m28 ; 30
+ paddsw m1, m28 ; 1
+ psubsw m28, m2, m27 ; 29
+ paddsw m2, m27 ; 2
+ psubsw m27, m3, m26 ; 28
+ paddsw m3, m26 ; 3
+ psubsw m26, m4, m25 ; 27
+ paddsw m4, m25 ; 4
+ psubsw m25, m5, m24 ; 26
+ paddsw m5, m24 ; 5
+ psubsw m24, m6, m23 ; 25
+ paddsw m6, m23 ; 6
+ psubsw m23, m7, m22 ; 24
+ paddsw m7, m22 ; 7
+ pxor m9, m9
+ punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
+ REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
+ punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3
+ REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
+ punpckhwd m3, m23, m24
+ punpcklwd m23, m24
+ punpckhwd m24, m25, m26
+ punpcklwd m25, m26
+ REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
+ punpckhwd m26, m27, m28
+ punpcklwd m27, m28
+ punpckhwd m28, m29, m13
+ punpcklwd m29, m13
+ REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
+ punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5
+ REPX {pmulhrsw x, m10}, m0, m4, m8, m22
+ punpckhdq m13, m23, m25
+ punpckldq m23, m25
+ punpckhdq m25, m27, m29
+ punpckldq m27, m29
+ REPX {pmulhrsw x, m10}, m13, m23, m25, m27
+ punpckhdq m9, m3, m24
+ punpckldq m3, m24
+ punpckhdq m24, m26, m28
+ punpckldq m26, m28
+ punpcklqdq m5, m23, m27 ; d00 d08 d16 d24
+ punpckhqdq m23, m27 ; d01 d09 d17 d25
+ punpckhqdq m27, m13, m25 ; d03 d11 d19 d27
+ punpcklqdq m13, m25 ; d02 d10 d18 d26
+ punpckhqdq m25, m3, m26 ; d05 d13 d21 d29
+ punpcklqdq m3, m26 ; d04 d12 d20 d28
+ punpckhqdq m26, m9, m24 ; d07 d15 d23 d31
+ punpcklqdq m9, m24 ; d06 d14 d22 d30
+ REPX {pmulhrsw x, m10}, m25, m3, m26
+ mova [cq+64* 9], m23
+ mova [cq+64*11], m27
+ mova [cq+64*13], m25
+ mova [cq+64*15], m26
+ punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
+ punpcklqdq m8, m22 ; a04 a12 a20 a28
+ punpckhqdq m22, m0, m4 ; a01 a09 a17 a25
+ punpcklqdq m0, m4 ; a00 a08 a16 a24
+ punpckhqdq m23, m7, m2 ; a03 a11 a19 a27
+ punpcklqdq m7, m2 ; a02 a10 a18 a26
+ punpckhqdq m25, m6, m1 ; a07 a15 a23 a31
+ punpcklqdq m6, m1 ; a06 a14 a22 a30
+ mova m2, [cq+64* 0]
+ mova m11, [cq+64* 2]
+ mova m12, [cq+64* 4]
+ mova m29, [cq+64* 6]
+ mova m27, [cq+64* 8]
+ mova m26, [cq+64*10]
+ mova m4, [cq+64*12]
+ mova m28, [cq+64*14]
+ psubsw m1, m2, m21 ; 23
+ paddsw m2, m21 ; 8
+ psubsw m21, m11, m20 ; 22
+ paddsw m11, m20 ; 9
+ psubsw m20, m12, m19 ; 21
+ paddsw m12, m19 ; 10
+ psubsw m19, m29, m18 ; 20
+ paddsw m29, m18 ; 11
+ psubsw m18, m27, m17 ; 19
+ paddsw m27, m17 ; 12
+ psubsw m17, m26, m16 ; 18
+ paddsw m26, m16 ; 13
+ paddsw m16, m4, m15 ; 14
+ psubsw m4, m15 ; 17
+ pmulhrsw m15, m6, m10
+ psubsw m6, m28, m14 ; 16
+ paddsw m28, m14 ; 15
+ pmulhrsw m14, m7, m10
+ punpcklwd m7, m6, m4
+ punpckhwd m6, m4
+ punpckhwd m4, m17, m18
+ punpcklwd m17, m18
+ punpckhwd m18, m19, m20
+ punpcklwd m19, m20
+ punpckhwd m20, m21, m1
+ punpcklwd m21, m1
+ punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7
+ punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3
+ punpckhwd m11, m12, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
+ punpcklwd m12, m29 ; k0 l0 k1 l1 k2 l2 k3 l3
+ punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
+ punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3
+ punpckhwd m26, m16, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
+ punpcklwd m16, m28 ; o0 p0 o1 p1 o2 p2 o3 p3
+ pmulhrsw m23, m10
+ pmulhrsw m25, m10
+ punpckhdq m28, m2, m12 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m2, m12 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m12, m27, m16 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpckldq m27, m16 ; m0 n0 o0 p0 m1 n1 o1 p1
+ REPX {pmulhrsw x, m10}, m28, m2, m12, m27
+ punpckhdq m16, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7
+ punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5
+ punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
+ punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5
+ REPX {pmulhrsw x, m10}, m16, m1, m11, m29
+ punpckhdq m26, m19, m21
+ punpckldq m19, m21
+ punpckhdq m21, m6, m4
+ punpckldq m6, m4
+ REPX {pmulhrsw x, m10}, m26, m19, m21, m6
+ punpckhdq m4, m18, m20
+ punpckldq m18, m20
+ punpckhdq m20, m7, m17
+ punpckldq m7, m17
+ REPX {pmulhrsw x, m10}, m4, m18, m20, m7
+ punpcklqdq m17, m28, m12 ; b02 b10 b18 b26
+ punpckhqdq m28, m12 ; b03 b11 b19 b27
+ punpckhqdq m12, m2, m27 ; b01 b09 b17 b25
+ punpcklqdq m2, m27 ; b00 b08 b16 b24
+ punpckhqdq m27, m1, m29 ; b05 b13 b21 b29
+ punpcklqdq m1, m29 ; b04 b12 b20 b28
+ punpckhqdq m29, m16, m11 ; b07 b15 b23 b31
+ punpcklqdq m16, m11 ; b06 b14 b22 b30
+ mova [cq+64* 1], m12
+ mova [cq+64* 3], m28
+ mova [cq+64* 5], m27
+ mova [cq+64* 7], m29
+ punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
+ punpcklqdq m20, m26 ; c02 c10 c18 c26
+ punpckhqdq m26, m7, m19 ; c01 c09 c17 c25
+ punpcklqdq m7, m19 ; c00 c08 c16 c24
+ punpckhqdq m28, m6, m18 ; c05 c13 c21 c29
+ punpcklqdq m6, m18 ; c04 c12 c20 c28
+ punpckhqdq m29, m21, m4 ; c07 c15 c23 c31
+ punpcklqdq m21, m4 ; c06 c14 c22 c30
+ pmulhrsw m19, m9, m10
+ vshufi32x4 m4, m0, m2, q3232 ; a16 a24 b16 b24
+ vinserti32x8 m0, ym2, 1 ; a00 a08 b00 b08
+ vshufi32x4 m2, m7, m5, q3232 ; c16 c24 d16 d24
+ vinserti32x8 m7, ym5, 1 ; c00 c08 d00 d08
+ vshufi32x4 m5, m8, m1, q3232 ; a20 a28 b20 b28
+ vinserti32x8 m1, m8, ym1, 1 ; a04 a12 b04 b12
+ vshufi32x4 m8, m6, m3, q3232 ; c20 c28 d20 d28
+ vinserti32x8 m6, ym3, 1 ; c04 c12 d04 d12
+ vshufi32x4 m3, m1, m6, q3131 ; 12
+ vshufi32x4 m1, m6, q2020 ; 4
+ vshufi32x4 m6, m4, m2, q3131 ; 24
+ vshufi32x4 m4, m2, q2020 ; 16
+ vshufi32x4 m2, m0, m7, q3131 ; 8
+ vshufi32x4 m0, m7, q2020 ; 0
+ vshufi32x4 m7, m5, m8, q3131 ; 28
+ vshufi32x4 m5, m8, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m18, m14, m17, q3232 ; a18 a26 b18 b26
+ vinserti32x8 m14, ym17, 1 ; a02 a10 b02 b10
+ vshufi32x4 m17, m20, m13, q3232 ; c18 c26 d18 d26
+ vinserti32x8 m20, ym13, 1 ; c02 c10 d02 d10
+ vshufi32x4 m13, m21, m19, q3232 ; c22 c30 d22 d30
+ vinserti32x8 m21, ym19, 1 ; c06 c14 d06 d14
+ vshufi32x4 m19, m15, m16, q3232 ; a22 a30 b22 b30
+ vinserti32x8 m15, ym16, 1 ; a06 a14 b06 b14
+ vshufi32x4 m16, m14, m20, q3131 ; 10
+ vshufi32x4 m14, m20, q2020 ; 2
+ vshufi32x4 m20, m18, m17, q3131 ; 26
+ vshufi32x4 m18, m17, q2020 ; 18
+ vshufi32x4 m17, m15, m21, q3131 ; 14
+ vshufi32x4 m15, m21, q2020 ; 6
+ vshufi32x4 m21, m19, m13, q3131 ; 30
+ vshufi32x4 m19, m13, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m15, [cq+64* 1]
+ mova m16, [cq+64* 3]
+ mova m17, [cq+64* 5]
+ mova m19, [cq+64* 7]
+ mova m20, [cq+64* 9]
+ mova m21, [cq+64*11]
+ mova m13, [cq+64*13]
+ mova m18, [cq+64*15]
+ vshufi32x4 m14, m22, m15, q3232 ; a17 a25 b17 b25
+ vinserti32x8 m22, ym15, 1 ; a01 a09 b01 b09
+ vshufi32x4 m15, m23, m16, q3232 ; a19 a27 b19 b27
+ vinserti32x8 m23, ym16, 1 ; a03 a11 b03 b11
+ vshufi32x4 m16, m24, m17, q3232 ; a21 a29 b21 b29
+ vinserti32x8 m24, ym17, 1 ; a05 a13 b05 b13
+ vshufi32x4 m17, m25, m19, q3232 ; a23 a31 b23 b31
+ vinserti32x8 m25, ym19, 1 ; a07 a15 b07 b15
+ vinserti32x8 m8, m26, ym20, 1 ; c01 c09 d01 d09
+ vshufi32x4 m26, m20, q3232 ; c17 c25 d17 d25
+ vinserti32x8 m9, m27, ym21, 1 ; c03 c11 d03 d11
+ vshufi32x4 m27, m21, q3232 ; c19 c27 d19 d27
+ vinserti32x8 m11, m28, ym13, 1 ; c05 c13 d05 d13
+ vshufi32x4 m28, m13, q3232 ; c21 c29 d21 d29
+ vinserti32x8 m12, m29, ym18, 1 ; c07 c15 d07 d15
+ vshufi32x4 m29, m18, q3232 ; c23 c31 d23 d31
+ vshufi32x4 m18, m14, m26, q3131 ; 25
+ vshufi32x4 m14, m26, q2020 ; 17
+ vshufi32x4 m19, m15, m27, q3131 ; 27
+ vshufi32x4 m15, m27, q2020 ; 19
+ vshufi32x4 m20, m16, m28, q3131 ; 29
+ vshufi32x4 m16, m28, q2020 ; 21
+ vshufi32x4 m21, m17, m29, q3131 ; 31
+ vshufi32x4 m17, m29, q2020 ; 23
+ vshufi32x4 m26, m22, m8, q3131 ; 9
+ vshufi32x4 m22, m8, q2020 ; 1
+ vshufi32x4 m27, m23, m9, q3131 ; 11
+ vshufi32x4 m23, m9, q2020 ; 3
+ vshufi32x4 m28, m24, m11, q3131 ; 13
+ vshufi32x4 m24, m11, q2020 ; 5
+ vshufi32x4 m29, m25, m12, q3131 ; 15
+ vshufi32x4 m25, m12, q2020 ; 7
+ call .main_oddhalf
+ jmp .end
+.fast: ; bottom/right halves are zero
+ mova m14, [o(dup16_perm)]
+ pmovzxwd m9, [cq+64* 0]
+ pmovzxwd m6, [cq+64* 8]
+ vpermb m8, m14, [cq+64* 2]
+ vpermb ym0, ym14, [cq+64*14]
+ vpermb ym5, ym14, [cq+64*10]
+ vpermb m1, m14, [cq+64* 6]
+ vpermb m7, m14, [cq+64* 4]
+ vpermb ym3, ym14, [cq+64*12]
+ pslld m9, 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpermb m21, m14, [cq+64* 1]
+ vpermb ym17, ym14, [cq+64*15]
+ vpermb ym20, ym14, [cq+64* 9]
+ vpermb m15, m14, [cq+64* 7]
+ vpermb m18, m14, [cq+64* 5]
+ vpermb ym16, ym14, [cq+64*11]
+ vpermb ym19, ym14, [cq+64*13]
+ vpermb m14, m14, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m9, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
+ vshufi32x4 m22, m14, m2, q2020 ; 1
+ vshufi32x4 m24, m14, m2, q3131 ; 5
+ vshufi32x4 m23, m17, m9, q2020 ; 3
+ vshufi32x4 m25, m17, m9, q3131 ; 7
+ vshufi32x4 m16, m5, m15, q2020 ; 10
+ vshufi32x4 m17, m5, m15, q3131 ; 14
+ vshufi32x4 m14, m1, m18, q2020 ; 2
+ vshufi32x4 m15, m1, m18, q3131 ; 6
+ vshufi32x4 m1, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m21, m4, q3131 ; 12
+ vshufi32x4 m2, m21, m4, q2020 ; 8
+ vshufi32x4 m26, m20, m6, q2020 ; 9
+ vshufi32x4 m28, m20, m6, q3131 ; 13
+ vshufi32x4 m27, m19, m7, q2020 ; 11
+ vshufi32x4 m29, m19, m7, q3131 ; 15
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ call .main_oddhalf_fast
+.end:
+ lea r4, [strideq*3]
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r3, [dstq+r4*8]
+ lea r5, [strideq+r4] ; stride*4
+ add r3, r5 ; dst+stride*28
+ IDCT_32x32_END 29, 0, strideq*0, r4
+ IDCT_32x32_END 28, 1, strideq*1, strideq*2
+ IDCT_32x32_END 27, 2, strideq*2, strideq*1
+ IDCT_32x32_END 26, 3, r4 , strideq*0
+ IDCT_32x32_END 25, 4, strideq*0, r4
+ IDCT_32x32_END 24, 5, strideq*1, strideq*2
+ IDCT_32x32_END 23, 6, strideq*2, strideq*1
+ IDCT_32x32_END 22, 7, r4 , strideq*0
+ IDCT_32x32_END 21, 8, strideq*0, r4
+ IDCT_32x32_END 20, 9, strideq*1, strideq*2
+ IDCT_32x32_END 19, 10, strideq*2, strideq*1
+ IDCT_32x32_END 18, 11, r4 , strideq*0
+ IDCT_32x32_END 17, 12, strideq*0, r4
+ IDCT_32x32_END 16, 13, strideq*1, strideq*2
+ IDCT_32x32_END 15, 14, strideq*2, strideq*1
+ IDCT_32x32_END 14, 15, r4 , strideq*0
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ mov r3d, 32
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly2
+ALIGN function_align
+.main_oddhalf_fast2: ; bottom three-quarters are zero
+ vpbroadcastd m21, [o(pw_4091x8)]
+ vpbroadcastd m8, [o(pw_201x8)]
+ vpbroadcastd m18, [o(pw_m1380x8)]
+ vpbroadcastd m9, [o(pw_3857x8)]
+ vpbroadcastd m19, [o(pw_3973x8)]
+ vpbroadcastd m11, [o(pw_995x8)]
+ vpbroadcastd m28, [o(pw_m601x8)]
+ vpbroadcastd m12, [o(pw_4052x8)]
+ pmulhrsw m21, m22 ; t31a
+ pmulhrsw m22, m8 ; t16a
+ pmulhrsw m18, m25 ; t19a
+ pmulhrsw m25, m9 ; t28a
+ pmulhrsw m19, m24 ; t27a
+ pmulhrsw m24, m11 ; t20a
+ pmulhrsw m28, m23 ; t23a
+ pmulhrsw m23, m12 ; t24a
+ mova m15, m21
+ mova m8, m22
+ mova m14, m18
+ mova m27, m25
+ mova m29, m19
+ mova m26, m24
+ mova m16, m28
+ mova m20, m23
+ jmp .main3
+ALIGN function_align
+.main_oddhalf_fast: ; bottom half is zero
+ vpbroadcastd m21, [o(pw_4091x8)]
+ vpbroadcastd m8, [o(pw_201x8)]
+ vpbroadcastd m14, [o(pw_m2751x8)]
+ vpbroadcastd m9, [o(pw_3035x8)]
+ vpbroadcastd m17, [o(pw_3703x8)]
+ vpbroadcastd m11, [o(pw_1751x8)]
+ vpbroadcastd m18, [o(pw_m1380x8)]
+ vpbroadcastd m12, [o(pw_3857x8)]
+ pmulhrsw m21, m22 ; t31a
+ vpbroadcastd m19, [o(pw_3973x8)]
+ pmulhrsw m22, m8 ; t16a
+ vpbroadcastd m8, [o(pw_995x8)]
+ pmulhrsw m14, m29 ; t30a
+ vpbroadcastd m16, [o(pw_m2106x8)]
+ pmulhrsw m29, m9 ; t17a
+ vpbroadcastd m9, [o(pw_3513x8)]
+ pmulhrsw m17, m26 ; t29a
+ vpbroadcastd m15, [o(pw_3290x8)]
+ pmulhrsw m26, m11 ; t18a
+ vpbroadcastd m11, [o(pw_2440x8)]
+ pmulhrsw m18, m25 ; t19a
+ vpbroadcastd m20, [o(pw_m601x8)]
+ pmulhrsw m25, m12 ; t28a
+ vpbroadcastd m12, [o(pw_4052x8)]
+ pmulhrsw m19, m24 ; t27a
+ pmulhrsw m24, m8 ; t20a
+ pmulhrsw m16, m27 ; t21a
+ pmulhrsw m27, m9 ; t26a
+ pmulhrsw m15, m28 ; t25a
+ pmulhrsw m28, m11 ; t22a
+ pmulhrsw m20, m23 ; t23a
+ pmulhrsw m23, m12 ; t24a
+ jmp .main2
+ALIGN function_align
+.main_oddhalf:
+ ITX_MULSUB_2W 22, 21, 8, 9, 10, 201, 4091 ; t16a, t31a
+ ITX_MULSUB_2W 14, 29, 8, 9, 10, 3035, 2751 ; t17a, t30a
+ ITX_MULSUB_2W 26, 17, 8, 9, 10, 1751, 3703 ; t18a, t29a
+ ITX_MULSUB_2W 18, 25, 8, 9, 10, 3857, 1380 ; t19a, t28a
+ ITX_MULSUB_2W 24, 19, 8, 9, 10, 995, 3973 ; t20a, t27a
+ ITX_MULSUB_2W 16, 27, 8, 9, 10, 3513, 2106 ; t21a, t26a
+ ITX_MULSUB_2W 28, 15, 8, 9, 10, 2440, 3290 ; t22a, t25a
+ ITX_MULSUB_2W 20, 23, 8, 9, 10, 4052, 601 ; t23a, t24a
+.main2:
+ psubsw m8, m22, m14 ; t17
+ paddsw m22, m14 ; t16
+ paddsw m14, m18, m26 ; t19
+ psubsw m18, m26 ; t18
+ psubsw m26, m24, m16 ; t21
+ paddsw m24, m16 ; t20
+ psubsw m16, m20, m28 ; t22
+ paddsw m28, m20 ; t23
+ psubsw m20, m23, m15 ; t25
+ paddsw m23, m15 ; t24
+ psubsw m15, m21, m29 ; t30
+ paddsw m21, m29 ; t31
+ psubsw m29, m19, m27 ; t26
+ paddsw m19, m27 ; t27
+ paddsw m27, m25, m17 ; t28
+ psubsw m25, m17 ; t29
+.main3:
+ ITX_MULSUB_2W 15, 8, 9, 17, 10, 799, 4017 ; t17a, t30a
+ ITX_MULSUB_2W 25, 18, 9, 17, 10, m4017, 799 ; t18a, t29a
+ ITX_MULSUB_2W 29, 26, 9, 17, 10, 3406, 2276 ; t21a, t26a
+ ITX_MULSUB_2W 20, 16, 9, 17, 10, m2276, 3406 ; t22a, t25a
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ psubsw m17, m21, m27 ; t28a
+ paddsw m21, m27 ; t31a
+ psubsw m27, m15, m25 ; t18
+ paddsw m15, m25 ; t17
+ psubsw m25, m20, m29 ; t21
+ paddsw m20, m29 ; t22
+ psubsw m29, m8, m18 ; t29
+ paddsw m8, m18 ; t30
+ psubsw m18, m22, m14 ; t19a
+ paddsw m22, m14 ; t16a
+ psubsw m14, m28, m24 ; t20a
+ paddsw m24, m28 ; t23a
+ paddsw m28, m16, m26 ; t25
+ psubsw m16, m26 ; t26
+ psubsw m26, m23, m19 ; t27a
+ paddsw m23, m19 ; t24a
+ ITX_MULSUB_2W 29, 27, 9, 19, 10, 11, 12 ; t18a, t29a
+ ITX_MULSUB_2W 17, 18, 9, 19, 10, 11, 12 ; t19, t28
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MULSUB_2W 16, 25, 9, 19, 10, 12, 11 ; t21a, t26a
+ ITX_MULSUB_2W 26, 14, 9, 19, 10, 12, 11 ; t20, t27
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ psubsw m19, m27, m25 ; t26
+ paddsw m27, m25 ; t29
+ psubsw m25, m17, m26 ; t20a
+ paddsw m17, m26 ; t19a
+ paddsw m26, m18, m14 ; t28a
+ psubsw m18, m14 ; t27a
+ paddsw m14, m22, m24 ; t16
+ psubsw m22, m24 ; t23
+ psubsw m24, m29, m16 ; t21
+ paddsw m16, m29 ; t18
+ paddsw m29, m21, m23 ; t31
+ psubsw m21, m23 ; t24
+ psubsw m23, m15, m20 ; t22a
+ paddsw m15, m20 ; t17a
+ psubsw m20, m8, m28 ; t25a
+ paddsw m28, m8 ; t30a
+ ITX_MULSUB_2W 18, 25, 8, 9, 10, 11, 12 ; t20, t27
+ ITX_MULSUB_2W 19, 24, 8, 9, 10, 11, 12 ; t21a, t26a
+ ITX_MULSUB_2W 21, 22, 8, 9, 10, 11, 12 ; t23a, t24a
+ ITX_MULSUB_2W 20, 23, 8, 9, 10, 11, 12 ; t22, t25
+ ret
+
+%macro IDTX_32x32 2 ; dst[1-2]
+ vmovdqa32 ym%1, [cq+64*(%1+ 0)] ; force EVEX encoding, which
+ vmovdqa32 ym17, [cq+64*(%1+16)] ; reduces code size due to
+ vmovdqa32 ym%2, [cq+64*(%2+ 0)] ; compressed displacements
+ vmovdqa32 ym18, [cq+64*(%2+16)]
+ vpermt2q m%1, m21, m17
+ vpermt2q m%2, m21, m18
+%endmacro
+
+cglobal inv_txfm_add_identity_identity_32x32_8bpc, 3, 3, 22, dst, stride, c
+ movu m21, [permB+7]
+ vpbroadcastd m16, [pw_8192]
+ pxor m20, m20
+.loop:
+ IDTX_32x32 0, 1
+ IDTX_32x32 2, 3
+ IDTX_32x32 4, 5
+ IDTX_32x32 6, 7
+ IDTX_32x32 8, 9
+ IDTX_32x32 10, 11
+ IDTX_32x32 12, 13
+ IDTX_32x32 14, 15
+ call m(inv_txfm_add_identity_identity_16x32_8bpc).transpose_2x8x8_round
+ IDTX_32x16_STORE 0, 8, 1
+ IDTX_32x16_STORE 1, 9, 1
+ IDTX_32x16_STORE 2, 10, 1
+ IDTX_32x16_STORE 3, 11, 1
+ IDTX_32x16_STORE 4, 12, 1
+ IDTX_32x16_STORE 5, 13, 1
+ IDTX_32x16_STORE 6, 14, 1
+ IDTX_32x16_STORE 7, 15, 1
+ lea dstq, [dstq+strideq*8]
+ btc cq, 5
+ jnc .loop
+ mov r0d, 8
+.zero_loop:
+ mova [cq+64*0], m20
+ mova [cq+64*1], m20
+ mova [cq+64*2], m20
+ mova [cq+64*3], m20
+ add cq, 64*4
+ dec r0d
+ jg .zero_loop
+ RET
+
+cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ WIN64_SPILL_XMM 30
+ cmp eobd, 151
+ jb .fast
+ mova m5, [cq+64*10]
+ mova m3, [cq+64* 6]
+ mova m1, [cq+64* 2]
+ mova m7, [cq+64*14]
+ mova m2, [cq+64* 4]
+ mova m6, [cq+64*12]
+ mova m0, [cq+64* 0]
+ mova m4, [cq+64* 8]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ mova m14, [cq+64* 1]
+ mova m21, [cq+64*15]
+ mova m18, [cq+64* 9]
+ mova m17, [cq+64* 7]
+ mova m16, [cq+64* 5]
+ mova m19, [cq+64*11]
+ mova m20, [cq+64*13]
+ mova m15, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ vpbroadcastd m9, [o(pw_8192)]
+%macro TRANSPOSE_8x4_ROUND 4
+ punpckhwd m8, m%3, m%4 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m%3, m%4 ; c0 d0 c1 d1 c2 d2 c3 d3
+ punpckhwd m%4, m%1, m%2 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m%1, m%2 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhdq m%2, m%1, m%3 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m%1, m%3 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckldq m%3, m%4, m8 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m%4, m8 ; a6 b6 c6 d6 a7 b7 c7 d7
+ REPX {pmulhrsw x, m9}, m%2, m%1, m%3, m%4
+%endmacro
+ TRANSPOSE_8x4_ROUND 0, 1, 2, 3
+ TRANSPOSE_8x4_ROUND 4, 5, 6, 7
+ TRANSPOSE_8x4_ROUND 14, 15, 16, 17
+ TRANSPOSE_8x4_ROUND 18, 19, 20, 21
+ vinserti32x8 m26, m0, ym4, 1 ; a0 a4 b0 b4
+ vshufi32x4 m0, m4, q3232 ; a8 a12 b8 b12
+ vinserti32x8 m27, m1, ym5, 1 ; a1 a5 b1 b5
+ vshufi32x4 m1, m5, q3232 ; a9 a13 b9 b13
+ vinserti32x8 m28, m2, ym6, 1 ; a2 a6 b2 b6
+ vshufi32x4 m2, m6, q3232 ; a10 a14 b10 b14
+ vinserti32x8 m29, m3, ym7, 1 ; a3 a7 b3 b7
+ vshufi32x4 m8, m3, m7, q3232 ; a11 a15 b11 b15
+ vinserti32x8 m4, m14, ym18, 1 ; c0 c4 d0 d4
+ vshufi32x4 m14, m18, q3232 ; c8 c12 d8 d12
+ vinserti32x8 m5, m15, ym19, 1 ; c1 c5 d1 d5
+ vshufi32x4 m15, m19, q3232 ; c9 c13 d9 d13
+ vinserti32x8 m6, m16, ym20, 1 ; c2 c6 d2 d6
+ vshufi32x4 m16, m20, q3232 ; c10 c14 d10 d14
+ vinserti32x8 m7, m17, ym21, 1 ; c3 c7 d3 d7
+ vshufi32x4 m17, m21, q3232 ; c11 c15 d11 d15
+ vshufi32x4 m22, m26, m4, q2020 ; 0 1
+ vshufi32x4 m26, m4, q3131 ; 8 9
+ vshufi32x4 m23, m27, m5, q2020 ; 2 3
+ vshufi32x4 m27, m5, q3131 ; 10 11
+ vshufi32x4 m24, m28, m6, q2020 ; 4 5
+ vshufi32x4 m28, m6, q3131 ; 12 13
+ vshufi32x4 m25, m29, m7, q2020 ; 6 7
+ vshufi32x4 m29, m7, q3131 ; 14 15
+ vshufi32x4 m4, m0, m14, q2020 ; 16 17
+ vshufi32x4 m3, m0, m14, q3131 ; 24 25
+ vshufi32x4 m20, m1, m15, q2020 ; 18 19
+ vshufi32x4 m19, m1, m15, q3131 ; 26 27
+ vshufi32x4 m5, m2, m16, q2020 ; 20 21
+ vshufi32x4 m0, m2, m16, q3131 ; 28 29
+ vshufi32x4 m16, m8, m17, q2020 ; 22 23
+ vshufi32x4 m17, m8, m17, q3131 ; 30 31
+ pxor m6, m6
+ mova [cq+64* 0], m4
+ mova [cq+64* 2], m5
+ mova [cq+64* 4], m3
+ mova [cq+64* 6], m0
+ punpcklwd m8, m24, m24 ; 4
+ punpcklwd m0, m0 ; 28
+ punpcklwd m5, m5 ; 20
+ punpcklwd m1, m28, m28 ; 12
+ punpcklwd m7, m26, m26 ; 8
+ punpcklwd m3, m3 ; 24
+ punpcklwd m9, m6, m22 ; __ 0
+ punpcklwd m6, m4 ; __ 16
+ call m(idct_16x16_internal_8bpc).main_fast3
+ mova [cq+64* 1], m20
+ mova [cq+64* 3], m16
+ mova [cq+64* 5], m19
+ mova [cq+64* 7], m17
+ punpcklwd m21, m23, m23 ; 2
+ punpcklwd m17, m17 ; 30
+ punpcklwd m20, m20 ; 18
+ punpcklwd m15, m29, m29 ; 14
+ punpcklwd m18, m27, m27 ; 10
+ punpcklwd m16, m16 ; 22
+ punpcklwd m19, m19 ; 26
+ punpcklwd m14, m25, m25 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ mova m21, [cq+64* 7]
+ mova m14, [cq+64* 0]
+ mova m17, [cq+64* 3]
+ mova m18, [cq+64* 4]
+ mova m19, [cq+64* 5]
+ mova m16, [cq+64* 2]
+ mova m15, [cq+64* 1]
+ mova m20, [cq+64* 6]
+ REPX {punpckhwd x, x}, m22, m21, m14, m29, m26, m17, m18, m25, \
+ m24, m19, m16, m27, m28, m15, m20, m23
+ call .main_oddhalf
+ jmp .end
+.fast: ; right half is zero
+ mova ym8, [cq+64*15]
+ vinserti32x8 m8, [cq+64* 1], 1
+ mova m2, [o(int16_perm)]
+ mova ym9, [cq+64* 8]
+ vinserti32x8 m9, [cq+64* 0], 1
+ mova ym0, [cq+64* 7]
+ vinserti32x8 m0, [cq+64* 9], 1
+ mova ym7, [cq+64*14]
+ vinserti32x8 m7, [cq+64* 2], 1
+ mova ym1, [cq+64* 3]
+ vinserti32x8 m1, [cq+64*13], 1
+ mova ym3, [cq+64* 6]
+ vinserti32x8 m3, [cq+64*10], 1
+ mova ym5, [cq+64*11]
+ vinserti32x8 m5, [cq+64* 5], 1
+ mova ym6, [cq+64*12]
+ vinserti32x8 m6, [cq+64* 4], 1
+ REPX {vpermb x, m2, x}, m8, m9, m0, m7, m1, m3, m5, m6
+ call m(idct_16x16_internal_8bpc).main2
+ vbroadcasti32x4 m8, [o(int_shuf3)]
+ vbroadcasti32x4 m9, [o(int_shuf4)]
+ vpbroadcastd m11, [o(pw_8192)]
+ pshufb m0, m8
+ pshufb m1, m9
+ pshufb m2, m8
+ pshufb m3, m9
+ REPX {pmulhrsw x, m11}, m0, m1, m2, m3
+ pshufb m4, m8
+ pshufb m5, m9
+ pshufb m6, m8
+ pshufb m7, m9
+ REPX {pmulhrsw x, m11}, m4, m5, m6, m7
+ punpckhdq m28, m0, m1
+ punpckldq m0, m1
+ punpckhdq m27, m2, m3
+ punpckldq m2, m3
+ punpckhdq m22, m4, m5
+ punpckldq m4, m5
+ punpckhdq m23, m6, m7
+ punpckldq m6, m7
+ vinserti32x8 m14, m0, ym2, 1
+ vshufi32x4 m15, m0, m2, q3232
+ vinserti32x8 m2, m4, ym6, 1
+ vshufi32x4 m4, m6, q3232
+ vshufi32x4 m21, m14, m2, q2020 ; 0 2
+ vshufi32x4 m14, m2, q3131 ; 4 6
+ vshufi32x4 m18, m15, m4, q2020 ; 8 10
+ vshufi32x4 m15, m4, q3131 ; 12 14
+ pxor m9, m9
+ punpcklwd m8, m14, m14 ; 4
+ punpcklwd m1, m15, m15 ; 12
+ punpcklwd m7, m18, m18 ; 8
+ punpcklwd m9, m21 ; __ 0
+ call m(idct_16x16_internal_8bpc).main_fast4
+ punpckhwd m21, m21 ; 2
+ punpckhwd m15, m15 ; 14
+ punpckhwd m18, m18 ; 10
+ punpckhwd m14, m14 ; 6
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vinserti32x8 m24, m28, ym27, 1
+ vshufi32x4 m28, m27, q3232
+ vinserti32x8 m27, m22, ym23, 1
+ vshufi32x4 m22, m23, q3232
+ vshufi32x4 m23, m24, m27, q2020 ; 1 3
+ vshufi32x4 m24, m27, q3131 ; 5 7
+ vshufi32x4 m27, m28, m22, q2020 ; 9 11
+ vshufi32x4 m28, m22, q3131 ; 13 15
+ punpcklwd m22, m23, m23 ; 1
+ punpckhwd m29, m28, m28 ; 15
+ punpcklwd m26, m27, m27 ; 9
+ punpckhwd m25, m24, m24 ; 7
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ punpcklwd m24, m24 ; 5
+ punpckhwd m27, m27 ; 11
+ punpcklwd m28, m28 ; 13
+ punpckhwd m23, m23 ; 3
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ call .main_oddhalf_fast
+.end:
+ imul r6, strideq, 60
+ mova m10, [o(end_16x32p)]
+ vpbroadcastd m11, [o(pw_2048)]
+ lea r3, [strideq*3]
+ pxor m12, m12
+ add r6, dstq ; dst+stride*60
+ psrldq m13, m10, 1
+ lea r4, [strideq+r3] ; stride*4
+%macro IDCT_16x64_END 3 ; idct32, idct64, tmp
+%if %1 & 1
+ %define %%s0 r3
+ %define %%s1 strideq*2
+ %define %%s2 strideq*1
+ %define %%s3 strideq*0
+%else
+ %define %%s0 strideq*0
+ %define %%s1 strideq*1
+ %define %%s2 strideq*2
+ %define %%s3 r3
+%if %1
+ add dstq, r4
+ sub r6, r4
+%endif
+%endif
+%if %1 < 8
+ pmulhrsw m8, m11, m%1
+ pmulhrsw m9, m11, m%2
+%else
+ mova m9, [cq+64*%1]
+ paddsw m8, m9, m%2 ; out 0+n, 1+n
+ psubsw m9, m%2 ; out 63-n, 62-n
+ pmulhrsw m8, m11
+ pmulhrsw m9, m11
+%endif
+ mova xm29, [dstq+%%s0]
+ vinserti128 ym29, [dstq+%%s1], 1
+ mova xm%3, [r6 +%%s3]
+ vinserti128 ym%3, [r6 +%%s2], 1
+ vpermb m29, m10, m29
+ vpermb m%3, m10, m%3
+ mova [cq+64*%1], m12
+ paddw m29, m8
+ paddw m%3, m9
+ packuswb m29, m%3
+ vpermd m29, m13, m29
+ mova [dstq+%%s0], xm29
+ vextracti128 [dstq+%%s1], ym29, 1
+ vextracti32x4 [r6 +%%s2], m29, 2
+ vextracti32x4 [r6 +%%s3], m29, 3
+%endmacro
+ IDCT_16x64_END 0, 29, 0
+ IDCT_16x64_END 1, 28, 28
+ IDCT_16x64_END 2, 27, 28
+ IDCT_16x64_END 3, 26, 28
+ IDCT_16x64_END 4, 25, 28
+ IDCT_16x64_END 5, 24, 28
+ IDCT_16x64_END 6, 23, 28
+ IDCT_16x64_END 7, 22, 28
+ IDCT_16x64_END 8, 21, 28
+ IDCT_16x64_END 9, 20, 28
+ IDCT_16x64_END 10, 19, 28
+ IDCT_16x64_END 11, 18, 28
+ IDCT_16x64_END 12, 17, 28
+ IDCT_16x64_END 13, 16, 28
+ IDCT_16x64_END 14, 15, 28
+ IDCT_16x64_END 15, 14, 28
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ imul r6d, 181
+ mov r3d, 64
+ add r6d, 128+512
+ sar r6d, 8+2
+ jmp m(inv_txfm_add_dct_dct_16x8_8bpc).dconly3
+ALIGN function_align
+.main_oddhalf_fast: ; bottom three-quarters are zero
+ vpbroadcastd m8, [o(pw_101_4095x8)]
+ vpbroadcastd m21, [o(pw_m1474_3822x8)]
+ vpbroadcastd m14, [o(pw_897_3996x8)]
+ vpbroadcastd m17, [o(pw_m700_4036x8)]
+ vpbroadcastd m18, [o(pw_501_4065x8)]
+ vpbroadcastd m19, [o(pw_m1092_3948x8)]
+ vpbroadcastd m16, [o(pw_1285_3889x8)]
+ vpbroadcastd m15, [o(pw_m301_4085x8)]
+ pmulhrsw m8, m22 ; t32a t63a
+ pmulhrsw m21, m29 ; t35a t60a
+ pmulhrsw m14, m26 ; t36a t59a
+ pmulhrsw m17, m25 ; t39a t56
+ pmulhrsw m18, m24 ; t40a t55a
+ pmulhrsw m19, m27 ; t43a t52a
+ pmulhrsw m16, m28 ; t44a t51a
+ pmulhrsw m15, m23 ; t47a t48a
+ mova m22, m8
+ mova m29, m21
+ mova m26, m14
+ mova m25, m17
+ mova m24, m18
+ mova m27, m19
+ mova m28, m16
+ mova m20, m15
+ jmp .main_oddhalf2
+ALIGN function_align
+.main_oddhalf:
+ vpbroadcastd m8, [o(pw_101_4095x8)]
+ vpbroadcastd m9, [o(pw_m2824_2967x8)]
+ vpbroadcastd m11, [o(pw_1660_3745x8)]
+ vpbroadcastd m12, [o(pw_m1474_3822x8)]
+ pmulhrsw m22, m8 ; t32a t63a
+ vpbroadcastd m8, [o(pw_897_3996x8)]
+ pmulhrsw m21, m9 ; t33a t62a
+ vpbroadcastd m9, [o(pw_m2191_3461x8)]
+ pmulhrsw m14, m11 ; t34a t61a
+ vpbroadcastd m11, [o(pw_2359_3349x8)]
+ pmulhrsw m29, m12 ; t35a t60a
+ vpbroadcastd m12, [o(pw_m700_4036x8)]
+ pmulhrsw m26, m8 ; t36a t59a
+ vpbroadcastd m8, [o(pw_501_4065x8)]
+ pmulhrsw m17, m9 ; t37a t58a
+ vpbroadcastd m9, [o(pw_m2520_3229x8)]
+ pmulhrsw m18, m11 ; t38a t57a
+ vpbroadcastd m11, [o(pw_2019_3564x8)]
+ pmulhrsw m25, m12 ; t39a t56a
+ vpbroadcastd m12, [o(pw_m1092_3948x8)]
+ pmulhrsw m24, m8 ; t40a t55a
+ vpbroadcastd m8, [o(pw_1285_3889x8)]
+ pmulhrsw m19, m9 ; t41a t54a
+ vpbroadcastd m9, [o(pw_m1842_3659x8)]
+ pmulhrsw m16, m11 ; t42a t53a
+ vpbroadcastd m11, [o(pw_2675_3102x8)]
+ pmulhrsw m27, m12 ; t43a t52a
+ vpbroadcastd m12, [o(pw_m301_4085x8)]
+ pmulhrsw m28, m8 ; t44a t51a
+ pmulhrsw m15, m9 ; t45a t50a
+ pmulhrsw m20, m11 ; t46a t49a
+ pmulhrsw m23, m12 ; t47a t48a
+ psubsw m8, m22, m21 ; t33 t62
+ paddsw m22, m21 ; t32 t63
+ psubsw m21, m29, m14 ; t34 t61
+ paddsw m29, m14 ; t35 t60
+ psubsw m14, m26, m17 ; t37 t58
+ paddsw m26, m17 ; t36 t59
+ psubsw m17, m25, m18 ; t38 t57
+ paddsw m25, m18 ; t39 t56
+ psubsw m18, m24, m19 ; t41 t54
+ paddsw m24, m19 ; t40 t55
+ psubsw m19, m27, m16 ; t42 t53
+ paddsw m27, m16 ; t43 t52
+ psubsw m16, m28, m15 ; t45 t50
+ paddsw m28, m15 ; t44 t51
+ psubsw m15, m23, m20 ; t46 t49
+ paddsw m20, m23 ; t47 t48
+.main_oddhalf2:
+ ITX_MUL2X_PACK 8, 9, 23, 10, 401, 4076, 5 ; t33a t62a
+ ITX_MUL2X_PACK 21, 9, 23, 10, m4076, 401, 5 ; t34a t61a
+ ITX_MUL2X_PACK 14, 9, 23, 10, 3166, 2598, 5 ; t37a t58a
+ ITX_MUL2X_PACK 17, 9, 23, 10, m2598, 3166, 5 ; t38a t57a
+ ITX_MUL2X_PACK 18, 9, 23, 10, 1931, 3612, 5 ; t41a t54a
+ ITX_MUL2X_PACK 19, 9, 23, 10, m3612, 1931, 5 ; t42a t53a
+ ITX_MUL2X_PACK 16, 9, 23, 10, 3920, 1189, 5 ; t45a t50a
+ ITX_MUL2X_PACK 15, 9, 23, 10, m1189, 3920, 5 ; t46a t49a
+ vpbroadcastd m11, [o(pw_m4017_799)]
+ psubsw m23, m25, m26 ; t36a t59a
+ paddsw m25, m26 ; t39a t56a
+ psubsw m26, m24, m27 ; t43a t52a
+ paddsw m27, m24 ; t40a t55a
+ psubsw m24, m20, m28 ; t44a t51a
+ paddsw m20, m28 ; t47a t48a
+ psubsw m28, m8, m21 ; t34 t61
+ paddsw m8, m21 ; t33 t62
+ psubsw m21, m17, m14 ; t37 t58
+ paddsw m17, m14 ; t38 t57
+ psubsw m14, m18, m19 ; t42 t53
+ paddsw m18, m19 ; t41 t54
+ psubsw m19, m15, m16 ; t45 t50
+ paddsw m15, m16 ; t46 t49
+ psubsw m16, m22, m29 ; t35a t60a
+ paddsw m22, m29 ; t32a t63a
+ ITX_MUL2X_PACK 16, 9, 29, 10, 799_4017, 11, 20 ; t35 t60
+ ITX_MUL2X_PACK 28, 9, 29, 10, 799_4017, 11, 20 ; t34a t61a
+ ITX_MUL2X_PACK 23, 9, 29, 10, 11, m799_m4017, 36 ; t36 t59
+ ITX_MUL2X_PACK 21, 9, 29, 10, 11, m799_m4017, 36 ; t37a t58a
+ vpbroadcastd m11, [o(pw_m2276_3406)]
+ ITX_MUL2X_PACK 26, 9, 29, 10, 3406_2276, 11, 20 ; t43 t52
+ ITX_MUL2X_PACK 14, 9, 29, 10, 3406_2276, 11, 20 ; t42a t53a
+ ITX_MUL2X_PACK 24, 9, 29, 10, 11, m3406_m2276, 36 ; t44 t51
+ ITX_MUL2X_PACK 19, 9, 29, 10, 11, m3406_m2276, 36 ; t45a t50a
+ vpbroadcastd m11, [o(pw_1567_3784)]
+ vpbroadcastd m12, [o(pw_m3784_1567)]
+ psubsw m29, m22, m25 ; t39 t56
+ paddsw m22, m25 ; t32 t63
+ psubsw m25, m20, m27 ; t40 t55
+ paddsw m20, m27 ; t47 t48
+ psubsw m27, m8, m17 ; t38a t57a
+ paddsw m8, m17 ; t33a t62a
+ psubsw m17, m15, m18 ; t41a t54a
+ paddsw m15, m18 ; t46a t49a
+ paddsw m18, m16, m23 ; t35a t60a
+ psubsw m16, m23 ; t36a t59a
+ psubsw m23, m24, m26 ; t43a t52a
+ paddsw m24, m26 ; t44a t51a
+ paddsw m26, m28, m21 ; t34 t61
+ psubsw m28, m21 ; t37 t58
+ psubsw m21, m19, m14 ; t42 t53
+ paddsw m19, m14 ; t45 t50
+ ITX_MUL2X_PACK 29, 9, 14, 10, 11, 12, 4 ; t39a t56a
+ ITX_MUL2X_PACK 27, 9, 14, 10, 11, 12, 4 ; t38 t57
+ ITX_MUL2X_PACK 16, 9, 14, 10, 11, 12, 4 ; t36 t59
+ ITX_MUL2X_PACK 28, 9, 14, 10, 11, 12, 4 ; t37a t58a
+ vpbroadcastd m11, [o(pw_m1567_m3784)]
+ ITX_MUL2X_PACK 25, 9, 14, 10, 12, 11, 4 ; t40a t55a
+ ITX_MUL2X_PACK 17, 9, 14, 10, 12, 11, 4 ; t41 t54
+ ITX_MUL2X_PACK 23, 9, 14, 10, 12, 11, 4 ; t43 t52
+ ITX_MUL2X_PACK 21, 9, 14, 10, 12, 11, 4 ; t42a t53a
+ vbroadcasti32x4 m13, [o(deint_shuf)]
+ vpbroadcastd m11, [o(pw_2896_2896)]
+ vpbroadcastd m12, [o(pw_m2896_2896)]
+ paddsw m14, m22, m20 ; t32a t63a
+ psubsw m22, m20 ; t47a t48a
+ psubsw m20, m8, m15 ; t46 t49
+ paddsw m8, m15 ; t33 t62
+ paddsw m15, m18, m24 ; t35 t60
+ psubsw m18, m24 ; t44 t51
+ psubsw m24, m26, m19 ; t45a t50a
+ paddsw m26, m19 ; t34a t61a
+ REPX {pshufb x, m13}, m14, m8, m15, m26
+ psubsw m19, m29, m25 ; t40 t55
+ paddsw m25, m29 ; t39 t56
+ psubsw m29, m27, m17 ; t41a t54a
+ paddsw m27, m17 ; t38a t57a
+ psubsw m17, m16, m23 ; t43a t52a
+ paddsw m16, m23 ; t36a t59a
+ psubsw m9, m28, m21 ; t42 t53
+ paddsw m28, m21 ; t37 t58
+ REPX {pshufb x, m13}, m25, m27, m16, m28
+ ITX_MUL2X_PACK 22, 13, 21, 10, 11, 12, 8 ; t47 t48
+ ITX_MUL2X_PACK 20, 23, 22, 10, 11, 12, 8 ; t46a t49a
+ packssdw m21, m22 ; t47 t46a
+ packssdw m13, m23 ; t48 t49a
+ ITX_MUL2X_PACK 18, 22, 20, 10, 11, 12, 8 ; t44a t51a
+ ITX_MUL2X_PACK 24, 23, 18, 10, 11, 12, 8 ; t45 t50
+ packssdw m20, m18 ; t44a t45
+ packssdw m22, m23 ; t51a t50
+ ITX_MUL2X_PACK 19, 24, 18, 10, 11, 12, 8 ; t40a t55a
+ ITX_MUL2X_PACK 29, 23, 19, 10, 11, 12, 8 ; t41 t54
+ packssdw m18, m19 ; t40a t41
+ packssdw m24, m23 ; t55a t54
+ ITX_MUL2X_PACK 17, 23, 19, 10, 11, 12, 8 ; t43 t52
+ ITX_MUL2X_PACK 9, 29, 17, 10, 11, 12, 8 ; t42a t53a
+ packssdw m19, m17 ; t43 t42a
+ packssdw m23, m29 ; t52 t53a
+ punpcklqdq m17, m25, m27 ; t39 t38a
+ punpckhqdq m25, m27 ; t56 t57a
+ punpckhqdq m27, m15, m26 ; t60 t61a
+ punpcklqdq m15, m26 ; t35 t34a
+ punpckhqdq m26, m16, m28 ; t59a t58
+ punpcklqdq m16, m28 ; t36a t37
+ punpckhqdq m28, m14, m8 ; t63a t62
+ punpcklqdq m14, m8 ; t32a t33
+ psubsw m29, m0, m28 ; out63 out62
+ paddsw m0, m28 ; out0 out1
+ psubsw m28, m1, m27 ; out60 out61
+ paddsw m1, m27 ; out3 out2
+ psubsw m27, m2, m26 ; out59 out58
+ paddsw m2, m26 ; out4 out5
+ psubsw m26, m3, m25 ; out56 out57
+ paddsw m3, m25 ; out7 out6
+ psubsw m25, m4, m24 ; out55 out54
+ paddsw m4, m24 ; out8 out9
+ psubsw m24, m5, m23 ; out52 out53
+ paddsw m5, m23 ; out11 out10
+ psubsw m23, m6, m22 ; out51 out50
+ paddsw m6, m22 ; out12 out13
+ psubsw m22, m7, m13 ; out48 out49
+ paddsw m7, m13 ; out15 out14
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 7, 0, dst, stride, c, eob
+%undef cmp
+ lea r5, [o_base]
+ test eobd, eobd
+ jnz .normal
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ mov r3d, 16
+.dconly:
+ imul r6d, 181
+ add r6d, 128+512
+ sar r6d, 8+2
+.dconly2:
+ imul r6d, 181
+ add r6d, 128+2048
+ sar r6d, 8+4
+ pxor m2, m2
+ vpbroadcastw m3, r6d
+.dconly_loop:
+ mova m1, [dstq]
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ paddw m0, m3
+ paddw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, strideq
+ dec r3d
+ jg .dconly_loop
+ RET
+.normal:
+ WIN64_SPILL_XMM 31
+ mova m19, [o(dup16_perm)]
+ mova m24, [cq+64* 2]
+ mova m28, [cq+64* 6]
+ mova m26, [cq+64* 4]
+ mova m22, [cq+64* 0]
+ mova m23, [cq+64* 1]
+ mova m29, [cq+64* 7]
+ mova m27, [cq+64* 5]
+ mova m25, [cq+64* 3]
+ vpermb m8, m19, m24 ; 4
+ vpermb m1, m19, m28 ; 12
+ vpermb m7, m19, m26 ; 8
+ vpermb m9, m19, m22 ; __ 0
+ vpermb m21, m19, m23 ; 2
+ vpermb m15, m19, m29 ; 14
+ vpermb m18, m19, m27 ; 10
+ vpermb m14, m19, m25 ; 6
+ pslld m9, 16
+ vpord m30, m19, [o(pb_32)] {1to16}
+ REPX {vpermb x, m30, x}, m22, m29, m26, m25, m24, m27, m28, m23
+ cmp eobd, 151
+ jb .fast
+ vpermb m0, m19, [cq+64*14] ; 28
+ vpermb m5, m19, [cq+64*10] ; 20
+ vpermb m3, m19, [cq+64*12] ; 24
+ vpermb m6, m19, [cq+64* 8] ; __ 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpermb m17, m19, [cq+64*15] ; 30
+ vpermb m20, m19, [cq+64* 9] ; 18
+ vpermb m16, m19, [cq+64*11] ; 22
+ vpermb m19, m19, [cq+64*13] ; 26
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ vpermb m21, m30, [cq+64*15]
+ vpermb m14, m30, [cq+64* 8]
+ vpermb m17, m30, [cq+64*11]
+ vpermb m18, m30, [cq+64*12]
+ vpermb m19, m30, [cq+64*13]
+ vpermb m16, m30, [cq+64*10]
+ vpermb m15, m30, [cq+64* 9]
+ vpermb m20, m30, [cq+64*14]
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf
+ jmp .end
+.fast: ; bottom half is zero
+ call m(idct_16x16_internal_8bpc).main_fast2
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+.end:
+ mova [cq+64* 8], m4
+ mova [cq+64* 9], m5
+ mova [cq+64*10], m6
+ mova [cq+64*11], m7
+ mova [cq+64*12], m26
+ mova [cq+64*13], m27
+ mova [cq+64*14], m28
+ mova [cq+64*15], m29
+ vpbroadcastd m13, [o(pw_8192)]
+ call .pass1_end
+ call .pass2
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ pmulhrsw m0, m13, [cq+64* 8]
+ pmulhrsw m1, m13, [cq+64* 9]
+ pmulhrsw m2, m13, [cq+64*10]
+ pmulhrsw m3, m13, [cq+64*11]
+ vpbroadcastd m30, [o(pw_2048)]
+ pmulhrsw m4, m13, m22
+ pmulhrsw m5, m13, m23
+ pmulhrsw m6, m13, m24
+ pmulhrsw m7, m13, m25
+ pmulhrsw m22, m30, m14
+ pmulhrsw m14, m13, m26
+ pmulhrsw m23, m30, m15
+ pmulhrsw m15, m13, m27
+ pmulhrsw m24, m30, m16
+ pmulhrsw m16, m13, m28
+ pmulhrsw m25, m30, m17
+ pmulhrsw m17, m13, m29
+ pmulhrsw m26, m30, m18
+ pmulhrsw m18, m13, [cq+64*12]
+ pmulhrsw m27, m30, m19
+ pmulhrsw m19, m13, [cq+64*13]
+ pmulhrsw m28, m30, m20
+ pmulhrsw m20, m13, [cq+64*14]
+ pmulhrsw m29, m30, m21
+ pmulhrsw m21, m13, [cq+64*15]
+ call .transpose_round
+ call .pass2
+ pxor m10, m10
+ lea r3, [strideq*3]
+%macro IDCT_64x16_END 4
+ mova m9, [dstq+%4]
+%if %1 < 8
+ pmulhrsw m%3, m30, [cq+64*%1]
+%endif
+ pmulhrsw m%2, m30
+ mova [cq+64*%1], m10
+ punpcklbw m8, m9, m10
+ punpckhbw m9, m10
+ paddw m8, m%3
+ paddw m9, m%2
+ packuswb m8, m9
+ mova [dstq+%4], m8
+%if %1 == 3 || %1 == 7 || %1 == 11
+ lea dstq, [dstq+strideq*4]
+%endif
+%endmacro
+ IDCT_64x16_END 0, 0, 11, strideq*0
+ IDCT_64x16_END 1, 1, 11, strideq*1
+ IDCT_64x16_END 2, 2, 11, strideq*2
+ IDCT_64x16_END 3, 3, 11, r3
+ IDCT_64x16_END 4, 4, 11, strideq*0
+ IDCT_64x16_END 5, 5, 11, strideq*1
+ IDCT_64x16_END 6, 6, 11, strideq*2
+ IDCT_64x16_END 7, 7, 11, r3
+ IDCT_64x16_END 8, 14, 22, strideq*0
+ IDCT_64x16_END 9, 15, 23, strideq*1
+ IDCT_64x16_END 10, 16, 24, strideq*2
+ IDCT_64x16_END 11, 17, 25, r3
+ IDCT_64x16_END 12, 18, 26, strideq*0
+ IDCT_64x16_END 13, 19, 27, strideq*1
+ IDCT_64x16_END 14, 20, 28, strideq*2
+ IDCT_64x16_END 15, 21, 29, r3
+ RET
+ALIGN function_align
+.pass1_end:
+ mova m4, [cq+64* 0]
+ mova m5, [cq+64* 1]
+ mova m6, [cq+64* 2]
+ mova m7, [cq+64* 3]
+ mova m8, [cq+64* 4]
+ mova m9, [cq+64* 5]
+ mova m11, [cq+64* 6]
+ mova m12, [cq+64* 7]
+ psubsw m29, m4, m21 ; out47 out46
+ paddsw m4, m21 ; out16 out17
+ psubsw m28, m5, m20 ; out44 out45
+ paddsw m5, m20 ; out19 out18
+ REPX {pmulhrsw x, m13}, m0, m1, m2, m3
+ psubsw m27, m6, m19 ; out43 out42
+ paddsw m6, m19 ; out20 out21
+ psubsw m26, m7, m18 ; out40 out41
+ paddsw m7, m18 ; out23 out22
+ pmulhrsw m18, m13, m22
+ pmulhrsw m19, m13, m23
+ pmulhrsw m20, m13, m24
+ pmulhrsw m21, m13, m25
+ paddsw m25, m12, m14 ; out31 out30
+ psubsw m14, m12, m14 ; out32 out33
+ paddsw m24, m11, m15 ; out28 out29
+ psubsw m15, m11, m15 ; out35 out34
+ REPX {pmulhrsw x, m13}, m4, m5, m6, m7
+ paddsw m23, m9, m16 ; out27 out26
+ psubsw m16, m9, m16 ; out36 out37
+ paddsw m22, m8, m17 ; out24 out25
+ psubsw m17, m8, m17 ; out39 out38
+ REPX {pmulhrsw x, m13}, m14, m15, m16, m17
+.transpose_round:
+%macro TRANSPOSE_8x4_PACKED 4
+ punpckhwd m8, m%1, m%3 ; b0 f0 b1 f1 b2 f2 b3 f3
+ punpcklwd m%1, m%3 ; a0 e0 a1 e1 a2 e2 a3 e3
+ punpcklwd m%3, m%2, m%4 ; d0 h0 d1 h1 d2 h2 d3 h3
+ punpckhwd m%2, m%4 ; c0 g0 c1 g1 c2 g2 c3 g3
+ punpckhwd m%4, m%1, m%2 ; a2 c2 e2 g2 a3 c3 e3 g3
+ punpcklwd m%1, m%2 ; a0 c0 e0 g0 a1 c1 e1 g1
+ punpckhwd m%2, m8, m%3 ; b2 d2 f2 h2 b3 d3 f3 h3
+ punpcklwd m8, m%3 ; b0 d0 f0 h0 b1 d1 f1 h1
+ punpcklwd m%3, m%4, m%2 ; 2
+ punpckhwd m%4, m%2 ; 3
+ punpckhwd m%2, m%1, m8 ; 1
+ punpcklwd m%1, m8 ; 0
+%endmacro
+ TRANSPOSE_8x4_PACKED 0, 1, 2, 3
+ TRANSPOSE_8x4_PACKED 18, 19, 20, 21
+ TRANSPOSE_8x4_PACKED 4, 5, 6, 7
+ TRANSPOSE_8x4_PACKED 14, 15, 16, 17
+ vshufi32x4 m8, m0, m4, q3232 ; a02 a03 b02 b03
+ vinserti32x8 m0, ym4, 1 ; a00 a01 b00 b01
+ vshufi32x4 m4, m1, m5, q3232 ; a12 a13 b12 b13
+ vinserti32x8 m9, m1, ym5, 1 ; a10 a11 b10 b11
+ vshufi32x4 m5, m2, m6, q3232 ; a22 a23 b22 b23
+ vinserti32x8 m1, m2, ym6, 1 ; a20 a21 b20 b21
+ vshufi32x4 m6, m3, m7, q3232 ; a32 a33 b32 b33
+ vinserti32x8 m11, m3, ym7, 1 ; a30 a31 b30 b31
+ vshufi32x4 m2, m14, m18, q3232 ; c02 c03 d02 d03
+ vinserti32x8 m3, m14, ym18, 1 ; c00 c01 d00 d01
+ vshufi32x4 m18, m15, m19, q3232 ; c12 c13 d12 d13
+ vinserti32x8 m15, ym19, 1 ; c10 c11 d10 d11
+ vshufi32x4 m19, m16, m20, q3232 ; c22 c23 d22 d23
+ vinserti32x8 m16, ym20, 1 ; c20 c21 d20 d21
+ vshufi32x4 m20, m17, m21, q3232 ; c32 c33 d32 d33
+ vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31
+ ret
+.pass2:
+ vshufi32x4 m7, m5, m19, q3131 ; 14
+ vshufi32x4 m5, m19, q2020 ; 10
+ vshufi32x4 m21, m6, m20, q3131 ; 15
+ vshufi32x4 m19, m6, m20, q2020 ; 11
+ vshufi32x4 m20, m4, m18, q3131 ; 13
+ vshufi32x4 m18, m4, m18, q2020 ; 9
+ vshufi32x4 m6, m8, m2, q3131 ; 12
+ vshufi32x4 m4, m8, m2, q2020 ; 8
+ vshufi32x4 m2, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m1, m16, q3131 ; 6
+ vshufi32x4 m1, m16, q2020 ; 2
+ vshufi32x4 m16, m9, m15, q3131 ; 5
+ vshufi32x4 m14, m9, m15, q2020 ; 1
+ vshufi32x4 m15, m11, m17, q2020 ; 3
+ vshufi32x4 m17, m11, m17, q3131 ; 7
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main2
+ jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+
+cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 9, 30, 64*32, dst, stride, c, eob
+ vpbroadcastd m23, [o(pw_2896x8)]
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ pmulhrsw m5, m23, [cq+64*20]
+ pmulhrsw m3, m23, [cq+64*12]
+ pmulhrsw m1, m23, [cq+64* 4]
+ pmulhrsw m7, m23, [cq+64*28]
+ pmulhrsw m2, m23, [cq+64* 8]
+ pmulhrsw m6, m23, [cq+64*24]
+ pmulhrsw m0, m23, [cq+64* 0]
+ pmulhrsw m4, m23, [cq+64*16]
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ pmulhrsw m14, m23, [cq+64* 2]
+ pmulhrsw m21, m23, [cq+64*30]
+ pmulhrsw m18, m23, [cq+64*18]
+ pmulhrsw m17, m23, [cq+64*14]
+ pmulhrsw m16, m23, [cq+64*10]
+ pmulhrsw m19, m23, [cq+64*22]
+ pmulhrsw m20, m23, [cq+64*26]
+ pmulhrsw m15, m23, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ pmulhrsw m22, m23, [cq+64* 1]
+ pmulhrsw m21, m23, [cq+64*31]
+ pmulhrsw m14, m23, [cq+64*17]
+ pmulhrsw m29, m23, [cq+64*15]
+ pmulhrsw m26, m23, [cq+64* 9]
+ pmulhrsw m17, m23, [cq+64*23]
+ pmulhrsw m18, m23, [cq+64*25]
+ pmulhrsw m25, m23, [cq+64* 7]
+ pmulhrsw m24, m23, [cq+64* 5]
+ pmulhrsw m19, m23, [cq+64*27]
+ pmulhrsw m16, m23, [cq+64*21]
+ pmulhrsw m27, m23, [cq+64*11]
+ pmulhrsw m28, m23, [cq+64*13]
+ pmulhrsw m15, m23, [cq+64*19]
+ pmulhrsw m20, m23, [cq+64*29]
+ pmulhrsw m23, [cq+64* 3]
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ vpbroadcastd m12, [o(pw_16384)]
+ psubsw m13, m0, m29 ; 31
+ paddsw m0, m29 ; 0
+ psubsw m29, m1, m28 ; 30
+ paddsw m1, m28 ; 1
+ psubsw m28, m2, m27 ; 29
+ paddsw m2, m27 ; 2
+ psubsw m27, m3, m26 ; 28
+ paddsw m3, m26 ; 3
+ psubsw m26, m4, m25 ; 27
+ paddsw m4, m25 ; 4
+ psubsw m25, m5, m24 ; 26
+ paddsw m5, m24 ; 5
+ psubsw m24, m6, m23 ; 25
+ paddsw m6, m23 ; 6
+ psubsw m23, m7, m22 ; 24
+ paddsw m7, m22 ; 7
+ pxor m9, m9
+ punpckhwd m8, m0, m1 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m0, m1 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m1, m2, m3 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpcklwd m2, m3 ; c0 d0 c1 d1 c2 d2 c3 d3
+ REPX {mova [cq+64*x], m9}, 16, 17, 18, 19
+ punpckhwd m22, m4, m5 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m4, m5 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m5, m6, m7 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m6, m7 ; g0 h0 g1 h1 g2 h2 g3 h3
+ REPX {mova [cq+64*x], m9}, 20, 21, 22, 23
+ punpckhwd m3, m23, m24
+ punpcklwd m23, m24
+ punpckhwd m24, m25, m26
+ punpcklwd m25, m26
+ REPX {mova [cq+64*x], m9}, 24, 25, 26, 27
+ punpckhwd m26, m27, m28
+ punpcklwd m27, m28
+ punpckhwd m28, m29, m13
+ punpcklwd m29, m13
+ REPX {mova [cq+64*x], m9}, 28, 29, 30, 31
+ punpckhdq m7, m0, m2 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m0, m2 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckhdq m2, m4, m6 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckldq m4, m6 ; e0 f0 g0 h0 e1 f1 g1 h1
+ REPX {pmulhrsw x, m12}, m7, m0, m2, m4
+ punpckhdq m6, m8, m1 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m8, m1 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m1, m22, m5 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckldq m22, m5 ; e4 f4 g4 h5 e5 f5 g5 h5
+ REPX {pmulhrsw x, m12}, m6, m8, m1, m22
+ punpckhdq m13, m23, m25
+ punpckldq m23, m25
+ punpckhdq m25, m27, m29
+ punpckldq m27, m29
+ REPX {pmulhrsw x, m12}, m13, m23, m25, m27
+ punpckhdq m9, m3, m24
+ punpckldq m3, m24
+ punpckhdq m24, m26, m28
+ punpckldq m26, m28
+ REPX {pmulhrsw x, m12}, m9, m3, m24, m26
+ punpckhqdq m5, m23, m27 ; d01 d09 d17 d25
+ punpcklqdq m23, m27 ; d00 d08 d16 d24
+ punpcklqdq m27, m13, m25 ; d02 d10 d18 d26
+ punpckhqdq m13, m25 ; d03 d11 d19 d27
+ punpcklqdq m25, m3, m26 ; d04 d12 d20 d28
+ punpckhqdq m3, m26 ; d05 d13 d21 d29
+ punpcklqdq m26, m9, m24 ; d06 d14 d22 d30
+ punpckhqdq m9, m24 ; d07 d15 d23 d31
+ mova [cq+64* 3], m23
+ mova [cq+64*13], m27
+ mova [cq+64* 7], m25
+ mova [cq+64*15], m26
+ punpckhqdq m24, m8, m22 ; a05 a13 a21 a29
+ punpcklqdq m8, m22 ; a04 a12 a20 a28
+ punpckhqdq m22, m0, m4 ; a01 a09 a17 a25
+ punpcklqdq m0, m4 ; a00 a08 a16 a24
+ punpckhqdq m23, m7, m2 ; a03 a11 a19 a27
+ punpcklqdq m7, m2 ; a02 a10 a18 a26
+ punpckhqdq m25, m6, m1 ; a07 a15 a23 a31
+ punpcklqdq m6, m1 ; a06 a14 a22 a30
+ mova [cq+64* 1], m0
+ mova [cq+64* 9], m7
+ mova [cq+64* 5], m8
+ mova [cq+64*11], m6
+ mova m2, [cq+64* 0]
+ mova m11, [cq+64* 2]
+ mova m8, [cq+64* 4]
+ mova m29, [cq+64* 6]
+ mova m27, [cq+64* 8]
+ mova m26, [cq+64*10]
+ mova m4, [cq+64*12]
+ mova m28, [cq+64*14]
+ psubsw m1, m2, m21 ; 23
+ paddsw m2, m21 ; 8
+ psubsw m21, m11, m20 ; 22
+ paddsw m11, m20 ; 9
+ psubsw m20, m8, m19 ; 21
+ paddsw m8, m19 ; 10
+ psubsw m19, m29, m18 ; 20
+ paddsw m29, m18 ; 11
+ psubsw m18, m27, m17 ; 19
+ paddsw m27, m17 ; 12
+ psubsw m17, m26, m16 ; 18
+ paddsw m26, m16 ; 13
+ psubsw m16, m4, m15 ; 17
+ paddsw m4, m15 ; 14
+ psubsw m15, m28, m14 ; 16
+ paddsw m28, m14 ; 15
+ punpcklwd m14, m15, m16
+ punpckhwd m15, m16
+ punpckhwd m16, m17, m18
+ punpcklwd m17, m18
+ punpckhwd m18, m19, m20
+ punpcklwd m19, m20
+ punpckhwd m20, m21, m1
+ punpcklwd m21, m1
+ punpckhwd m1, m2, m11 ; i4 j4 i5 j5 i6 j6 i7 j7
+ punpcklwd m2, m11 ; i0 j1 i1 j1 i2 j2 i3 j3
+ punpckhwd m11, m8, m29 ; k4 l4 k5 l5 k6 l6 k7 l7
+ punpcklwd m8, m29 ; k0 l0 k1 l1 k2 l2 k3 l3
+ punpckhwd m29, m27, m26 ; m4 n4 m5 n5 m6 n6 m7 n7
+ punpcklwd m27, m26 ; m0 n0 m1 n1 m2 n2 m3 n3
+ punpckhwd m26, m4, m28 ; o4 p4 o5 p5 o6 p6 o7 p7
+ punpcklwd m4, m28 ; o0 p0 o1 p1 o2 p2 o3 p3
+ punpckhdq m28, m2, m8 ; i2 j2 k2 l2 i3 j3 k3 l3
+ punpckldq m2, m8 ; i0 j0 k0 l0 i1 j1 k1 l1
+ punpckhdq m8, m27, m4 ; m2 n2 o2 p2 m3 n3 o3 p3
+ punpckldq m27, m4 ; m0 n0 o0 p0 m1 n1 o1 p1
+ REPX {pmulhrsw x, m12}, m28, m2, m8, m27
+ punpckhdq m4, m1, m11 ; i6 j6 k6 l6 i7 j7 k7 l7
+ punpckldq m1, m11 ; i4 j4 k4 l4 i5 j5 k5 l5
+ punpckhdq m11, m29, m26 ; m6 n6 o6 p6 m7 n7 o7 p7
+ punpckldq m29, m26 ; m4 n4 o4 p4 m5 n5 o5 p5
+ REPX {pmulhrsw x, m12}, m4, m1, m11, m29
+ punpckhdq m26, m19, m21
+ punpckldq m19, m21
+ punpckhdq m21, m15, m16
+ punpckldq m15, m16
+ REPX {pmulhrsw x, m12}, m26, m19, m21, m15
+ punpckhdq m16, m18, m20
+ punpckldq m18, m20
+ punpckhdq m20, m14, m17
+ punpckldq m14, m17
+ REPX {pmulhrsw x, m12}, m16, m18, m20, m14
+ punpckhqdq m17, m28, m8 ; b03 b11 b19 b27
+ punpcklqdq m28, m8 ; b02 b10 b18 b26
+ punpckhqdq m8, m2, m27 ; b01 b09 b17 b25
+ punpcklqdq m2, m27 ; b00 b08 b16 b24
+ punpcklqdq m27, m1, m29 ; b04 b12 b20 b28
+ punpckhqdq m1, m29 ; b05 b13 b21 b29
+ punpcklqdq m29, m4, m11 ; b06 b14 b22 b30
+ punpckhqdq m4, m11 ; b07 b15 b23 b31
+ mova [cq+64* 0], m2
+ mova [cq+64* 8], m28
+ mova [cq+64* 4], m27
+ mova [cq+64*10], m29
+ punpckhqdq m27, m20, m26 ; c03 c11 c19 c27
+ punpcklqdq m20, m26 ; c02 c10 c18 c26
+ punpckhqdq m26, m14, m19 ; c01 c09 c17 c25
+ punpcklqdq m14, m19 ; c00 c08 c16 c24
+ punpckhqdq m28, m15, m18 ; c05 c13 c21 c29
+ punpcklqdq m15, m18 ; c04 c12 c20 c28
+ punpckhqdq m29, m21, m16 ; c07 c15 c23 c31
+ punpcklqdq m21, m16 ; c06 c14 c22 c30
+ mova [cq+64* 2], m14
+ mova [cq+64*12], m20
+ mova [cq+64* 6], m15
+ mova [cq+64*14], m21
+ vshufi32x4 m14, m22, m8, q3232 ; a17 a25 b17 b25
+ vinserti32x8 m22, ym8, 1 ; a01 a09 b01 b09
+ vshufi32x4 m15, m23, m17, q3232 ; a19 a27 b19 b27
+ vinserti32x8 m23, ym17, 1 ; a03 a11 b03 b11
+ vshufi32x4 m16, m24, m1, q3232 ; a21 a29 b21 b29
+ vinserti32x8 m24, ym1, 1 ; a05 a13 b05 b13
+ vshufi32x4 m17, m25, m4, q3232 ; a23 a31 b23 b31
+ vinserti32x8 m25, ym4, 1 ; a07 a15 b07 b15
+ vinserti32x8 m19, m26, ym5, 1 ; c01 c09 d01 d09
+ vshufi32x4 m26, m5, q3232 ; c17 c25 d17 d25
+ vinserti32x8 m20, m27, ym13, 1 ; c03 c11 d03 d11
+ vshufi32x4 m27, m13, q3232 ; c19 c27 d19 d27
+ vinserti32x8 m21, m28, ym3, 1 ; c05 c13 d05 d13
+ vshufi32x4 m28, m3, q3232 ; c21 c29 d21 d29
+ vinserti32x8 m18, m29, ym9, 1 ; c07 c15 d07 d15
+ vshufi32x4 m29, m9, q3232 ; c23 c31 d23 d31
+ mov r4, rsp
+ vshufi32x4 m0, m22, m19, q2020 ; 1
+ vshufi32x4 m1, m17, m29, q3131 ; 31
+ vshufi32x4 m2, m14, m26, q2020 ; 17
+ vshufi32x4 m3, m25, m18, q3131 ; 15
+ call .main_part1
+ vshufi32x4 m0, m25, m18, q2020 ; 7
+ vshufi32x4 m1, m14, m26, q3131 ; 25
+ vshufi32x4 m2, m17, m29, q2020 ; 23
+ vshufi32x4 m3, m22, m19, q3131 ; 9
+ call .main_part1
+ vshufi32x4 m0, m24, m21, q2020 ; 5
+ vshufi32x4 m1, m15, m27, q3131 ; 27
+ vshufi32x4 m2, m16, m28, q2020 ; 21
+ vshufi32x4 m3, m23, m20, q3131 ; 11
+ call .main_part1
+ vshufi32x4 m0, m23, m20, q2020 ; 3
+ vshufi32x4 m1, m16, m28, q3131 ; 29
+ vshufi32x4 m2, m15, m27, q2020 ; 19
+ vshufi32x4 m3, m24, m21, q3131 ; 13
+ call .main_part1
+ call .main_part2
+ mova m0, [cq+64* 1] ; a0
+ mova m15, [cq+64* 0] ; b0
+ mova m3, [cq+64* 2] ; c0
+ mova m16, [cq+64* 3] ; d0
+ mova m14, [cq+64* 5] ; a4
+ mova m8, [cq+64* 4] ; b4
+ mova m17, [cq+64* 6] ; c4
+ mova m1, [cq+64* 7] ; d4
+ vshufi32x4 m2, m0, m15, q3232 ; a16 a24 b16 b24
+ vinserti32x8 m0, ym15, 1 ; a00 a08 b00 b08
+ vshufi32x4 m15, m3, m16, q3232 ; c16 c24 d16 d24
+ vinserti32x8 m3, ym16, 1 ; c00 c08 d00 d08
+ vshufi32x4 m16, m14, m8, q3232 ; a20 a28 b20 b28
+ vinserti32x8 m14, ym8, 1 ; a04 a12 b04 b12
+ vshufi32x4 m8, m17, m1, q3232 ; c20 c28 d20 d28
+ vinserti32x8 m17, ym1, 1 ; c04 c12 d04 d12
+ vshufi32x4 m1, m0, m3, q3131 ; 8
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m2, m15, q3131 ; 24
+ vshufi32x4 m2, m15, q2020 ; 16
+ vshufi32x4 m15, m14, m17, q3131 ; 12
+ vshufi32x4 m14, m17, q2020 ; 4
+ vshufi32x4 m17, m16, m8, q3131 ; 28
+ vshufi32x4 m16, m8, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova m8, [cq+64* 8]
+ mova m9, [cq+64*12]
+ mova m11, [cq+64*10]
+ mova m12, [cq+64*14]
+ mova [cq+64* 0], m14
+ mova [cq+64* 2], m15
+ mova [cq+64* 4], m16
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64*12], m20
+ mova [cq+64*14], m21
+ mova m22, [cq+64* 9]
+ mova m27, [cq+64*13]
+ mova m23, [cq+64*11]
+ mova m24, [cq+64*15]
+ vshufi32x4 m26, m22, m8, q3232 ; a18 a26 b18 b26
+ vinserti32x8 m22, ym8, 1 ; a02 a10 b02 b10
+ vshufi32x4 m8, m9, m27, q3232 ; c18 c26 d18 d26
+ vinserti32x8 m9, ym27, 1 ; c02 c10 d02 d10
+ vshufi32x4 m27, m23, m11, q3232 ; a22 a30 b22 b30
+ vinserti32x8 m23, ym11, 1 ; a06 a14 b06 b14
+ vshufi32x4 m11, m12, m24, q3232 ; c22 c30 d22 d30
+ vinserti32x8 m12, ym24, 1 ; c06 c14 d06 d14
+ vshufi32x4 m28, m26, m8, q3131 ; 26
+ vshufi32x4 m26, m8, q2020 ; 18
+ vshufi32x4 m24, m22, m9, q3131 ; 10
+ vshufi32x4 m22, m9, q2020 ; 2
+ vshufi32x4 m29, m27, m11, q3131 ; 30
+ vshufi32x4 m27, m11, q2020 ; 22
+ vshufi32x4 m25, m23, m12, q3131 ; 14
+ vshufi32x4 m23, m12, q2020 ; 6
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ jmp .end
+.fast: ; bottom/right halves are zero
+ pmulhrsw ym9, ym23, [cq+64* 0]
+ pmulhrsw ym6, ym23, [cq+64* 8]
+ mova m14, [o(dup16_perm)]
+ pmulhrsw ym8, ym23, [cq+64* 2]
+ pmulhrsw xm0, xm23, [cq+64*14]
+ pmulhrsw xm5, xm23, [cq+64*10]
+ pmulhrsw ym1, ym23, [cq+64* 6]
+ pmulhrsw ym7, ym23, [cq+64* 4]
+ pmulhrsw xm3, xm23, [cq+64*12]
+ pmovzxwd m9, ym9
+ pmovzxwd m6, ym6
+ vpermb m8, m14, m8
+ punpcklwd xm0, xm0
+ vpermb ym5, ym14, ym5
+ vpermb m1, m14, m1
+ vpermb m7, m14, m7
+ punpcklwd xm3, xm3
+ pslld m9, 16
+ pslld m6, 16
+ call m(idct_16x16_internal_8bpc).main_fast
+ vpmulhrsw ym21, ym23, [cq+64* 1]
+ {evex}vpmulhrsw xm17, xm23, [cq+64*15] ; force EVEX encoding, which
+ {evex}vpmulhrsw xm20, xm23, [cq+64* 9] ; reduces code size due to
+ {evex}vpmulhrsw ym15, ym23, [cq+64* 7] ; compressed displacements
+ {evex}vpmulhrsw ym18, ym23, [cq+64* 5]
+ {evex}vpmulhrsw xm16, xm23, [cq+64*11]
+ {evex}vpmulhrsw xm19, xm23, [cq+64*13]
+ {evex}vpmulhrsw ym23, [cq+64* 3]
+ vpermb m21, m14, m21
+ punpcklwd xm17, xm17
+ vpermb ym20, ym14, ym20
+ vpermb m15, m14, m15
+ vpermb m18, m14, m18
+ vpermb ym16, ym14, ym16
+ punpcklwd xm19, xm19
+ vpermb m14, m14, m23
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m9, [o(pw_16384)]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).transpose_round
+ vshufi32x4 m16, m0, m3, q2020 ; 0
+ vshufi32x4 m26, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m14, m2, q2020 ; 1
+ vshufi32x4 m14, m2, q3131 ; 5
+ vshufi32x4 m3, m19, m7, q3131 ; 15
+ vshufi32x4 m19, m7, q2020 ; 11
+ vshufi32x4 m27, m17, m9, q2020 ; 3
+ vshufi32x4 m17, m9, q3131 ; 7
+ vshufi32x4 m28, m20, m6, q2020 ; 9
+ vshufi32x4 m20, m6, q3131 ; 13
+ vshufi32x4 m22, m1, m18, q2020 ; 2
+ vshufi32x4 m23, m1, m18, q3131 ; 6
+ vshufi32x4 m24, m5, m15, q2020 ; 10
+ vshufi32x4 m25, m5, m15, q3131 ; 14
+ vshufi32x4 m15, m21, m4, q3131 ; 12
+ vshufi32x4 m21, m21, m4, q2020 ; 8
+ mov r4, rsp
+ call .main_part1_fast
+ mova m0, m17
+ mova m3, m28
+ call .main_part1_fast
+ mova m0, m14
+ mova m3, m19
+ call .main_part1_fast
+ mova m0, m27
+ mova m3, m20
+ call .main_part1_fast
+ call .main_part2
+ mova m0, m16
+ mova m1, m21
+ mova m14, m26
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [cq+64*14], m21
+ mova [cq+64* 0], m14
+ mova [cq+64* 6], m17
+ mova [cq+64* 8], m18
+ mova [cq+64*10], m19
+ mova [cq+64* 4], m16
+ mova [cq+64* 2], m15
+ mova [cq+64*12], m20
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+.end:
+ lea r4, [strideq*3]
+ vpbroadcastd m12, [o(pw_2048)]
+ movshdup m13, [o(permD)]
+ lea r5, [r4+strideq] ; stride*4
+ lea r3, [dstq+r4*8]
+ lea r6, [strideq+r5*8] ; stride*33
+ lea r8, [r4+r5*8] ; stride*35
+ add r3, r5 ; dst+stride*28
+ lea r7, [r6+strideq] ; stride*34
+%macro IDCT_32x64_END 6 ; src, mem, stride[1-4]
+%if %2 < 8
+ paddsw m10, m%2, m%1
+ psubsw m11, m%2, m%1
+%else
+ mova m11, [cq+64*(%2*2-16)]
+ paddsw m10, m11, m%1
+ psubsw m11, m%1
+%endif
+ mova m9, [rsp+64*(31-%2)]
+ mova m%1, [rsp+64*%2]
+ paddsw m8, m10, m9
+ psubsw m10, m9
+ paddsw m9, m11, m%1
+ pmovzxbw m0, [dstq+%3]
+ psubsw m11, m%1
+ pmovzxbw m%1, [r3 +%4]
+ REPX {pmulhrsw x, m12}, m8, m10, m9, m11
+ paddw m8, m0
+ pmovzxbw m0, [r3 +%5]
+ paddw m10, m%1
+ pmovzxbw m%1, [dstq+%6]
+ paddw m9, m0
+ paddw m11, m%1
+%if %2 >= 8
+%if %2 == 8
+ pxor m1, m1
+%endif
+ mova [cq+64*(%2*2-16)], m1
+ mova [cq+64*(%2*2-15)], m1
+%endif
+ packuswb m8, m10
+ packuswb m9, m11
+ vpermq m8, m13, m8
+ vpermq m9, m13, m9
+ mova [dstq+%3], ym8
+ vextracti32x8 [r3 +%4], m8, 1
+ mova [r3 +%5], ym9
+ vextracti32x8 [dstq+%6], m9, 1
+%if %2 == 3 || %2 == 7 || %2 == 11
+ add dstq, r5
+ sub r3, r5
+%endif
+%endmacro
+ IDCT_32x64_END 29, 0, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 28, 1, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 27, 2, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 26, 3, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 25, 4, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 24, 5, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 23, 6, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 22, 7, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 21, 8, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 20, 9, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 19, 10, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 18, 11, r4 , r5*8, strideq*0, r8
+ IDCT_32x64_END 17, 12, strideq*0, r8, r4 , r5*8
+ IDCT_32x64_END 16, 13, strideq*1, r7, strideq*2, r6
+ IDCT_32x64_END 15, 14, strideq*2, r6, strideq*1, r7
+ IDCT_32x64_END 14, 15, r4 , r5*8, strideq*0, r8
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ imul r6d, 181
+ mov r3d, 64
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_32x8_8bpc).dconly3
+ALIGN function_align ; bottom three-quarters are zero
+.main_part1_fast:
+ vpbroadcastd m1, [o(idct64_mul+4*0)]
+ vpbroadcastd m8, [o(idct64_mul+4*1)]
+ vpbroadcastd m2, [o(idct64_mul+4*6)]
+ vpbroadcastd m9, [o(idct64_mul+4*7)]
+ pmulhrsw m1, m0 ; t63a
+ pmulhrsw m0, m8 ; t32a
+ pmulhrsw m2, m3 ; t60a
+ pmulhrsw m3, m9 ; t35a
+ mova m8, m0
+ mova m7, m1
+ mova m6, m3
+ mova m5, m2
+ jmp .main_part1b
+.main_part1:
+ ; idct64 steps 1-5:
+ ; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ ; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ ; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ ; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+ vpbroadcastd m7, [o(idct64_mul+4*0)]
+ vpbroadcastd m8, [o(idct64_mul+4*1)]
+ vpbroadcastd m6, [o(idct64_mul+4*2)]
+ vpbroadcastd m9, [o(idct64_mul+4*3)]
+ pmulhrsw m7, m0 ; t63a
+ vpbroadcastd m5, [o(idct64_mul+4*4)]
+ pmulhrsw m0, m8 ; t32a
+ vpbroadcastd m8, [o(idct64_mul+4*5)]
+ pmulhrsw m6, m1 ; t62a
+ vpbroadcastd m4, [o(idct64_mul+4*6)]
+ pmulhrsw m1, m9 ; t33a
+ vpbroadcastd m9, [o(idct64_mul+4*7)]
+ pmulhrsw m5, m2 ; t61a
+ pmulhrsw m2, m8 ; t34a
+ pmulhrsw m4, m3 ; t60a
+ pmulhrsw m3, m9 ; t35a
+ psubsw m8, m0, m1 ; t33
+ paddsw m0, m1 ; t32
+ psubsw m1, m7, m6 ; t62
+ paddsw m7, m6 ; t63
+ psubsw m6, m3, m2 ; t34
+ paddsw m3, m2 ; t35
+ psubsw m2, m4, m5 ; t61
+ paddsw m5, m4 ; t60
+.main_part1b:
+ vpbroadcastd m11, [o(idct64_mul+4*8)]
+ vpbroadcastd m12, [o(idct64_mul+4*9)]
+ ITX_MULSUB_2W 1, 8, 4, 9, 10, 11, 12 ; t33a, t62a
+ vpbroadcastd m11, [o(idct64_mul+4*10)]
+ ITX_MULSUB_2W 2, 6, 4, 9, 10, 12, 11 ; t34a, t61a
+ vpbroadcastd m11, [o(idct64_mul+4*11)]
+ vpbroadcastd m12, [o(idct64_mul+4*12)]
+ psubsw m4, m0, m3 ; t35a
+ paddsw m0, m3 ; t32a
+ psubsw m3, m7, m5 ; t60a
+ paddsw m7, m5 ; t63a
+ psubsw m5, m1, m2 ; t34
+ paddsw m1, m2 ; t33
+ psubsw m2, m8, m6 ; t61
+ paddsw m6, m8 ; t62
+ add r5, 4*13
+ ITX_MULSUB_2W 3, 4, 8, 9, 10, 11, 12 ; t35, t60
+ ITX_MULSUB_2W 2, 5, 8, 9, 10, 11, 12 ; t34a, t61a
+ mova [r4+64*0], m0
+ mova [r4+64*7], m7
+ mova [r4+64*1], m1
+ mova [r4+64*6], m6
+ mova [r4+64*3], m3
+ mova [r4+64*4], m4
+ mova [r4+64*2], m2
+ mova [r4+64*5], m5
+ add r4, 64*8
+ ret
+.main_part2:
+ vpbroadcastd m11, [o(pw_1567_3784 -16*13)]
+ vpbroadcastd m12, [o(pw_m3784_1567 -16*13)]
+ lea r6, [r4+64*7]
+ vpbroadcastd m17, [o(pw_m1567_m3784-16*13)]
+ vpbroadcastd m18, [o(pw_2896_2896 -16*13)]
+ vpbroadcastd m19, [o(pw_m2896_2896 -16*13)]
+ sub r5, 16*13
+.main_part2_loop:
+ mova m0, [r4-64*32] ; t32a
+ mova m1, [r6-64*24] ; t39a
+ mova m2, [r6-64*32] ; t63a
+ mova m3, [r4-64*24] ; t56a
+ mova m4, [r4-64*16] ; t40a
+ mova m5, [r6-64* 8] ; t47a
+ mova m6, [r6-64*16] ; t55a
+ mova m7, [r4-64* 8] ; t48a
+ psubsw m8, m0, m1 ; t39
+ paddsw m0, m1 ; t32
+ psubsw m1, m2, m3 ; t56
+ paddsw m2, m3 ; t63
+ psubsw m3, m5, m4 ; t40
+ paddsw m5, m4 ; t47
+ psubsw m4, m7, m6 ; t55
+ paddsw m7, m6 ; t48
+ ITX_MULSUB_2W 1, 8, 6, 9, 10, 11, 12 ; t39a, t56a
+ ITX_MULSUB_2W 4, 3, 6, 9, 10, 12, 17 ; t40a, t55a
+ psubsw m6, m2, m7 ; t48a
+ paddsw m2, m7 ; t63a
+ psubsw m7, m0, m5 ; t47a
+ paddsw m0, m5 ; t32a
+ psubsw m5, m8, m3 ; t55
+ paddsw m8, m3 ; t56
+ psubsw m3, m1, m4 ; t40
+ paddsw m1, m4 ; t39
+ ITX_MULSUB_2W 6, 7, 4, 9, 10, 18, 19 ; t47, t48
+ ITX_MULSUB_2W 5, 3, 4, 9, 10, 18, 19 ; t40a, t55a
+ mova [r6-64* 8], m2
+ mova [r4-64*32], m0
+ mova [r4-64* 8], m8
+ mova [r6-64*32], m1
+ mova [r6-64*24], m6
+ mova [r4-64*16], m7
+ mova [r4-64*24], m5
+ mova [r6-64*16], m3
+ add r4, 64
+ sub r6, 64
+ cmp r4, r6
+ jb .main_part2_loop
+ ret
+
+cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 30, 64*32, dst, stride, c, eob
+ vpbroadcastd m23, [o(pw_2896x8)]
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ pmulhrsw m0, m23, [cq+64* 1]
+ pmulhrsw m1, m23, [cq+64*31]
+ pmulhrsw m2, m23, [cq+64*17]
+ pmulhrsw m3, m23, [cq+64*15]
+ vpbroadcastd m10, [o(pd_2048)]
+ mov r4, rsp
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 7]
+ pmulhrsw m1, m23, [cq+64*25]
+ pmulhrsw m2, m23, [cq+64*23]
+ pmulhrsw m3, m23, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 5]
+ pmulhrsw m1, m23, [cq+64*27]
+ pmulhrsw m2, m23, [cq+64*21]
+ pmulhrsw m3, m23, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ pmulhrsw m0, m23, [cq+64* 3]
+ pmulhrsw m1, m23, [cq+64*29]
+ pmulhrsw m2, m23, [cq+64*19]
+ pmulhrsw m3, m23, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ pmulhrsw m3, m23, [cq+64*24]
+ pmulhrsw m1, m23, [cq+64* 8]
+ pmulhrsw m2, m23, [cq+64*16]
+ pmulhrsw m0, m23, [cq+64* 0]
+ pmulhrsw m14, m23, [cq+64* 4]
+ pmulhrsw m17, m23, [cq+64*28]
+ pmulhrsw m16, m23, [cq+64*20]
+ pmulhrsw m15, m23, [cq+64*12]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ pmulhrsw m22, m23, [cq+64* 2]
+ pmulhrsw m29, m23, [cq+64*30]
+ pmulhrsw m26, m23, [cq+64*18]
+ pmulhrsw m25, m23, [cq+64*14]
+ pmulhrsw m24, m23, [cq+64*10]
+ pmulhrsw m27, m23, [cq+64*22]
+ pmulhrsw m28, m23, [cq+64*26]
+ pmulhrsw m23, [cq+64* 6]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_16384)]
+ call .pass1_end_part1
+ mova [cq+64*16], m1
+ mova [cq+64*17], m3
+ mova [cq+64*18], m5
+ mova [cq+64*19], m7
+ mova [cq+64*24], m23
+ mova [cq+64*25], m25
+ mova [cq+64*26], m27
+ mova [cq+64*27], m29
+ pmulhrsw m23, m13, m0 ; a0
+ pmulhrsw m25, m13, m2 ; a2
+ pmulhrsw m27, m13, m4 ; a4
+ pmulhrsw m29, m13, m6 ; a6
+ REPX {pmulhrsw x, m13}, m22, m24, m26, m28 ; e0 e2 e4 e6
+ call .pass1_end_part2
+ mova [cq+64*20], m15
+ mova [cq+64*21], m17
+ mova [cq+64*22], m19
+ mova [cq+64*23], m21
+ mova [cq+64*28], m1
+ mova [cq+64*29], m3
+ mova [cq+64*30], m5
+ mova [cq+64*31], m7
+ REPX {pmulhrsw x, m13}, m14, m16, m18, m20 ; c0 c2 c4 c6
+ REPX {pmulhrsw x, m13}, m0, m2, m4, m6 ; g0 g2 g4 g6
+ vinserti32x8 m3, m23, ym14, 1 ; a00 a01 c00 c01
+ vshufi32x4 m23, m14, q3232 ; a02 a03 c02 c03
+ vinserti32x8 m15, m22, ym0, 1 ; e00 e01 g00 g01
+ vshufi32x4 m22, m0, q3232 ; e02 e03 g02 g03
+ vinserti32x8 m1, m27, ym18, 1 ; a40 a41 c40 c41
+ vshufi32x4 m27, m18, q3232 ; a42 a43 c42 c43
+ vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41
+ vshufi32x4 m26, m4, q3232 ; e42 e43 g42 g43
+ vinserti32x8 m14, m25, ym16, 1 ; a20 a21 c20 c21
+ vshufi32x4 m25, m16, q3232 ; a22 a23 c22 c23
+ vinserti32x8 m17, m24, ym2, 1 ; e20 e21 g20 g21
+ vshufi32x4 m24, m2, q3232 ; e22 e23 g22 g23
+ vinserti32x8 m19, m29, ym20, 1 ; a60 a61 c60 c61
+ vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63
+ vinserti32x8 m20, m28, ym6, 1 ; e60 e61 g60 g61
+ vshufi32x4 m28, m6, q3232 ; e62 e63 g62 g63
+ vshufi32x4 m2, m3, m15, q3131 ; 8
+ vshufi32x4 m0, m3, m15, q2020 ; 0
+ vshufi32x4 m6, m23, m22, q3131 ; 24
+ vshufi32x4 m4, m23, m22, q2020 ; 16
+ vshufi32x4 m3, m1, m18, q3131 ; 12
+ vshufi32x4 m1, m18, q2020 ; 4
+ vshufi32x4 m7, m27, m26, q3131 ; 28
+ vshufi32x4 m5, m27, m26, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m16, m14, m17, q3131 ; 10
+ vshufi32x4 m14, m17, q2020 ; 2
+ vshufi32x4 m17, m19, m20, q3131 ; 14
+ vshufi32x4 m15, m19, m20, q2020 ; 6
+ vshufi32x4 m20, m25, m24, q3131 ; 26
+ vshufi32x4 m18, m25, m24, q2020 ; 18
+ vshufi32x4 m21, m29, m28, q3131 ; 30
+ vshufi32x4 m19, m29, m28, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ pmulhrsw m22, m13, [cq+64*16] ; a1
+ pmulhrsw m23, m13, [cq+64*20] ; c1
+ pmulhrsw m24, m13, [cq+64*24] ; e1
+ pmulhrsw m25, m13, [cq+64*28] ; g1
+ pmulhrsw m26, m13, [cq+64*17] ; a3
+ pmulhrsw m27, m13, [cq+64*21] ; c3
+ pmulhrsw m28, m13, [cq+64*25] ; e3
+ pmulhrsw m29, m13, [cq+64*29] ; g3
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ pmulhrsw m14, m13, [cq+64*18] ; a5
+ pmulhrsw m15, m13, [cq+64*22] ; c5
+ pmulhrsw m16, m13, [cq+64*26] ; e5
+ pmulhrsw m17, m13, [cq+64*30] ; g5
+ pmulhrsw m18, m13, [cq+64*19] ; a7
+ pmulhrsw m19, m13, [cq+64*23] ; c7
+ pmulhrsw m20, m13, [cq+64*27] ; e7
+ pmulhrsw m21, m13, [cq+64*31] ; g7
+ vinserti32x8 m8, m22, ym23, 1 ; a10 a11 c10 c11
+ vshufi32x4 m22, m23, q3232 ; a12 a13 c12 c13
+ vinserti32x8 m9, m24, ym25, 1 ; e10 e11 g10 g11
+ vshufi32x4 m24, m25, q3232 ; e12 e13 g12 g13
+ vinserti32x8 m23, m26, ym27, 1 ; a30 a31 c30 c31
+ vshufi32x4 m26, m27, q3232 ; a32 a33 c32 c33
+ vinserti32x8 m11, m28, ym29, 1 ; e30 e31 g30 g31
+ vshufi32x4 m28, m29, q3232 ; e32 e33 g32 g33
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ vinserti32x8 m12, m14, ym15, 1 ; a50 a51 c50 c51
+ vshufi32x4 m14, m15, q3232 ; a52 a53 c52 c53
+ vinserti32x8 m13, m16, ym17, 1 ; e50 e51 g50 g51
+ vshufi32x4 m16, m17, q3232 ; e52 e53 g52 g53
+ vinserti32x8 m25, m18, ym19, 1 ; a70 a71 c70 c71
+ vshufi32x4 m18, m19, q3232 ; a72 a73 c72 c73
+ vinserti32x8 m17, m20, ym21, 1 ; e70 e71 g70 g71
+ vshufi32x4 m20, m21, q3232 ; e72 e73 g72 g73
+ vshufi32x4 m27, m23, m11, q3131 ; 11 m27
+ vshufi32x4 m23, m11, q2020 ; 3 m23
+ vshufi32x4 m19, m26, m28, q3131 ; 27 m19
+ vshufi32x4 m15, m26, m28, q2020 ; 19 m15
+ vshufi32x4 m29, m25, m17, q3131 ; 15 m29
+ vshufi32x4 m25, m17, q2020 ; 7 m25
+ vshufi32x4 m21, m18, m20, q3131 ; 31 m21
+ vshufi32x4 m17, m18, m20, q2020 ; 23 m17
+ vshufi32x4 m20, m14, m16, q3131 ; 29 m20
+ vshufi32x4 m16, m14, m16, q2020 ; 21 m16
+ vshufi32x4 m18, m22, m24, q3131 ; 25 m18
+ vshufi32x4 m14, m22, m24, q2020 ; 17 m14
+ vshufi32x4 m26, m8, m9, q3131 ; 9 m26
+ vshufi32x4 m22, m8, m9, q2020 ; 1 m22
+ vshufi32x4 m28, m12, m13, q3131 ; 13 m28
+ vshufi32x4 m24, m12, m13, q2020 ; 5 m24
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ vpbroadcastd m13, [o(pw_16384)]
+ pmulhrsw m0, m13, [r4-64*21]
+ pmulhrsw m1, m13, [r4-64*22]
+ pmulhrsw m2, m13, [r4-64*23]
+ pmulhrsw m3, m13, [r4-64*24]
+ pmulhrsw m4, m13, [r4-64*25]
+ pmulhrsw m5, m13, [r4-64*26]
+ pmulhrsw m6, m13, [r4-64*27]
+ pmulhrsw m7, m13, [r4-64*28]
+ mova [cq+64*16], m14
+ mova [cq+64*17], m15
+ mova [cq+64*18], m16
+ mova [cq+64*19], m17
+ mova [cq+64*20], m18
+ mova [cq+64*21], m19
+ mova [cq+64*22], m20
+ mova [cq+64*23], m21
+ pmulhrsw m14, m13, [r4-64*12]
+ pmulhrsw m15, m13, [r4-64*11]
+ pmulhrsw m16, m13, [r4-64*10]
+ pmulhrsw m17, m13, [r4-64* 9]
+ pmulhrsw m18, m13, [r4-64* 8]
+ pmulhrsw m19, m13, [r4-64* 7]
+ pmulhrsw m20, m13, [r4-64* 6]
+ pmulhrsw m21, m13, [r4-64* 5]
+ mova [cq+64*24], m22
+ mova [cq+64*25], m23
+ mova [cq+64*26], m24
+ mova [cq+64*27], m25
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call .transpose_2x8x8_lo
+ mova [r4-64*12], m1
+ mova [r4-64*11], m3
+ mova [r4-64*10], m5
+ mova [r4-64* 9], m7
+ mova [r4-64* 8], m15
+ mova [r4-64* 7], m17
+ mova [r4-64* 6], m19
+ mova [r4-64* 5], m21
+ vinserti32x8 m22, m0, ym14, 1 ; f00 f01 h00 h01
+ vshufi32x4 m23, m0, m14, q3232 ; f02 f03 h02 h03
+ vinserti32x8 m24, m2, ym16, 1 ; f20 f21 h20 h21
+ vshufi32x4 m25, m2, m16, q3232 ; f22 f23 h22 h23
+ vinserti32x8 m26, m4, ym18, 1 ; f40 f41 h40 h41
+ vshufi32x4 m27, m4, m18, q3232 ; f42 f43 h42 h43
+ vinserti32x8 m28, m6, ym20, 1 ; f60 f61 h60 h61
+ vshufi32x4 m29, m6, m20, q3232 ; f62 f63 h62 h63
+ pmulhrsw m0, m13, [r4-64*20]
+ pmulhrsw m1, m13, [r4-64*19]
+ pmulhrsw m2, m13, [r4-64*18]
+ pmulhrsw m3, m13, [r4-64*17]
+ pmulhrsw m4, m13, [r4-64*16]
+ pmulhrsw m5, m13, [r4-64*15]
+ pmulhrsw m6, m13, [r4-64*14]
+ pmulhrsw m7, m13, [r4-64*13]
+ pmulhrsw m14, m13, [r4-64*29]
+ pmulhrsw m15, m13, [r4-64*30]
+ pmulhrsw m16, m13, [r4-64*31]
+ pmulhrsw m17, m13, [r4-64*32]
+ pmulhrsw m18, m13, [r4-64*33]
+ pmulhrsw m19, m13, [r4-64*34]
+ pmulhrsw m20, m13, [r4-64*35]
+ pmulhrsw m21, m13, [r4-64*36]
+ call .transpose_2x8x8_lo
+ mova [r4-64*20], m1
+ mova [r4-64*19], m3
+ mova [r4-64*18], m5
+ mova [r4-64*17], m7
+ mova [r4-64*16], m15
+ mova [r4-64*15], m17
+ mova [r4-64*14], m19
+ mova [r4-64*13], m21
+ vinserti32x8 m1, m4, ym18, 1 ; b40 b41 d40 d41
+ vshufi32x4 m5, m4, m18, q3232 ; b42 b43 d42 d43
+ vshufi32x4 m4, m0, m14, q3232 ; b02 b03 d02 d03
+ vinserti32x8 m0, ym14, 1 ; b00 b01 d00 d01
+ vinserti32x8 m14, m2, ym16, 1 ; b20 b21 d20 d21
+ vshufi32x4 m18, m2, m16, q3232 ; b22 b23 d22 d23
+ vinserti32x8 m15, m6, ym20, 1 ; b60 b61 d60 d61
+ vshufi32x4 m19, m6, m20, q3232 ; b62 b63 d62 d63
+ vshufi32x4 m2, m0, m22, q3131 ; 8
+ vshufi32x4 m0, m22, q2020 ; 0
+ vshufi32x4 m3, m1, m26, q3131 ; 12
+ vshufi32x4 m1, m26, q2020 ; 4
+ vshufi32x4 m6, m4, m23, q3131 ; 24
+ vshufi32x4 m4, m23, q2020 ; 16
+ vshufi32x4 m7, m5, m27, q3131 ; 28
+ vshufi32x4 m5, m27, q2020 ; 20
+ call m(inv_txfm_add_dct_dct_32x8_8bpc).main
+ vshufi32x4 m16, m14, m24, q3131 ; 10
+ vshufi32x4 m14, m24, q2020 ; 2
+ vshufi32x4 m17, m15, m28, q3131 ; 14
+ vshufi32x4 m15, m28, q2020 ; 6
+ vshufi32x4 m20, m18, m25, q3131 ; 26
+ vshufi32x4 m18, m25, q2020 ; 18
+ vshufi32x4 m21, m19, m29, q3131 ; 30
+ vshufi32x4 m19, m29, q2020 ; 22
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf
+ mova m22, [r4-64*20]
+ mova m26, [r4-64*16]
+ mova m23, [r4-64*19]
+ mova m27, [r4-64*15]
+ mova m24, [r4-64*18]
+ mova m28, [r4-64*14]
+ mova m25, [r4-64*17]
+ mova m29, [r4-64*13]
+ mova [r4-64*20], m14
+ mova [r4-64*19], m15
+ mova [r4-64*18], m16
+ mova [r4-64*17], m17
+ mova [r4-64*16], m18
+ mova [r4-64*15], m19
+ mova [r4-64*14], m20
+ mova [r4-64*13], m21
+ mova m19, [r4-64*12]
+ mova m11, [r4-64* 8]
+ mova m20, [r4-64*11]
+ mova m12, [r4-64* 7]
+ mova m21, [r4-64*10]
+ mova m8, [r4-64* 6]
+ mova m9, [r4-64* 9]
+ mova m18, [r4-64* 5]
+ vshufi32x4 m14, m22, m26, q3232 ; b12 b13 d12 d13
+ vinserti32x8 m22, ym26, 1 ; b10 b11 d10 d11
+ vshufi32x4 m15, m23, m27, q3232 ; b32 b33 d32 d33
+ vinserti32x8 m23, ym27, 1 ; b30 b31 d30 d31
+ vshufi32x4 m16, m24, m28, q3232 ; b52 b53 d52 d53
+ vinserti32x8 m24, ym28, 1 ; b50 b51 d50 d51
+ vshufi32x4 m17, m25, m29, q3232 ; b72 b73 d72 d73
+ vinserti32x8 m25, ym29, 1 ; b70 b71 d70 d71
+ vinserti32x8 m27, m19, ym11, 1 ; f10 f11 h10 h11
+ vshufi32x4 m19, m11, q3232 ; f12 f13 h12 h13
+ vinserti32x8 m28, m20, ym12, 1 ; f30 f31 h30 h31
+ vshufi32x4 m20, m12, q3232 ; f32 f33 h32 h33
+ vinserti32x8 m29, m21, ym8, 1 ; f50 f51 h50 h51
+ vshufi32x4 m21, m8, q3232 ; f52 f53 h52 h53
+ vinserti32x8 m8, m9, ym18, 1 ; f70 f71 h70 h71
+ vshufi32x4 m9, m18, q3232 ; f72 f73 h72 h73
+ vshufi32x4 m26, m22, m27, q3131 ; 9
+ vshufi32x4 m22, m27, q2020 ; 1
+ vshufi32x4 m27, m23, m28, q3131 ; 11
+ vshufi32x4 m23, m28, q2020 ; 3
+ vshufi32x4 m28, m24, m29, q3131 ; 13
+ vshufi32x4 m24, m29, q2020 ; 5
+ vshufi32x4 m29, m25, m8, q3131 ; 15
+ vshufi32x4 m25, m8, q2020 ; 7
+ vshufi32x4 m18, m14, m19, q3131 ; 25
+ vshufi32x4 m14, m19, q2020 ; 17
+ vshufi32x4 m19, m15, m20, q3131 ; 27
+ vshufi32x4 m15, m20, q2020 ; 19
+ vshufi32x4 m20, m16, m21, q3131 ; 29
+ vshufi32x4 m16, m21, q2020 ; 21
+ vshufi32x4 m21, m17, m9, q3131 ; 31
+ vshufi32x4 m17, m9, q2020 ; 23
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf
+ jmp .end
+.fast: ; bottom/right halves are zero
+ {evex}vpmulhrsw ym8, ym23, [cq+64* 4]
+ {evex}vpmulhrsw xm1, xm23, [cq+64*12]
+ mova m28, [o(dup16_perm)]
+ {evex}vpmulhrsw ym7, ym23, [cq+64* 8]
+ vpmulhrsw ym22, ym23, [cq+64* 0]
+ vpermb m8, m28, m8
+ vpermb ym1, ym28, ym1
+ vpermb m7, m28, m7
+ pmovzxwd m9, ym22
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ {evex}vpmulhrsw ym21, ym23, [cq+64* 2]
+ {evex}vpmulhrsw xm15, xm23, [cq+64*14]
+ {evex}vpmulhrsw xm18, xm23, [cq+64*10]
+ {evex}vpmulhrsw ym14, ym23, [cq+64* 6]
+ vpermb m21, m28, m21
+ punpcklwd xm15, xm15
+ vpermb ym18, ym28, ym18
+ vpermb m14, m28, m14
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vpmulhrsw ym22, ym23, [cq+64* 1]
+ {evex}vpmulhrsw xm29, xm23, [cq+64*15]
+ {evex}vpmulhrsw xm26, xm23, [cq+64* 9]
+ {evex}vpmulhrsw ym25, ym23, [cq+64* 7]
+ {evex}vpmulhrsw ym24, ym23, [cq+64* 5]
+ {evex}vpmulhrsw xm27, xm23, [cq+64*11]
+ {evex}vpmulhrsw xm8, xm23, [cq+64*13]
+ {evex}vpmulhrsw ym23, [cq+64* 3]
+ vpermb m22, m28, m22
+ punpcklwd xm29, xm29
+ vpermb ym26, ym28, ym26
+ vpermb m25, m28, m25
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ REPX {vpermb x, m28, x}, m24, m27, m23
+ punpcklwd xm28, xm8, xm8
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+ mov r4, rsp
+ vpbroadcastd m13, [o(pw_16384)]
+ mova [r4+64*16], m4
+ mova [r4+64*17], m5
+ mova [r4+64*18], m6
+ mova [r4+64*19], m7
+ mova [r4+64*28], m26
+ mova [r4+64*29], m27
+ mova [r4+64*30], m28
+ mova [r4+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
+ mova [r4+64*20], m22
+ mova [r4+64*21], m23
+ mova [r4+64*22], m24
+ mova [r4+64*23], m25
+ mova [r4+64*24], m26
+ mova [r4+64*25], m27
+ mova [r4+64*26], m28
+ mova [r4+64*27], m29
+ call .pass2_fast
+ mova [cq+64* 8], m14
+ mova [cq+64* 9], m15
+ mova [cq+64*10], m16
+ mova [cq+64*11], m17
+ mova [cq+64*12], m18
+ mova [cq+64*13], m19
+ mova [cq+64*14], m20
+ mova [cq+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ mova [cq+64* 0], m0
+ mova [cq+64* 1], m1
+ mova [cq+64* 2], m2
+ mova [cq+64* 3], m3
+ mova [cq+64* 4], m4
+ mova [cq+64* 5], m5
+ mova [cq+64* 6], m6
+ mova [cq+64* 7], m7
+ pmulhrsw m0, m13, [r4+64*16]
+ pmulhrsw m1, m13, [r4+64*17]
+ pmulhrsw m2, m13, [r4+64*18]
+ pmulhrsw m3, m13, [r4+64*19]
+ pmulhrsw m4, m13, [r4+64*20]
+ pmulhrsw m5, m13, [r4+64*21]
+ pmulhrsw m6, m13, [r4+64*22]
+ pmulhrsw m7, m13, [r4+64*23]
+ mova [cq+64*16], m14
+ mova [cq+64*17], m15
+ mova [cq+64*18], m16
+ mova [cq+64*19], m17
+ mova [cq+64*20], m18
+ mova [cq+64*21], m19
+ mova [cq+64*22], m20
+ mova [cq+64*23], m21
+ pmulhrsw m14, m13, [r4+64*24]
+ pmulhrsw m15, m13, [r4+64*25]
+ pmulhrsw m16, m13, [r4+64*26]
+ pmulhrsw m17, m13, [r4+64*27]
+ pmulhrsw m18, m13, [r4+64*28]
+ pmulhrsw m19, m13, [r4+64*29]
+ pmulhrsw m20, m13, [r4+64*30]
+ pmulhrsw m21, m13, [r4+64*31]
+ mova [cq+64*24], m22
+ mova [cq+64*25], m23
+ mova [cq+64*26], m24
+ mova [cq+64*27], m25
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
+ call .pass2_fast
+ mova [r4+64*16], m14
+ mova [r4+64*17], m15
+ mova [r4+64*18], m16
+ mova [r4+64*19], m17
+ mova [r4+64*20], m18
+ mova [r4+64*21], m19
+ mova [r4+64*22], m20
+ mova [r4+64*23], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+.end:
+ vpbroadcastd m13, [o(pw_2048)]
+ lea r5, [strideq*3]
+ pxor m12, m12
+ lea r3, [dstq+r5*8]
+ lea r6, [strideq+r5] ; stride*4
+ add r3, r6 ; dst+stride*28
+%macro IDCT_64x32_END 5 ; src16, src32, mem, off_lo, off_hi
+ mova m11, [cq+64*( %3)] ; 0
+ mova m9, [cq+64*(31-%3)] ; 31
+%if %3 >= 8
+ mova m%1, [rsp+64*(%1+16)]
+%endif
+ mova m10, [dstq+%4]
+ paddsw m8, m11, m9
+ psubsw m11, m9
+ paddsw m9, m%1, m%2
+ psubsw m%1, m%2
+ punpcklbw m%2, m10, m12
+ punpckhbw m10, m12
+ pmulhrsw m8, m13
+ pmulhrsw m9, m13
+ paddw m8, m%2
+ paddw m9, m10
+ mova m10, [r3+%5]
+ pmulhrsw m11, m13
+ pmulhrsw m%1, m13
+ mova [cq+64*( %3)], m12
+ mova [cq+64*(31-%3)], m12
+ punpcklbw m%2, m10, m12
+ punpckhbw m10, m12
+ packuswb m8, m9
+ paddw m11, m%2
+ paddw m%1, m10
+ packuswb m11, m%1
+ mova [dstq+%4], m8
+ mova [r3 +%5], m11
+%if %3 == 3 || %3 == 7 || %3 == 11
+ add dstq, r6
+ sub r3, r6
+%endif
+%endmacro
+ IDCT_64x32_END 0, 29, 0, strideq*0, r5
+ IDCT_64x32_END 1, 28, 1, strideq*1, strideq*2
+ IDCT_64x32_END 2, 27, 2, strideq*2, strideq*1
+ IDCT_64x32_END 3, 26, 3, r5 , strideq*0
+ IDCT_64x32_END 4, 25, 4, strideq*0, r5
+ IDCT_64x32_END 5, 24, 5, strideq*1, strideq*2
+ IDCT_64x32_END 6, 23, 6, strideq*2, strideq*1
+ IDCT_64x32_END 7, 22, 7, r5 , strideq*0
+ IDCT_64x32_END 0, 21, 8, strideq*0, r5
+ IDCT_64x32_END 1, 20, 9, strideq*1, strideq*2
+ IDCT_64x32_END 2, 19, 10, strideq*2, strideq*1
+ IDCT_64x32_END 3, 18, 11, r5 , strideq*0
+ IDCT_64x32_END 4, 17, 12, strideq*0, r5
+ IDCT_64x32_END 5, 16, 13, strideq*1, strideq*2
+ IDCT_64x32_END 6, 15, 14, strideq*2, strideq*1
+ IDCT_64x32_END 7, 14, 15, r5 , strideq*0
+ RET
+ALIGN function_align
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ imul r6d, 181
+ mov r3d, 32
+ add r6d, 128
+ sar r6d, 8
+ imul r6d, 181
+ add r6d, 128+256
+ sar r6d, 8+1
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly2
+ALIGN function_align
+.pass1_end_part1:
+%macro IDCT_64x32_PASS1_END 3 ; src16, src32, src64
+%if %1 != %3
+ mova m%1, [cq+64*%1]
+%endif
+ mova m9, [r4+64*(%3-36)] ; idct64 32+n
+ mova m11, [r4+64*(-5-%3)] ; idct64 63-n
+ psubsw m8, m%1, m%2 ; idct32 31-n
+ paddsw m%1, m%2 ; idct32 0+n
+%if %1 == %3
+ psubsw m%2, m8, m9 ; out 32+n e
+ paddsw m8, m9 ; out 31-n d
+ psubsw m9, m%1, m11 ; out 63-n h
+ paddsw m%1, m11 ; out 0+n a
+%else
+ paddsw m%2, m8, m9 ; out 23-n c
+ psubsw m8, m9 ; out 40+n f
+ paddsw m9, m%1, m11 ; out 8+n b
+ psubsw m%1, m11 ; out 55-n g
+%endif
+ mova [r4+64*(%3-36)], m8
+ mova [r4+64*(-5-%3)], m9
+%endmacro
+ IDCT_64x32_PASS1_END 0, 29, 0
+ IDCT_64x32_PASS1_END 1, 28, 1
+ IDCT_64x32_PASS1_END 2, 27, 2
+ IDCT_64x32_PASS1_END 3, 26, 3
+ IDCT_64x32_PASS1_END 4, 25, 4
+ IDCT_64x32_PASS1_END 5, 24, 5
+ IDCT_64x32_PASS1_END 6, 23, 6
+ IDCT_64x32_PASS1_END 7, 22, 7
+.transpose_2x8x8_hi: ; m0-m7 + m22-m29 (inverted)
+ punpcklwd m8, m25, m24 ; e0 f0 e1 f1 e2 f2 e3 f3
+ punpckhwd m25, m24 ; e4 f4 e5 f5 e6 f6 e7 f7
+ punpcklwd m24, m23, m22 ; g0 h0 g1 h1 g2 h2 g3 h3
+ punpckhwd m23, m22 ; g4 h4 g5 h5 g6 h6 g7 h7
+ punpcklwd m22, m29, m28 ; a0 b0 a1 b1 a2 b2 a3 b3
+ punpckhwd m29, m28 ; a4 b4 a5 b5 a6 b6 a7 b7
+ punpcklwd m28, m27, m26 ; c0 d0 c1 d1 c2 d2 c3 d3
+ punpckhwd m27, m26 ; c4 d4 c5 d5 c6 d6 c7 d7
+ punpckldq m26, m29, m27 ; a4 b4 c4 d4 a5 b5 c5 d5
+ punpckhdq m29, m27 ; a6 b6 c6 d6 a7 b7 c7 d7
+ punpckldq m27, m8, m24 ; e0 f0 g0 h0 e1 f1 g1 h1
+ punpckhdq m8, m24 ; e2 f2 g2 h2 e3 f3 g3 h3
+ punpckhdq m24, m22, m28 ; a2 b2 c2 d2 a3 b3 c3 d3
+ punpckldq m22, m28 ; a0 b0 c0 d0 a1 b1 c1 d1
+ punpckldq m28, m25, m23 ; e4 f4 g4 h4 e5 f5 g5 h5
+ punpckhdq m25, m23 ; e6 f6 g6 h6 e7 f7 g7 h7
+ punpckhqdq m23, m22, m27 ; 1 23
+ punpcklqdq m22, m27 ; 0 22
+ punpckhqdq m27, m26, m28 ; 5 27
+ punpcklqdq m26, m28 ; 4 26
+ punpcklqdq m28, m29, m25 ; 6 28
+ punpckhqdq m29, m25 ; 7 29
+ punpckhqdq m25, m24, m8 ; 3 25
+ punpcklqdq m24, m8 ; 2 24
+.transpose_8x8:
+ punpckhwd m8, m4, m5
+ punpcklwd m4, m5
+ punpckhwd m5, m0, m1
+ punpcklwd m0, m1
+ punpckhwd m1, m6, m7
+ punpcklwd m6, m7
+ punpckhwd m7, m2, m3
+ punpcklwd m2, m3
+ punpckhdq m3, m0, m2
+ punpckldq m0, m2
+ punpckldq m2, m4, m6
+ punpckhdq m4, m6
+ punpckhdq m6, m5, m7
+ punpckldq m5, m7
+ punpckldq m7, m8, m1
+ punpckhdq m8, m1
+ punpckhqdq m1, m0, m2
+ punpcklqdq m0, m2
+ punpcklqdq m2, m3, m4
+ punpckhqdq m3, m4
+ punpcklqdq m4, m5, m7
+ punpckhqdq m5, m7
+ punpckhqdq m7, m6, m8
+ punpcklqdq m6, m8
+ ret
+.pass1_end_part2:
+ IDCT_64x32_PASS1_END 0, 21, 8
+ IDCT_64x32_PASS1_END 1, 20, 9
+ IDCT_64x32_PASS1_END 2, 19, 10
+ IDCT_64x32_PASS1_END 3, 18, 11
+ IDCT_64x32_PASS1_END 4, 17, 12
+ IDCT_64x32_PASS1_END 5, 16, 13
+ IDCT_64x32_PASS1_END 6, 15, 14
+ IDCT_64x32_PASS1_END 7, 14, 15
+.transpose_2x8x8_lo: ; m0-m7 (inverted) + m14-m21
+ punpcklwd m8, m3, m2
+ punpckhwd m3, m2
+ punpcklwd m2, m1, m0
+ punpckhwd m1, m0
+ punpcklwd m0, m7, m6
+ punpckhwd m7, m6
+ punpcklwd m6, m5, m4
+ punpckhwd m5, m4
+ punpckldq m4, m7, m5
+ punpckhdq m7, m5
+ punpckldq m5, m8, m2
+ punpckhdq m8, m2
+ punpckhdq m2, m0, m6
+ punpckldq m0, m6
+ punpckldq m6, m3, m1
+ punpckhdq m3, m1
+ punpckhqdq m1, m0, m5
+ punpcklqdq m0, m5
+ punpckhqdq m5, m4, m6
+ punpcklqdq m4, m6
+ punpcklqdq m6, m7, m3
+ punpckhqdq m7, m3
+ punpckhqdq m3, m2, m8
+ punpcklqdq m2, m8
+ punpckhwd m8, m18, m19
+ punpcklwd m18, m19
+ punpckhwd m19, m14, m15
+ punpcklwd m14, m15
+ punpckhwd m15, m20, m21
+ punpcklwd m20, m21
+ punpckhwd m21, m16, m17
+ punpcklwd m16, m17
+ punpckhdq m17, m14, m16
+ punpckldq m14, m16
+ punpckldq m16, m18, m20
+ punpckhdq m18, m20
+ punpckhdq m20, m19, m21
+ punpckldq m19, m21
+ punpckldq m21, m8, m15
+ punpckhdq m8, m15
+ punpckhqdq m15, m14, m16
+ punpcklqdq m14, m16
+ punpcklqdq m16, m17, m18
+ punpckhqdq m17, m18
+ punpcklqdq m18, m19, m21
+ punpckhqdq m19, m21
+ punpckhqdq m21, m20, m8
+ punpcklqdq m20, m8
+ ret
+.pass2_fast:
+ vshufi32x4 m24, m9, m15, q3131 ; 5
+ vshufi32x4 m22, m9, m15, q2020 ; 1
+ vshufi32x4 m15, m1, m16, q3131 ; 6
+ vshufi32x4 m14, m1, m16, q2020 ; 2
+ vshufi32x4 m1, m0, m3, q3131 ; 4
+ vshufi32x4 m0, m3, q2020 ; 0
+ vshufi32x4 m3, m8, m2, q3131 ; 12
+ vshufi32x4 m2, m8, m2, q2020 ; 8
+ vshufi32x4 m25, m11, m17, q3131 ; 7
+ vshufi32x4 m23, m11, m17, q2020 ; 3
+ vshufi32x4 m17, m5, m19, q3131 ; 14
+ vshufi32x4 m16, m5, m19, q2020 ; 10
+ vshufi32x4 m29, m6, m20, q3131 ; 15
+ vshufi32x4 m27, m6, m20, q2020 ; 11
+ vshufi32x4 m28, m4, m18, q3131 ; 13
+ vshufi32x4 m26, m4, m18, q2020 ; 9
+ jmp m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+
+cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 7, 0, dst, stride, c, eob
+ lea r5, [o_base]
+ test eobd, eobd
+ jz .dconly
+ PROLOGUE 0, 7, 30, 64*96, dst, stride, c, eob
+%undef cmp
+ cmp eobd, 136
+ jb .fast
+ mova m0, [cq+64* 1]
+ mova m1, [cq+64*31]
+ mova m2, [cq+64*17]
+ mova m3, [cq+64*15]
+ vpbroadcastd m10, [o(pd_2048)]
+ mov r4, rsp
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 7]
+ mova m1, [cq+64*25]
+ mova m2, [cq+64*23]
+ mova m3, [cq+64* 9]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 5]
+ mova m1, [cq+64*27]
+ mova m2, [cq+64*21]
+ mova m3, [cq+64*11]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ mova m0, [cq+64* 3]
+ mova m1, [cq+64*29]
+ mova m2, [cq+64*19]
+ mova m3, [cq+64*13]
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ mova m0, [cq+64* 0]
+ mova m1, [cq+64* 8]
+ mova m2, [cq+64*16]
+ mova m3, [cq+64*24]
+ mova m14, [cq+64* 4]
+ mova m15, [cq+64*12]
+ mova m16, [cq+64*20]
+ mova m17, [cq+64*28]
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ mova m22, [cq+64* 2]
+ mova m29, [cq+64*30]
+ mova m26, [cq+64*18]
+ mova m25, [cq+64*14]
+ mova m24, [cq+64*10]
+ mova m27, [cq+64*22]
+ mova m28, [cq+64*26]
+ mova m23, [cq+64* 6]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part1
+ mova [r4+64*36], m1
+ mova [r4+64*37], m3
+ mova [r4+64*38], m5
+ mova [r4+64*39], m7
+ mova [r4+64*44], m23
+ mova [r4+64*45], m25
+ mova [r4+64*46], m27
+ mova [r4+64*47], m29
+ pmulhrsw m23, m13, m0 ; a0
+ pmulhrsw m25, m13, m2 ; a2
+ pmulhrsw m27, m13, m4 ; a4
+ pmulhrsw m29, m13, m6 ; a6
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).pass1_end_part2
+ lea r6, [r4-64*4]
+ add r4, 64*28
+ call .pass2_end
+ mov r4, rsp
+ mova m0, [r4+64*23]
+ mova m1, [r4+64*22]
+ mova m2, [r4+64*21]
+ mova m3, [r4+64*20]
+ mova m4, [r4+64*19]
+ mova m5, [r4+64*18]
+ mova m6, [r4+64*17]
+ mova m7, [r4+64*16]
+ mova m22, [r4+64*15]
+ mova m23, [r4+64*14]
+ mova m24, [r4+64*13]
+ mova m25, [r4+64*12]
+ mova m26, [r4+64*11]
+ mova m27, [r4+64*10]
+ mova m28, [r4+64* 9]
+ mova m29, [r4+64* 8]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_hi
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [r4+64* 8], m1
+ mova [r4+64* 9], m3
+ mova [r4+64*10], m5
+ mova [r4+64*11], m7
+ mova [r4+64*16], m23
+ mova [r4+64*17], m25
+ mova [r4+64*18], m27
+ mova [r4+64*19], m29
+ pmulhrsw m23, m13, m0 ; b0
+ pmulhrsw m25, m13, m2 ; b2
+ pmulhrsw m27, m13, m4 ; b4
+ pmulhrsw m29, m13, m6 ; b6
+ mova m0, [r4+64*31]
+ mova m1, [r4+64*30]
+ mova m2, [r4+64*29]
+ mova m3, [r4+64*28]
+ mova m4, [r4+64*27]
+ mova m5, [r4+64*26]
+ mova m6, [r4+64*25]
+ mova m7, [r4+64*24]
+ mova m14, [r4+64* 7]
+ mova m15, [r4+64* 6]
+ mova m16, [r4+64* 5]
+ mova m17, [r4+64* 4]
+ mova m18, [r4+64* 3]
+ mova m19, [r4+64* 2]
+ mova m20, [r4+64* 1]
+ mova m21, [r4+64* 0]
+ call m(inv_txfm_add_dct_dct_64x32_8bpc).transpose_2x8x8_lo
+ mov r6, cq
+ call .pass2_end
+ jmp .end
+.fast: ; bottom/right halves are zero
+ mova m28, [o(dup16_perm)]
+ pmovzxwd m9, [cq+64* 0]
+ vpermb m8, m28, [cq+64* 4]
+ vpermb ym1, ym28, [cq+64*12]
+ vpermb m7, m28, [cq+64* 8]
+ pslld m9, 16
+ call m(idct_16x16_internal_8bpc).main_fast2
+ vpermb m21, m28, [cq+64* 2]
+ vpermb ym15, ym28, [cq+64*14]
+ vpermb ym18, ym28, [cq+64*10]
+ vpermb m14, m28, [cq+64* 6]
+ call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast2
+ vpermb m22, m28, [cq+64* 1]
+ vpermb ym29, ym28, [cq+64*15]
+ vpermb ym26, ym28, [cq+64* 9]
+ vpermb m25, m28, [cq+64* 7]
+ vpermb m24, m28, [cq+64* 5]
+ vpermb ym27, ym28, [cq+64*11]
+ vpermb m23, m28, [cq+64* 3]
+ vpermb ym28, ym28, [cq+64*13]
+ mova [cq+64* 0], m14
+ mova [cq+64* 1], m15
+ mova [cq+64* 2], m16
+ mova [cq+64* 3], m17
+ mova [cq+64* 4], m18
+ mova [cq+64* 5], m19
+ mova [cq+64* 6], m20
+ mova [cq+64* 7], m21
+ call m(inv_txfm_add_dct_dct_16x64_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [cq+64*16], m4
+ mova [cq+64*17], m5
+ mova [cq+64*18], m6
+ mova [cq+64*19], m7
+ mova [cq+64*28], m26
+ mova [cq+64*29], m27
+ mova [cq+64*30], m28
+ mova [cq+64*31], m29
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).pass1_end
+ mova [cq+64*20], m22
+ mova [cq+64*21], m23
+ mova [cq+64*22], m24
+ mova [cq+64*23], m25
+ mova [cq+64*24], m26
+ mova [cq+64*25], m27
+ mova [cq+64*26], m28
+ mova [cq+64*27], m29
+ lea r4, [rsp+64*64]
+ lea r3, [rsp+64*32]
+ call .pass2_fast
+ pmulhrsw m0, m13, [cq+64*16]
+ pmulhrsw m1, m13, [cq+64*17]
+ pmulhrsw m2, m13, [cq+64*18]
+ pmulhrsw m3, m13, [cq+64*19]
+ pmulhrsw m4, m13, [cq+64*20]
+ pmulhrsw m5, m13, [cq+64*21]
+ pmulhrsw m6, m13, [cq+64*22]
+ pmulhrsw m7, m13, [cq+64*23]
+ pmulhrsw m14, m13, [cq+64*24]
+ pmulhrsw m15, m13, [cq+64*25]
+ pmulhrsw m16, m13, [cq+64*26]
+ pmulhrsw m17, m13, [cq+64*27]
+ pmulhrsw m18, m13, [cq+64*28]
+ pmulhrsw m19, m13, [cq+64*29]
+ pmulhrsw m20, m13, [cq+64*30]
+ pmulhrsw m21, m13, [cq+64*31]
+ call m(inv_txfm_add_dct_dct_64x16_8bpc).transpose_round
+ mov r4, rsp
+ mov r3, cq
+ call .pass2_fast
+.end:
+ vpbroadcastd m17, [o(pw_2048)]
+ lea r5, [strideq*8]
+ mov r3, dstq
+ pxor m16, m16
+ sub r4, 64*5 ; rsp+64*31
+ mov r6, rsp
+.end_loop:
+ mova m2, [r6+64*32] ; idct16 0+n lo
+ mova m7, [r6+64*48] ; idct32 31-n lo
+ mova m6, [cq+64* 0] ; idct16 0+n hi
+ mova m0, [cq+64*16] ; idct32 31-n hi
+ mova m4, [r4+64*64] ; idct64 63-n lo
+ mova m1, [r4+64* 0] ; idct64 63-n hi
+ mova m5, [r6+64*64] ; idct64 32+n lo
+ mova m8, [r6+64* 0] ; idct64 32+n hi
+ sub r3, strideq
+ paddsw m3, m2, m7 ; idct32 0+n lo
+ mova m12, [dstq+r5*0]
+ psubsw m2, m7 ; idct32 31-n lo
+ mova m15, [r3 +r5*8]
+ paddsw m7, m6, m0 ; idct32 0+n hi
+ mova m13, [r3 +r5*4]
+ psubsw m6, m0 ; idct32 31-n hi
+ mova m14, [dstq+r5*4]
+ paddsw m0, m3, m4 ; out 0+n lo
+ add r6, 64
+ psubsw m3, m4 ; out 63-n lo
+ sub r4, 64
+ paddsw m4, m7, m1 ; out 0+n hi
+ mova [cq+64* 0], m16
+ psubsw m7, m1 ; out 63-n hi
+ mova [cq+64*16], m16
+ paddsw m1, m2, m5 ; out 31-n lo
+ add cq, 64
+ psubsw m2, m5 ; out 32+n lo
+ paddsw m5, m6, m8 ; out 31-n hi
+ psubsw m6, m8 ; out 32+n hi
+ pmulhrsw m0, m17
+ punpcklbw m8, m12, m16
+ pmulhrsw m4, m17
+ punpckhbw m12, m16
+ pmulhrsw m3, m17
+ punpcklbw m11, m15, m16
+ pmulhrsw m7, m17
+ punpckhbw m15, m16
+ pmulhrsw m1, m17
+ punpcklbw m9, m13, m16
+ pmulhrsw m5, m17
+ punpckhbw m13, m16
+ pmulhrsw m2, m17
+ punpcklbw m10, m14, m16
+ pmulhrsw m6, m17
+ punpckhbw m14, m16
+ paddw m0, m8
+ paddw m4, m12
+ packuswb m0, m4
+ paddw m3, m11
+ paddw m7, m15
+ packuswb m3, m7
+ paddw m1, m9
+ paddw m5, m13
+ packuswb m1, m5
+ paddw m2, m10
+ paddw m6, m14
+ packuswb m2, m6
+ mova [dstq+r5*0], m0
+ mova [r3 +r5*8], m3
+ mova [r3 +r5*4], m1
+ mova [dstq+r5*4], m2
+ add dstq, strideq
+ cmp r6, r4
+ jb .end_loop
+ RET
+.dconly:
+ movsx r6d, word [cq]
+ mov [cq], eobd
+ mov r3d, 64
+ jmp m(inv_txfm_add_dct_dct_64x16_8bpc).dconly
+ALIGN function_align
+.pass2_end:
+ REPX {pmulhrsw x, m13}, m22, m24, m26, m28, m14, m16, m18, m20, m0, m2, m4, m6
+ mova [r4+64*20], m1
+ mova [r4+64*21], m3
+ mova [r4+64*22], m5
+ mova [r4+64*23], m7
+ vinserti32x8 m1, m23, ym14, 1 ; a00 a01 c00 c01
+ vshufi32x4 m3, m23, m14, q3232 ; a02 a03 c02 c03
+ vinserti32x8 m5, m22, ym0, 1 ; e00 e01 g00 g01
+ vshufi32x4 m14, m22, m0, q3232 ; e02 e03 g02 g03
+ mova [r4+64*12], m15
+ mova [r4+64*13], m17
+ mova [r4+64*14], m19
+ mova [r4+64*15], m21
+ vinserti32x8 m15, m27, ym18, 1 ; a40 a41 c40 c41
+ vshufi32x4 m17, m27, m18, q3232 ; a42 a43 c42 c43
+ vinserti32x8 m18, m26, ym4, 1 ; e40 e41 g40 g41
+ vshufi32x4 m19, m26, m4, q3232 ; e42 e43 g42 g43
+ vinserti32x8 m22, m25, ym16, 1 ; a20 a21 c20 c21
+ vshufi32x4 m26, m25, m16, q3232 ; a22 a23 c22 c23
+ vinserti32x8 m25, m24, ym2, 1 ; e20 e21 g20 g21
+ vshufi32x4 m27, m24, m2, q3232 ; e22 e23 g22 g23
+ vinserti32x8 m23, m29, ym20, 1 ; a60 a61 c60 c61
+ vshufi32x4 m29, m20, q3232 ; a62 a63 c62 c63
+ vshufi32x4 m13, m28, m6, q3232 ; e62 e63 g62 g63
+ vinserti32x8 m28, ym6, 1 ; e60 e61 g60 g61
+ vshufi32x4 m0, m1, m5, q2020 ; 0
+ vshufi32x4 m1, m5, q3131 ; 8
+ vshufi32x4 m2, m3, m14, q2020 ; 16
+ vshufi32x4 m3, m14, q3131 ; 24
+ vshufi32x4 m14, m15, m18, q2020 ; 4
+ vshufi32x4 m15, m18, q3131 ; 12
+ vshufi32x4 m16, m17, m19, q2020 ; 20
+ vshufi32x4 m17, m19, q3131 ; 28
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast
+ vshufi32x4 m24, m22, m25, q3131 ; 10
+ vshufi32x4 m22, m25, q2020 ; 2
+ vshufi32x4 m25, m23, m28, q3131 ; 14
+ vshufi32x4 m23, m28, q2020 ; 6
+ vshufi32x4 m28, m26, m27, q3131 ; 26
+ vshufi32x4 m26, m27, q2020 ; 18
+ vshufi32x4 m27, m29, m13, q2020 ; 22
+ vshufi32x4 m29, m13, q3131 ; 30
+ mova [r6+64* 0], m0
+ mova [r6+64* 1], m1
+ mova [r6+64* 2], m2
+ mova [r6+64* 3], m3
+ mova [r6+64* 4], m4
+ mova [r6+64* 5], m5
+ mova [r6+64* 6], m6
+ mova [r6+64* 7], m7
+ mova [r6+64* 8], m14
+ mova [r6+64* 9], m15
+ mova [r6+64*10], m16
+ mova [r6+64*11], m17
+ mova [r6+64*12], m18
+ mova [r6+64*13], m19
+ mova [r6+64*14], m20
+ mova [r6+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast
+ vpbroadcastd m13, [o(pw_8192)]
+ mova [r6+64*16], m29
+ mova [r6+64*17], m28
+ mova [r6+64*18], m27
+ mova [r6+64*19], m26
+ mova [r6+64*20], m25
+ mova [r6+64*21], m24
+ mova [r6+64*22], m23
+ mova [r6+64*23], m22
+ mova [r6+64*24], m21
+ mova [r6+64*25], m20
+ mova [r6+64*26], m19
+ mova [r6+64*27], m18
+ mova [r6+64*28], m17
+ mova [r6+64*29], m16
+ mova [r6+64*30], m15
+ mova [r6+64*31], m14
+ pmulhrsw m15, m13, [r4+64* 8] ; 1 9 17 25
+ pmulhrsw m16, m13, [r4+64*12]
+ pmulhrsw m17, m13, [r4+64*16]
+ pmulhrsw m18, m13, [r4+64*20]
+ pmulhrsw m19, m13, [r4+64*11] ; 7 15 23 31
+ pmulhrsw m20, m13, [r4+64*15]
+ pmulhrsw m21, m13, [r4+64*19]
+ pmulhrsw m22, m13, [r4+64*23]
+ vinserti32x8 m14, m15, ym16, 1 ; a1 a9 c1 c9
+ vshufi32x4 m15, m16, q3232 ; a17 a25 c17 c25
+ vinserti32x8 m16, m17, ym18, 1 ; e1 e9 g1 g9
+ vshufi32x4 m17, m18, q3232 ; e17 e25 g17 g25
+ pmulhrsw m23, m13, [r4+64*10] ; 5 13 21 29
+ pmulhrsw m24, m13, [r4+64*14]
+ pmulhrsw m25, m13, [r4+64*18]
+ pmulhrsw m26, m13, [r4+64*22]
+ vinserti32x8 m18, m19, ym20, 1 ; a7 a15 c7 c15
+ vshufi32x4 m19, m20, q3232 ; a23 a31 c23 c31
+ vinserti32x8 m20, m21, ym22, 1 ; e7 e15 g7 g15
+ vshufi32x4 m21, m22, q3232 ; e23 e31 g23 g31
+ pmulhrsw m27, m13, [r4+64* 9] ; 3 11 19 27
+ pmulhrsw m28, m13, [r4+64*13]
+ pmulhrsw m29, m13, [r4+64*17]
+ pmulhrsw m13, [r4+64*21]
+ vshufi32x4 m0, m14, m16, q2020 ; 1
+ vshufi32x4 m1, m19, m21, q3131 ; 31
+ vshufi32x4 m2, m15, m17, q2020 ; 17
+ vshufi32x4 m3, m18, m20, q3131 ; 15
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vshufi32x4 m0, m18, m20, q2020 ; 7
+ vshufi32x4 m1, m15, m17, q3131 ; 25
+ vshufi32x4 m2, m19, m21, q2020 ; 23
+ vshufi32x4 m3, m14, m16, q3131 ; 9
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vinserti32x8 m22, m23, ym24, 1 ; a5 a13 c5 c13
+ vshufi32x4 m23, m24, q3232 ; a21 a29 c21 c29
+ vinserti32x8 m24, m25, ym26, 1 ; e5 e13 g5 g13
+ vshufi32x4 m25, m26, q3232 ; e21 e29 g21 g29
+ vinserti32x8 m26, m27, ym28, 1 ; a3 a11 c3 c11
+ vshufi32x4 m27, m28, q3232 ; a19 a27 c19 c27
+ vinserti32x8 m28, m29, ym13, 1 ; e3 e11 g3 g11
+ vshufi32x4 m29, m13, q3232 ; e19 e17 g19 g27
+ vshufi32x4 m0, m22, m24, q2020 ; 5
+ vshufi32x4 m1, m27, m29, q3131 ; 27
+ vshufi32x4 m2, m23, m25, q2020 ; 21
+ vshufi32x4 m3, m26, m28, q3131 ; 11
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ vshufi32x4 m0, m26, m28, q2020 ; 3
+ vshufi32x4 m1, m23, m25, q3131 ; 29
+ vshufi32x4 m2, m27, m29, q2020 ; 19
+ vshufi32x4 m3, m22, m24, q3131 ; 13
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1
+ jmp m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ALIGN function_align
+.pass2_fast:
+ vshufi32x4 m23, m1, m16, q3131 ; 6
+ vshufi32x4 m22, m1, m16, q2020 ; 2
+ vshufi32x4 m14, m0, m3, q3131 ; 4
+ vshufi32x4 m26, m0, m3, q2020 ; 0
+ vshufi32x4 m28, m9, m15, q3131 ; 5
+ vshufi32x4 m0, m9, m15, q2020 ; 1
+ vshufi32x4 m16, m11, m17, q3131 ; 7
+ vshufi32x4 m29, m11, m17, q2020 ; 3
+ vshufi32x4 m15, m8, m2, q3131 ; 12
+ vshufi32x4 m27, m8, m2, q2020 ; 8
+ vshufi32x4 m25, m5, m19, q3131 ; 14
+ vshufi32x4 m24, m5, m19, q2020 ; 10
+ vshufi32x4 m3, m6, m20, q3131 ; 15
+ vshufi32x4 m19, m6, m20, q2020 ; 11
+ vshufi32x4 m17, m4, m18, q3131 ; 13
+ vshufi32x4 m18, m4, m18, q2020 ; 9
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m16
+ mova m3, m18
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m28
+ mova m3, m19
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ mova m0, m29
+ mova m3, m17
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part1_fast
+ call m(inv_txfm_add_dct_dct_32x64_8bpc).main_part2
+ mova m0, m26
+ mova m1, m27
+ call m(inv_txfm_add_dct_dct_32x16_8bpc).main_oddhalf_fast2
+ mova [r3+64* 0], m0
+ mova [r3+64* 1], m1
+ mova [r3+64* 2], m2
+ mova [r3+64* 3], m3
+ mova [r3+64* 4], m4
+ mova [r3+64* 5], m5
+ mova [r3+64* 6], m6
+ mova [r3+64* 7], m7
+ mova [r3+64* 8], m14
+ mova [r3+64* 9], m15
+ mova [r3+64*10], m16
+ mova [r3+64*11], m17
+ mova [r3+64*12], m18
+ mova [r3+64*13], m19
+ mova [r3+64*14], m20
+ mova [r3+64*15], m21
+ call m(inv_txfm_add_dct_dct_32x32_8bpc).main_oddhalf_fast2
+ mova [r3+64*16], m29
+ mova [r3+64*17], m28
+ mova [r3+64*18], m27
+ mova [r3+64*19], m26
+ mova [r3+64*20], m25
+ mova [r3+64*21], m24
+ mova [r3+64*22], m23
+ mova [r3+64*23], m22
+ mova [r3+64*24], m21
+ mova [r3+64*25], m20
+ mova [r3+64*26], m19
+ mova [r3+64*27], m18
+ mova [r3+64*28], m17
+ mova [r3+64*29], m16
+ mova [r3+64*30], m15
+ mova [r3+64*31], m14
+ ret
+
+%endif ; ARCH_X86_64